1 /******************************************************************************
2 * Copyright (c) Intel Corporation - All rights reserved. *
3 * This file is part of the LIBXSMM library. *
4 * *
5 * For information on the license, see the LICENSE file. *
6 * Further information: https://github.com/hfp/libxsmm/ *
7 * SPDX-License-Identifier: BSD-3-Clause *
8 ******************************************************************************/
9 /* Hans Pabst, Alexander Heinecke (Intel Corp.)
10 ******************************************************************************/
11 #include "libxsmm_trace.h"
12 #include "libxsmm_xcopy.h"
13 #include "libxsmm_gemm.h"
14 #include "libxsmm_hash.h"
15 #include "libxsmm_diff.h"
16 #include "libxsmm_main.h"
17 #if defined(LIBXSMM_PERF)
18 # include "libxsmm_perf.h"
19 #endif
20 #include "generator_common.h"
21
22 #if defined(LIBXSMM_OFFLOAD_TARGET)
23 # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
24 #endif
25 #if !defined(NDEBUG)
26 # include <errno.h>
27 #endif
28 #if defined(_WIN32)
29 # include <Windows.h>
30 #else
31 # include <sys/types.h>
32 # include <sys/mman.h>
33 # include <sys/stat.h>
34 # include <unistd.h>
35 # include <fcntl.h>
36 #endif
37 #if defined(LIBXSMM_OFFLOAD_TARGET)
38 # pragma offload_attribute(pop)
39 #endif
40
/*
 * Build-time defaults: each macro below is only defined when the build did
 * not already provide it (e.g., on the compiler command line).
 * For the valueless feature toggles, a trailing "&& 1" enables the toggle by
 * default whereas "&& 0" leaves it undefined unless it is supplied externally.
 */
#if !defined(LIBXSMM_CODE_MAXSIZE)
# define LIBXSMM_CODE_MAXSIZE 131072
#endif
#if !defined(LIBXSMM_DIFF_SIZE)
# define LIBXSMM_DIFF_SIZE LIBXSMM_DESCRIPTOR_SIGSIZE
#endif
/* internal_register_static_code hashes at most LIBXSMM_HASH_SIZE minus sizeof(libxsmm_descriptor_kind) bytes */
#if !defined(LIBXSMM_HASH_SIZE)
# define LIBXSMM_HASH_SIZE 32
#endif
/* seed passed to libxsmm_crc32 when hashing a descriptor (see internal_register_static_code) */
#if !defined(LIBXSMM_HASH_SEED)
# define LIBXSMM_HASH_SEED 25071975
#endif
/* when defined, __real_malloc derives an alignment (libxsmm_alignment) and forwards to __real_memalign */
#if !defined(LIBXSMM_MALLOC_HOOK_ALIGN) && 1
# define LIBXSMM_MALLOC_HOOK_ALIGN
#endif
/* when defined, the __real_* hooks only use libxsmm_malloc_fn if "1 < libxsmm_ninit" holds */
#if !defined(LIBXSMM_MALLOC_HOOK_INIT) && 0
# define LIBXSMM_MALLOC_HOOK_INIT
#endif
#if !defined(LIBXSMM_ENABLE_DEREG) && 0
# define LIBXSMM_ENABLE_DEREG
#endif
/* selects the variant of INTERNAL_REGLOCK_TRY which may give up (break) instead of always retrying */
#if !defined(LIBXSMM_REGLOCK_TRY) && 0
# define LIBXSMM_REGLOCK_TRY
#endif
/* when defined (and LIBXSMM_REGLOCK is not preset), LIBXSMM_REGLOCK becomes LIBXSMM_LOCK in the RW-lock configuration */
#if !defined(LIBXSMM_UNIFY_LOCKS) && 1
# define LIBXSMM_UNIFY_LOCKS
#endif
#if !defined(LIBXSMM_DIFF_INLINE) && 1
# define LIBXSMM_DIFF_INLINE
#endif
#if !defined(LIBXSMM_DESC_INLINE) && 0
# define LIBXSMM_DESC_INLINE
#endif
#if !defined(LIBXSMM_DESC_PAD) && 1
# define LIBXSMM_DESC_PAD
#endif
/* when defined, internal_cache_type is padded to a multiple of LIBXSMM_CACHELINE */
#if !defined(LIBXSMM_CACHE_PAD) && 1
# define LIBXSMM_CACHE_PAD
#endif
#if !defined(LIBXSMM_AUTOPIN) && 1
# define LIBXSMM_AUTOPIN
#endif
/* delimiter set used to tokenize the list of files to dump (see internal_dump) */
#if !defined(INTERNAL_DELIMS)
# define INTERNAL_DELIMS ";,:"
#endif
86
#if defined(LIBXSMM_AUTOPIN) && !defined(_WIN32)
/* explicit declaration of putenv for non-Windows builds with LIBXSMM_AUTOPIN */
LIBXSMM_EXTERN int putenv(char*) LIBXSMM_THROW;
#endif

/* flag fused into the memory address of a code version in case of non-JIT
 * (most significant bit of a pointer-sized value) */
#define LIBXSMM_CODE_STATIC (1ULL << (8 * sizeof(void*) - 1))
/* flag fused into the memory address of a code version in case of collision
 * (second most significant bit of a pointer-sized value) */
#if 1 /* beneficial when registry approaches capacity (collisions) */
# define LIBXSMM_HASH_COLLISION (1ULL << (8 * sizeof(void*) - 2))
#endif

/** Helper macro determining the default prefetch strategy which is used for statically generated kernels. */
/* No prefetch is selected if LIBXSMM_PREFETCH is negative or when building for Windows/Cygwin;
 * otherwise LIBXSMM_PREFETCH is cast to libxsmm_gemm_prefetch_type. */
#if (0 > LIBXSMM_PREFETCH) /* auto-prefetch (frontend) */ || (defined(_WIN32) || defined(__CYGWIN__))
# define INTERNAL_PREFETCH LIBXSMM_GEMM_PREFETCH_NONE
#else
# define INTERNAL_PREFETCH ((libxsmm_gemm_prefetch_type)LIBXSMM_PREFETCH)
#endif
104
/*
 * Registry lock configuration (only with synchronization, i.e., 0 != LIBXSMM_SYNC):
 * - 1 < INTERNAL_REGLOCK_MAXN: an array of locks (internal_reglock),
 * - otherwise: a single lock referred to by internal_reglock_ptr.
 * LIBXSMM_CACHE_MAXSIZE defaults to LIBXSMM_CAPACITY_CACHE in all configurations
 * except for lock arrays with INTERNAL_REGLOCK_MAXN of 8 or more.
 */
#if (0 != LIBXSMM_SYNC)
# if !defined(INTERNAL_REGLOCK_MAXN)
/* note: both branches currently select 0, i.e., the single-lock path below is taken by default */
#   if defined(_MSC_VER)
#     define INTERNAL_REGLOCK_MAXN 0
#   else
#     define INTERNAL_REGLOCK_MAXN 0
#   endif
# endif
# if (1 < INTERNAL_REGLOCK_MAXN)
#   if !defined(LIBXSMM_CACHE_MAXSIZE) && (8 > INTERNAL_REGLOCK_MAXN)
#     define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE
#   endif
#   if !defined(LIBXSMM_REGLOCK)
#     define LIBXSMM_REGLOCK LIBXSMM_LOCK_DEFAULT
#   endif
#   if !defined(LIBXSMM_CLEANUP_NTRY)
#     define LIBXSMM_CLEANUP_NTRY 7
#   endif
/* lock element: if the lock type is reported as POD, the union is at least LIBXSMM_CACHELINE bytes large */
#   if LIBXSMM_LOCK_TYPE_ISPOD(LIBXSMM_REGLOCK)
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_reglocktype {
  char pad[LIBXSMM_CACHELINE];
  LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK) state;
} internal_reglocktype;
#   else
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_reglocktype {
  LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK) state;
} internal_reglocktype;
#   endif
LIBXSMM_APIVAR_DEFINE(internal_reglocktype internal_reglock[INTERNAL_REGLOCK_MAXN]);
# else /* RW-lock */
#   if !defined(LIBXSMM_CACHE_MAXSIZE)
#     define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE
#   endif
/* lock kind, first match wins: unified with LIBXSMM_LOCK, mutex for MSVC, or the default lock */
#   if !defined(LIBXSMM_REGLOCK)
#     if defined(LIBXSMM_UNIFY_LOCKS)
#       define LIBXSMM_REGLOCK LIBXSMM_LOCK
#     elif defined(_MSC_VER)
#       define LIBXSMM_REGLOCK LIBXSMM_LOCK_MUTEX
#     elif 0
#       define LIBXSMM_REGLOCK LIBXSMM_LOCK_RWLOCK
#     else
#       define LIBXSMM_REGLOCK LIBXSMM_LOCK_DEFAULT
#     endif
#   endif
LIBXSMM_APIVAR_DEFINE(LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK)* internal_reglock_ptr);
# endif
#elif !defined(LIBXSMM_CACHE_MAXSIZE)
# define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE
#endif
/* LIBXSMM_CACHE_STRIDE: LIBXSMM_DESCRIPTOR_MAXSIZE, or the larger of it and
 * sizeof(libxsmm_descriptor) if LIBXSMM_UNPACKED is defined */
#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
# define LIBXSMM_CACHE_STRIDE LIBXSMM_MAX(sizeof(libxsmm_descriptor), LIBXSMM_DESCRIPTOR_MAXSIZE)
#else
# define LIBXSMM_CACHE_STRIDE LIBXSMM_DESCRIPTOR_MAXSIZE
#endif
159
#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
/* Assigns the current CACHE_SIZE to RESULT_INDEX and then doubles CACHE_SIZE (zero becomes one).
 * Note: expands to two statements (not suitable as the sole body of an unbraced if/else). */
# define INTERNAL_FIND_CODE_CACHE_GROW(RESULT_INDEX, CACHE_SIZE) \
    RESULT_INDEX = CACHE_SIZE; CACHE_SIZE = (unsigned char)(0 != (CACHE_SIZE) ? ((CACHE_SIZE) << 1) : 1)
/* Assigns LIBXSMM_MOD2(CACHE_HIT + CACHE_SIZE - 1, CACHE_SIZE) to RESULT_INDEX;
 * presumably the slot preceding CACHE_HIT (wrapping around), given CACHE_SIZE is a power of two. */
# define INTERNAL_FIND_CODE_CACHE_EVICT(RESULT_INDEX, CACHE_SIZE, CACHE_HIT) \
    RESULT_INDEX = (unsigned char)LIBXSMM_MOD2((CACHE_HIT) + ((CACHE_SIZE) - 1), CACHE_SIZE)
#endif

/*
 * INTERNAL_FIND_CODE_LOCK opens a brace which the matching INTERNAL_FIND_CODE_UNLOCK closes,
 * i.e., both macros must be used as a pair within the same scope. If the try-lock fails,
 * INTERNAL_REGLOCK_TRY issues "continue" (or "break"), hence the pair must be placed inside of a loop.
 */
#if (0 == LIBXSMM_SYNC)
/* no synchronization: the pair only opens/closes a scope */
# define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) {
# define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) }
#else
# if defined(LIBXSMM_REGLOCK_TRY)
/* retries unless internal_reglock_count equals one; in the latter case DIFF/CODE are zeroed and the loop is left */
#   define INTERNAL_REGLOCK_TRY(DIFF, CODE) \
    if (1 != internal_reglock_count) { /* (re-)try and get (meanwhile) generated code */ \
      LIBXSMM_ASSERT(NULL != internal_registry); /* engine is not shut down */ \
      continue; \
    } \
    else { /* exit dispatch and let client fall back */ \
      DIFF = 0; CODE = 0; break; \
    }
# else
/* always retries (continues with the next iteration of the enclosing loop) */
#   define INTERNAL_REGLOCK_TRY(DIFF, CODE) \
      LIBXSMM_ASSERT(NULL != internal_registry); /* engine is not shut down */ \
      continue
# endif
# if (1 < INTERNAL_REGLOCK_MAXN)
/* array of locks: the lock is chosen by INDEX modulo internal_reglock_count (LOCKINDEX is declared by the macro) */
#   define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) { \
      const unsigned int LOCKINDEX = (0 != internal_reglock_count ? LIBXSMM_MOD2(INDEX, internal_reglock_count) : 0); \
      if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) != LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, &internal_reglock[LOCKINDEX].state)) { \
        INTERNAL_REGLOCK_TRY(DIFF, CODE); \
      }
#   define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[LOCKINDEX].state); }
# else /* RW-lock */
/* single lock (internal_reglock_ptr): LOCKINDEX and INDEX are not used */
#   define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) { \
      if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) != LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, internal_reglock_ptr)) { \
        INTERNAL_REGLOCK_TRY(DIFF, CODE); \
      }
#   define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr); }
# endif
#endif
200
201
/**
 * Counters of one statistic bucket; internal_print_statistic prints them
 * under the column titles TRY (ntry), COL (ncol), JIT (njit), and STA (nsta).
 */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_statistic_type {
  unsigned int ntry, ncol, njit, nsta;
} internal_statistic_type;
205
#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
/** Cache of up to LIBXSMM_CACHE_MAXSIZE descriptors (keys) along with the associated code pointers. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_cache_entry_type {
  libxsmm_descriptor keys[LIBXSMM_CACHE_MAXSIZE];
  libxsmm_code_pointer code[LIBXSMM_CACHE_MAXSIZE];
  unsigned int id; /* to invalidate */
  /* NOTE(review): presumably the number of used slots (size) and the most recently hit slot (hit) - confirm with the dispatch code */
  unsigned char size, hit;
} internal_cache_entry_type;

/** Wraps a cache entry; with LIBXSMM_CACHE_PAD, the size is rounded up to a multiple of LIBXSMM_CACHELINE. */
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_cache_type {
# if defined(LIBXSMM_CACHE_PAD)
  char pad[LIBXSMM_UP2(sizeof(internal_cache_entry_type),LIBXSMM_CACHELINE)];
# endif
  internal_cache_entry_type entry;
} internal_cache_type;

# if defined(LIBXSMM_NTHREADS_USE)
LIBXSMM_APIVAR_DEFINE(internal_cache_type* internal_cache_buffer);
# endif
LIBXSMM_APIVAR_DEFINE(int internal_cache_size);
#endif /*defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))*/
226
/** Determines the try-lock property (1<N: disabled, N=1: enabled [N=0: disabled in case of RW-lock]). */
LIBXSMM_APIVAR_DEFINE(int internal_reglock_count);
/** Byte count which internal_finalize prints in addition to the "Registry and code" size. */
LIBXSMM_APIVAR_DEFINE(size_t internal_registry_nbytes);
/** Counter which internal_finalize reports as "nleaks". */
LIBXSMM_APIVAR_DEFINE(unsigned int internal_registry_nleaks);
/** Descriptors (keys) per registry slot; filled by internal_register_static_code alongside the code pointer. */
LIBXSMM_APIVAR_DEFINE(libxsmm_descriptor* internal_registry_keys);
/** Code registry; asserted to be non-NULL while dispatching (see INTERNAL_REGLOCK_TRY). */
LIBXSMM_APIVAR_DEFINE(libxsmm_code_pointer* internal_registry);
/** Statistic per precision (0: DP, 1: SP) and size bucket (see internal_update_mmstatistic). */
LIBXSMM_APIVAR_DEFINE(internal_statistic_type internal_statistic[2/*DP/SP*/][4/*sml/med/big/xxx*/]);
/** Bucket limits: internal_update_mmstatistic compares LIBXSMM_MNK_SIZE(x, x, x) of each limit against the kernel size. */
LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_sml);
LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_med);
LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_mnk);
/** Per-kind counters; internal_finalize reports gemv, mcopy, meltw, tcopy, and user. */
LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_gemv);
LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_mcopy);
LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_meltw);
LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_tcopy);
LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_trsm);
LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_trmm);
LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_user);
LIBXSMM_APIVAR_DEFINE(int internal_gemm_auto_prefetch_locked);
/** Text printed by internal_dump if requested by the environment variable LIBXSMM_DUMP_BUILD. */
LIBXSMM_APIVAR_DEFINE(const char* internal_build_state);
/** Time stamp (startup time of library). */
LIBXSMM_APIVAR_DEFINE(libxsmm_timer_tickint internal_timer_start);
/** CPUID information; internal_finalize compares has_context against a fresh libxsmm_cpuid_x86 query. */
LIBXSMM_APIVAR_DEFINE(libxsmm_cpuid_x86_info internal_cpuid_info);

/* Singleton handle: a HANDLE (Windows), or a file descriptor together with a file name (otherwise).
 * INTERNAL_SINGLETON(HANDLE) checks whether the handle denotes a valid singleton. */
#if defined(_WIN32)
# define INTERNAL_SINGLETON_HANDLE HANDLE
# define INTERNAL_SINGLETON(HANDLE) (NULL != (HANDLE))
#else
# define INTERNAL_SINGLETON_HANDLE int
# define INTERNAL_SINGLETON(HANDLE) (0 <= (HANDLE) && 0 != *internal_singleton_fname)
LIBXSMM_APIVAR_DEFINE(char internal_singleton_fname[64]);
#endif
LIBXSMM_APIVAR_DEFINE(INTERNAL_SINGLETON_HANDLE internal_singleton_handle);

/* definition of corresponding variables */
LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_malloc_function libxsmm_default_malloc_fn);
LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_malloc_function libxsmm_scratch_malloc_fn);
LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_free_function libxsmm_default_free_fn);
LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_free_function libxsmm_scratch_free_fn);
LIBXSMM_APIVAR_PRIVATE_DEF(const void* libxsmm_default_allocator_context);
LIBXSMM_APIVAR_PRIVATE_DEF(const void* libxsmm_scratch_allocator_context);
LIBXSMM_APIVAR_PRIVATE_DEF(unsigned int libxsmm_scratch_pools);
LIBXSMM_APIVAR_PRIVATE_DEF(double libxsmm_scratch_scale);
LIBXSMM_APIVAR_PRIVATE_DEF(double libxsmm_timer_scale);
LIBXSMM_APIVAR_PRIVATE_DEF(unsigned int libxsmm_statistic_num_spmdm);
LIBXSMM_APIVAR_PRIVATE_DEF(unsigned int libxsmm_thread_count);
/* definition of corresponding variables */
LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK) libxsmm_lock_global);
LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_nosync);

#if (0 != LIBXSMM_SYNC)
/* only defined with synchronization enabled */
LIBXSMM_APIVAR_PRIVATE_DEF(LIBXSMM_TLS_TYPE libxsmm_tlskey);
#endif
279
280
libxsmm_memalign_internal(size_t alignment,size_t size)281 LIBXSMM_API_INTERN void* libxsmm_memalign_internal(size_t alignment, size_t size)
282 {
283 void* result;
284 #if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
285 result = __libc_memalign(alignment, size);
286 #elif defined(_WIN32) || defined(__CYGWIN__)
287 LIBXSMM_UNUSED(alignment);
288 result = malloc(size);
289 #else
290 if (0 != posix_memalign(&result, alignment, size)) result = NULL;
291 #endif
292 return result;
293 }
294
295
__real_memalign(size_t alignment,size_t size)296 LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_memalign(size_t alignment, size_t size)
297 {
298 void* result;
299 #if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)
300 if (
301 # if defined(LIBXSMM_MALLOC_HOOK_INIT)
302 1 < libxsmm_ninit &&
303 # endif
304 NULL != libxsmm_malloc_fn.memalign.ptr)
305 {
306 result = libxsmm_malloc_fn.memalign.ptr(alignment, size);
307 }
308 else
309 #endif
310 #if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
311 result = __libc_memalign(alignment, size);
312 #else
313 result = libxsmm_memalign_internal(alignment, size);
314 #endif
315 return result;
316 }
317
318
__real_malloc(size_t size)319 LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_malloc(size_t size)
320 {
321 void* result;
322 #if defined(LIBXSMM_MALLOC_HOOK_ALIGN)
323 const size_t alignment = libxsmm_alignment(size, 0/*auto*/);
324 result = __real_memalign(alignment, size);
325 #else
326 # if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)
327 if (
328 # if defined(LIBXSMM_MALLOC_HOOK_INIT)
329 1 < libxsmm_ninit &&
330 # endif
331 NULL != libxsmm_malloc_fn.malloc.ptr)
332 {
333 LIBXSMM_ASSERT(malloc != libxsmm_malloc_fn.malloc.ptr);
334 result = libxsmm_malloc_fn.malloc.ptr(size);
335 }
336 else
337 # endif
338 # if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
339 result = __libc_malloc(size);
340 # else
341 result = malloc(size);
342 # endif
343 #endif
344 return result;
345 }
346
347
#if defined(LIBXSMM_MALLOC_HOOK_CALLOC)
/**
 * Weak default for the calloc entry point (only with LIBXSMM_MALLOC_HOOK_CALLOC).
 * A non-NULL libxsmm_malloc_fn.calloc.ptr (LIBXSMM_MALLOC_HOOK_DYNAMIC) takes
 * precedence over __libc_calloc (LIBXSMM_BUILD greater than one) or calloc.
 */
LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_calloc(size_t num, size_t size)
{
#if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)
  if (
# if defined(LIBXSMM_MALLOC_HOOK_INIT)
    1 < libxsmm_ninit &&
# endif
    NULL != libxsmm_malloc_fn.calloc.ptr)
  { /* hooked function is available */
    LIBXSMM_ASSERT(calloc != libxsmm_malloc_fn.calloc.ptr);
    return libxsmm_malloc_fn.calloc.ptr(num, size);
  }
#endif
#if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
  return __libc_calloc(num, size);
#else
  return calloc(num, size);
#endif
}
#endif
372
373
#if defined(LIBXSMM_MALLOC_HOOK_REALLOC)
/**
 * Weak default for the realloc entry point (only with LIBXSMM_MALLOC_HOOK_REALLOC).
 * A non-NULL libxsmm_malloc_fn.realloc.ptr (LIBXSMM_MALLOC_HOOK_DYNAMIC) takes
 * precedence over __libc_realloc (LIBXSMM_BUILD greater than one) or realloc.
 */
LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_realloc(void* ptr, size_t size)
{
#if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)
  if (
# if defined(LIBXSMM_MALLOC_HOOK_INIT)
    1 < libxsmm_ninit &&
# endif
    NULL != libxsmm_malloc_fn.realloc.ptr)
  { /* hooked function is available */
    LIBXSMM_ASSERT(realloc != libxsmm_malloc_fn.realloc.ptr);
    return libxsmm_malloc_fn.realloc.ptr(ptr, size);
  }
#endif
#if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
  return __libc_realloc(ptr, size);
#else
  return realloc(ptr, size);
#endif
}
#endif
398
399
__real_free(void * ptr)400 LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void __real_free(void* ptr)
401 {
402 if (NULL != ptr) {
403 #if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)
404 if (
405 # if defined(LIBXSMM_MALLOC_HOOK_INIT)
406 1 < libxsmm_ninit &&
407 # endif
408 NULL != libxsmm_malloc_fn.free.ptr)
409 {
410 LIBXSMM_ASSERT(free != libxsmm_malloc_fn.free.ptr);
411 libxsmm_malloc_fn.free.ptr(ptr);
412 }
413 else
414 #endif
415 #if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
416 __libc_free(ptr);
417 #else
418 free(ptr);
419 #endif
420 }
421 }
422
423
internal_update_mmstatistic(const libxsmm_gemm_descriptor * desc,unsigned int ntry,unsigned int ncol,unsigned int njit,unsigned int nsta)424 LIBXSMM_API_INLINE void internal_update_mmstatistic(const libxsmm_gemm_descriptor* desc,
425 unsigned int ntry, unsigned int ncol, unsigned int njit, unsigned int nsta)
426 {
427 LIBXSMM_ASSERT(NULL != desc);
428 if (1 < desc->m && 1 < desc->n) { /* only record matrix-matrix multiplication */
429 const unsigned long long kernel_size = LIBXSMM_MNK_SIZE(desc->m, desc->n, desc->k);
430 const int idx = (LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_OUT(desc->datatype) ? 0 : 1);
431 int bucket;
432 if (LIBXSMM_MNK_SIZE(internal_statistic_sml, internal_statistic_sml, internal_statistic_sml) >= kernel_size) {
433 bucket = 0;
434 }
435 else if (LIBXSMM_MNK_SIZE(internal_statistic_med, internal_statistic_med, internal_statistic_med) >= kernel_size) {
436 bucket = 1;
437 }
438 else if (LIBXSMM_MNK_SIZE(internal_statistic_mnk, internal_statistic_mnk, internal_statistic_mnk) >= kernel_size) {
439 bucket = 2;
440 }
441 else { /*huge*/
442 bucket = 3;
443 }
444 if (0 != ncol) ncol/*dummy assignment*/ = LIBXSMM_ATOMIC_ADD_FETCH(&internal_statistic[idx][bucket].ncol, ncol, LIBXSMM_ATOMIC_RELAXED);
445 if (0 != ntry) ntry/*dummy assignment*/ = LIBXSMM_ATOMIC_ADD_FETCH(&internal_statistic[idx][bucket].ntry, ntry, LIBXSMM_ATOMIC_RELAXED);
446 /* the following counters are not manipulated concurrently (no need for atomic increment) */
447 if (0 != njit) internal_statistic[idx][bucket].njit += njit;
448 if (0 != nsta) internal_statistic[idx][bucket].nsta += nsta;
449 }
450 }
451
452
/**
 * Scales a counter for compact printing and yields the unit to be appended.
 * - n >= 1000000: result is n in millions (rounded to nearest), unit is 'm'.
 * - n > 9999: result is n in thousands (rounded to nearest), unit is 'k'.
 * - otherwise: result is n, and unit is default_unit.
 * The unit is stored into *unit (must not be NULL); the scaled number is returned.
 */
LIBXSMM_API_INLINE unsigned int internal_print_number(unsigned int n, char default_unit, char* unit)
{
  unsigned int number = n;
  LIBXSMM_ASSERT(NULL != unit);
  *unit = default_unit;
  if ((1000000) <= n) {
    /* round using quotient/remainder: "n + 500000" would wrap around for n close to UINT_MAX */
    number = n / 1000000 + (500000 <= (n % 1000000) ? 1 : 0);
    *unit = 'm';
  }
  else if (9999 < n) {
    /* n < 1000000 (no wrap-around possible); same rounding scheme for consistency */
    number = n / 1000 + (500 <= (n % 1000) ? 1 : 0);
    *unit = 'k';
  }
  return number;
}
468
469
internal_print_statistic(FILE * ostream,const char * target_arch,int precision,unsigned int linebreaks,unsigned int indent)470 LIBXSMM_API_INLINE unsigned int internal_print_statistic(FILE* ostream,
471 const char* target_arch, int precision, unsigned int linebreaks, unsigned int indent)
472 {
473 const internal_statistic_type statistic_sml = internal_statistic[precision][0/*SML*/];
474 const internal_statistic_type statistic_med = internal_statistic[precision][1/*MED*/];
475 const internal_statistic_type statistic_big = internal_statistic[precision][2/*BIG*/];
476 const internal_statistic_type statistic_xxx = internal_statistic[precision][3/*XXX*/];
477 int printed = 0;
478 LIBXSMM_ASSERT(NULL != ostream && (0 <= precision && precision < 2));
479
480 if (/* omit to print anything if it is superfluous */
481 0 != statistic_sml.ntry || 0 != statistic_sml.njit || 0 != statistic_sml.nsta || 0 != statistic_sml.ncol ||
482 0 != statistic_med.ntry || 0 != statistic_med.njit || 0 != statistic_med.nsta || 0 != statistic_med.ncol ||
483 0 != statistic_big.ntry || 0 != statistic_big.njit || 0 != statistic_big.nsta || 0 != statistic_big.ncol ||
484 0 != statistic_xxx.ntry || 0 != statistic_xxx.njit || 0 != statistic_xxx.nsta || 0 != statistic_xxx.ncol)
485 {
486 char title[256], range[256], unit[4];
487 unsigned int counter[4];
488 {
489 unsigned int n;
490 if (NULL != target_arch && 0 != *target_arch) {
491 assert(strlen(target_arch) < sizeof(title)); /* !LIBXSMM_ASSERT */
492 for (n = 0; 0 != target_arch[n] /*avoid code-gen. issue with some clang versions: && n < sizeof(title)*/; ++n) {
493 const char c = target_arch[n];
494 title[n] = (char)(('a' <= c && c <= 'z') ? (c - 32) : c); /* toupper */
495 }
496 LIBXSMM_SNPRINTF(title + n, sizeof(title) - n, "/%s", 0 == precision ? "DP" : "SP");
497 }
498 else {
499 LIBXSMM_SNPRINTF(title, sizeof(title), "%s", 0 == precision ? "DP" : "SP");
500 }
501 for (n = 0; n < linebreaks; ++n) fprintf(ostream, "\n");
502 }
503 fprintf(ostream, "%*s%-8s %6s %6s %6s %6s\n", (int)indent, "", title, "TRY", "JIT", "STA", "COL");
504 LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", 0u, internal_statistic_sml);
505 counter[0] = internal_print_number(statistic_sml.ntry, ' ', unit + 0);
506 counter[1] = internal_print_number(statistic_sml.njit, ' ', unit + 1);
507 counter[2] = internal_print_number(statistic_sml.nsta, ' ', unit + 2);
508 counter[3] = internal_print_number(statistic_sml.ncol, ' ', unit + 3);
509 fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range,
510 counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]);
511 LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", internal_statistic_sml + 1u, internal_statistic_med);
512 counter[0] = internal_print_number(statistic_med.ntry, ' ', unit + 0);
513 counter[1] = internal_print_number(statistic_med.njit, ' ', unit + 1);
514 counter[2] = internal_print_number(statistic_med.nsta, ' ', unit + 2);
515 counter[3] = internal_print_number(statistic_med.ncol, ' ', unit + 3);
516 fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range,
517 counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]);
518 LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", internal_statistic_med + 1u, internal_statistic_mnk);
519 counter[0] = internal_print_number(statistic_big.ntry, ' ', unit + 0);
520 counter[1] = internal_print_number(statistic_big.njit, ' ', unit + 1);
521 counter[2] = internal_print_number(statistic_big.nsta, ' ', unit + 2);
522 counter[3] = internal_print_number(statistic_big.ncol, ' ', unit + 3);
523 fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range,
524 counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]);
525 if (0 != statistic_xxx.ntry || 0 != statistic_xxx.njit || 0 != statistic_xxx.nsta || 0 != statistic_xxx.ncol) {
526 LIBXSMM_SNPRINTF(range, sizeof(range), "> %u", internal_statistic_mnk);
527 counter[0] = internal_print_number(statistic_xxx.ntry, ' ', unit + 0);
528 counter[1] = internal_print_number(statistic_xxx.njit, ' ', unit + 1);
529 counter[2] = internal_print_number(statistic_xxx.nsta, ' ', unit + 2);
530 counter[3] = internal_print_number(statistic_xxx.ncol, ' ', unit + 3);
531 fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range,
532 counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]);
533 }
534 printed = 1;
535 }
536
537 return printed;
538 }
539
540
541 #if !(defined(_WIN32) || defined(__CYGWIN__))
internal_statistic_ntry(int precision)542 LIBXSMM_API_INLINE unsigned int internal_statistic_ntry(int precision)
543 {
544 return internal_statistic[precision][0/*SML*/].ntry + internal_statistic[precision][1/*MED*/].ntry
545 + internal_statistic[precision][2/*BIG*/].ntry + internal_statistic[precision][3/*XXX*/].ntry;
546 }
547 #endif
548
549
550 #if !defined(_WIN32)
internal_register_static_code(libxsmm_gemm_precision precision,libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_xmmfunction xgemm,libxsmm_code_pointer * registry)551 LIBXSMM_API_INLINE void internal_register_static_code(
552 libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
553 libxsmm_xmmfunction xgemm, libxsmm_code_pointer* registry)
554 {
555 const libxsmm_blasint lda = m, ldb = k, ldc = m;
556 /*const*/ int precondition = LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k) && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc);
557 if (precondition) {
558 const size_t size = (LIBXSMM_HASH_SIZE) - sizeof(libxsmm_descriptor_kind);
559 libxsmm_descriptor_blob blob;
560 const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_dinit(&blob, precision,
561 m, n, k, lda, ldb, ldc, LIBXSMM_ALPHA, LIBXSMM_BETA, LIBXSMM_FLAGS, INTERNAL_PREFETCH);
562 unsigned int i = LIBXSMM_MOD2(
563 libxsmm_crc32(LIBXSMM_HASH_SEED, desc, LIBXSMM_MIN(sizeof(libxsmm_gemm_descriptor), size)),
564 LIBXSMM_CAPACITY_REGISTRY);
565 libxsmm_code_pointer* dst_entry = registry + i;
566 #if !defined(NDEBUG)
567 libxsmm_code_pointer code; code.xgemm = xgemm;
568 LIBXSMM_ASSERT(NULL != code.ptr_const && NULL != registry);
569 LIBXSMM_ASSERT(0 == (LIBXSMM_CODE_STATIC & code.uval));
570 #endif
571 if (NULL != dst_entry->ptr_const) { /* collision */
572 const unsigned int i0 = i;
573 do { /* continue to linearly search for an available slot */
574 i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY);
575 if (NULL == registry[i].ptr_const) break;
576 } while (i != i0);
577 #if defined(LIBXSMM_HASH_COLLISION) /* mark entry as a collision */
578 dst_entry->uval |= LIBXSMM_HASH_COLLISION;
579 #endif
580 dst_entry = registry + i; /* update destination */
581 internal_update_mmstatistic(desc, 0, 1/*collision*/, 0, 0);
582 /* out of capacity (no registry slot available) */
583 LIBXSMM_ASSERT(NULL == dst_entry->ptr_const || i == i0);
584 }
585 if (NULL == dst_entry->ptr_const) { /* registry not exhausted */
586 internal_registry_keys[i].kind = LIBXSMM_KERNEL_KIND_MATMUL;
587 LIBXSMM_ASSIGN127(&internal_registry_keys[i].gemm.desc, desc);
588 dst_entry->xgemm = xgemm;
589 /* mark current entry as static code (non-JIT) */
590 dst_entry->uval |= LIBXSMM_CODE_STATIC;
591 }
592 internal_update_mmstatistic(desc, 1/*try*/, 0, 0, 0);
593 }
594 }
595 #endif
596
597
/**
 * Final cleanup which internal_finalize registers by means of atexit:
 * releases the scratch memory (no lock is passed), and then finalizes
 * the memory, hash, and malloc services in exactly this order.
 */
LIBXSMM_API_INTERN void internal_release_scratch(void);
LIBXSMM_API_INTERN void internal_release_scratch(void)
{
  libxsmm_xrelease_scratch(NULL/*lock*/);
  /* release global services */
  libxsmm_memory_finalize();
  libxsmm_hash_finalize();
  libxsmm_malloc_finalize();
}
607
608
609 /* Caution: cannot be used multiple time in a single expression! */
libxsmm_format_size(char buffer[32],int buffer_size,size_t nbytes,const char scale[],const char * unit,int base)610 LIBXSMM_API_INTERN size_t libxsmm_format_size(char buffer[32], int buffer_size, size_t nbytes, const char scale[], const char* unit, int base)
611 {
612 const int len = (NULL != scale ? ((int)strlen(scale)) : 0);
613 const int m = LIBXSMM_INTRINSICS_BITSCANBWD64(nbytes) / base, n = LIBXSMM_MIN(m, len);
614 int i;
615 buffer[0] = 0; /* clear */
616 LIBXSMM_ASSERT(NULL != unit && 0 <= base);
617 for (i = 0; i < n; ++i) nbytes >>= base;
618 LIBXSMM_SNPRINTF(buffer, buffer_size, "%i %c%s",
619 (int)nbytes, 0 < n ? scale[n-1] : *unit, 0 < n ? unit : "");
620 return nbytes;
621 }
622
623
624 LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE void internal_dump(FILE* ostream, int urgent);
internal_dump(FILE * ostream,int urgent)625 LIBXSMM_API_INTERN void internal_dump(FILE* ostream, int urgent)
626 {
627 char *const env_dump_build = getenv("LIBXSMM_DUMP_BUILD");
628 char *const env_dump_files = (NULL != getenv("LIBXSMM_DUMP_FILES")
629 ? getenv("LIBXSMM_DUMP_FILES")
630 : getenv("LIBXSMM_DUMP_FILE"));
631 LIBXSMM_ASSERT_MSG(INTERNAL_SINGLETON(internal_singleton_handle), "Invalid handle");
632 /* determine whether this instance is unique or not */
633 if (NULL != env_dump_files && 0 != *env_dump_files && 0 == urgent) { /* dump per-node info */
634 const char* filename = strtok(env_dump_files, INTERNAL_DELIMS);
635 for (; NULL != filename; filename = strtok(NULL, INTERNAL_DELIMS)) {
636 FILE* const file = fopen(filename, "r");
637 if (NULL != file) {
638 int c = fgetc(file);
639 fprintf(ostream, "\n\nLIBXSMM_DUMP_FILE: %s\n", filename);
640 /* coverity[tainted_data] */
641 while (EOF != c) {
642 fputc(c, stdout);
643 c = fgetc(file);
644 }
645 fputc('\n', stdout);
646 fclose(file);
647 }
648 }
649 }
650 if (NULL != internal_build_state /* dump build state */
651 && NULL != env_dump_build && 0 != *env_dump_build)
652 {
653 const int dump_build = atoi(env_dump_build);
654 if (0 == urgent ? (0 < dump_build) : (0 > dump_build)) {
655 fprintf(ostream, "\n\nBUILD_DATE=%i\n", LIBXSMM_CONFIG_BUILD_DATE);
656 fprintf(ostream, "%s\n", internal_build_state);
657 }
658 }
659 }
660
661
662 LIBXSMM_API_INTERN void internal_finalize(void);
internal_finalize(void)663 LIBXSMM_API_INTERN void internal_finalize(void)
664 {
665 libxsmm_finalize();
666 LIBXSMM_STDIO_ACQUIRE(); /* synchronize I/O */
667 if (0 != libxsmm_verbosity) { /* print statistic on termination */
668 const char *const env_target_hidden = getenv("LIBXSMM_TARGET_HIDDEN");
669 const char *const target_arch = (NULL == env_target_hidden || 0 == atoi(env_target_hidden))
670 ? libxsmm_cpuid_name(libxsmm_target_archid) : NULL/*hidden*/;
671 fprintf(stderr, "\nLIBXSMM_VERSION: %s%s%s (%i)", LIBXSMM_BRANCH,
672 0 != *(LIBXSMM_BRANCH) ? "-" : "", 0 != *(LIBXSMM_VERSION) ? (LIBXSMM_VERSION) : "unconfigured",
673 LIBXSMM_VERSION4(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE, LIBXSMM_VERSION_PATCH));
674 if (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) {
675 unsigned int linebreak = (0 == internal_print_statistic(stderr, target_arch, 1/*SP*/, 1, 0)) ? 1 : 0;
676 const int high_verbosity = (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity);
677 size_t size_scratch = 0, size_private = 0;
678 libxsmm_scratch_info scratch_info;
679 libxsmm_cpuid_x86_info info;
680 libxsmm_cpuid_x86(&info);
681 if ((LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) &&
682 0 == internal_cpuid_info.has_context && 0 != info.has_context)
683 {
684 fprintf(stderr, "\nLIBXSMM: CPU features have been promoted.");
685 }
686 if (0 == internal_print_statistic(stderr, target_arch, 0/*DP*/, linebreak, 0) && 0 != linebreak && NULL != target_arch) {
687 fprintf(stderr, "\nLIBXSMM_TARGET: %s\n", target_arch);
688 }
689 if (EXIT_SUCCESS == libxsmm_get_scratch_info(&scratch_info)) {
690 size_private = scratch_info.internal;
691 size_scratch = scratch_info.size;
692 }
693 if (0 != size_private) { /* should be always true */
694 char size_private_buffer[32], size_code_buffer[32];
695 /* coverity[check_return] */
696 libxsmm_format_size(size_private_buffer, sizeof(size_private_buffer), size_private, "KM", "B", 10);
697 fprintf(stderr, "Registry and code: %s", size_private_buffer);
698 if (0 != libxsmm_format_size(size_code_buffer, sizeof(size_code_buffer), internal_registry_nbytes, "KM", "B", 10)) {
699 fprintf(stderr, " + %s", size_code_buffer);
700 }
701 }
702 if (0 != high_verbosity) {
703 unsigned int ngemms = 0;
704 int i; for (i = 0; i < 4; ++i) {
705 ngemms += internal_statistic[0/*DP*/][i].nsta + internal_statistic[1/*SP*/][i].nsta;
706 ngemms += internal_statistic[0/*DP*/][i].njit + internal_statistic[1/*SP*/][i].njit;
707 }
708 if (0 != ngemms || 0 != internal_statistic_num_gemv
709 || 0 != internal_statistic_num_mcopy || 0 != internal_statistic_num_tcopy
710 || 0 != libxsmm_statistic_num_spmdm
711 || 0 != internal_statistic_num_user
712 || 0 != internal_registry_nleaks)
713 {
714 const char sep[] = " ", *s = "";
715 fprintf(stderr, " (");
716 if (0 != ngemms) { fprintf(stderr, "gemm=%u", ngemms); s = sep; }
717 if (0 != internal_statistic_num_gemv) { fprintf(stderr, "%sgemv=%u", s, internal_statistic_num_gemv); s = sep; }
718 if (0 != internal_statistic_num_mcopy) { fprintf(stderr, "%smcopy=%u", s, internal_statistic_num_mcopy); s = sep; }
719 if (0 != internal_statistic_num_meltw) { fprintf(stderr, "%smeltw=%u", s, internal_statistic_num_meltw); s = sep; }
720 if (0 != internal_statistic_num_tcopy) { fprintf(stderr, "%stcopy=%u", s, internal_statistic_num_tcopy); s = sep; }
721 if (0 != libxsmm_statistic_num_spmdm) { fprintf(stderr, "%sspmdm=%u", s, libxsmm_statistic_num_spmdm); s = sep; }
722 if (0 != internal_statistic_num_user) { fprintf(stderr, "%suser=%u", s, internal_statistic_num_user); s = sep; }
723 if (0 != internal_registry_nleaks) { fprintf(stderr, "%snleaks=%u", s, internal_registry_nleaks); s = sep; }
724 fprintf(stderr, ")");
725 }
726 }
727 fprintf(stderr, "\n");
728 if (0 != size_scratch) {
729 char size_scratch_buffer[32];
730 /* coverity[check_return] */
731 libxsmm_format_size(size_scratch_buffer, sizeof(size_scratch_buffer), size_scratch, "KM", "B", 10);
732 fprintf(stderr, "Scratch: %s", size_scratch_buffer);
733 if (0 != high_verbosity) {
734 fprintf(stderr, " (mallocs=%lu, pools=%u)\n", (unsigned long int)scratch_info.nmallocs, scratch_info.npools);
735 }
736 else {
737 fprintf(stderr, "\n");
738 }
739 }
740 if (LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) {
741 fprintf(stderr, "Uptime: %f s", libxsmm_timer_duration(internal_timer_start, libxsmm_timer_tick()));
742 if (1 < libxsmm_thread_count && INT_MAX == libxsmm_verbosity) {
743 fprintf(stderr, " (nthreads=%u)", libxsmm_thread_count);
744 }
745 fprintf(stderr, "\n");
746 }
747 }
748 else {
749 fprintf(stderr, "\nLIBXSMM_TARGET: %s\n", target_arch);
750 }
751 }
752 /* release scratch memory pool */
753 if (EXIT_SUCCESS != atexit(internal_release_scratch) && 0 != libxsmm_verbosity) {
754 fprintf(stderr, "LIBXSMM ERROR: failed to perform final cleanup!\n");
755 }
756 /* determine whether this instance is unique or not */
757 if (INTERNAL_SINGLETON(internal_singleton_handle)) {
758 internal_dump(stdout, 0/*urgent*/);
759 /* cleanup singleton */
760 #if defined(_WIN32)
761 ReleaseMutex(internal_singleton_handle);
762 CloseHandle(internal_singleton_handle);
763 #else
764 unlink(internal_singleton_fname);
765 close(internal_singleton_handle);
766 #endif
767 }
768 LIBXSMM_STDIO_RELEASE(); /* synchronize I/O */
769 #if (0 != LIBXSMM_SYNC)
770 { /* release locks */
771 # if (1 < INTERNAL_REGLOCK_MAXN)
772 int i; for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_DESTROY(LIBXSMM_REGLOCK, &internal_reglock[i].state);
773 # elif !defined(LIBXSMM_UNIFY_LOCKS)
774 LIBXSMM_LOCK_DESTROY(LIBXSMM_REGLOCK, internal_reglock_ptr);
775 # endif
776 LIBXSMM_LOCK_DESTROY(LIBXSMM_LOCK, &libxsmm_lock_global);
777 }
778 #endif
779 }
780
781
782 #if defined(LIBXSMM_INTERCEPT_DYNAMIC)
783 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void _gfortran_stop_string(const char* /*message*/, int /*len*/, int /*quiet*/);
_gfortran_stop_string(const char * message,int len,int quiet)784 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void _gfortran_stop_string(const char* message, int len, int quiet)
785 { /* STOP termination handler for GNU Fortran runtime */
786 static int once = 0;
787 if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) {
788 union { const void* dlsym; void (*ptr)(const char*, int, int); } stop;
789 dlerror(); /* clear an eventual error status */
790 stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "_gfortran_stop_string");
791 if (NULL != stop.dlsym) {
792 stop.ptr(message, len, quiet);
793 }
794 else exit(EXIT_SUCCESS); /* statically linked runtime */
795 }
796 }
797
798 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core(const char* /*message*/, int /*len*/);
for_stop_core(const char * message,int len)799 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core(const char* message, int len)
800 { /* STOP termination handler for Intel Fortran runtime */
801 static int once = 0;
802 if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) {
803 union { const void* dlsym; void (*ptr)(const char*, int); } stop;
804 dlerror(); /* clear an eventual error status */
805 stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "for_stop_core");
806 if (NULL != stop.dlsym) {
807 stop.ptr(message, len);
808 }
809 else exit(EXIT_SUCCESS); /* statically linked runtime */
810 }
811 }
812
813 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core_quiet(void);
for_stop_core_quiet(void)814 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core_quiet(void)
815 { /* STOP termination handler for Intel Fortran runtime */
816 static int once = 0;
817 if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) {
818 union { const void* dlsym; void (*ptr)(void); } stop;
819 dlerror(); /* clear an eventual error status */
820 stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "for_stop_core_quiet");
821 if (NULL != stop.dlsym) {
822 stop.ptr();
823 }
824 else exit(EXIT_SUCCESS); /* statically linked runtime */
825 }
826 }
827 #endif
828
829
LIBXSMM_API_INTERN size_t internal_strlen(const char* /*cstr*/, size_t /*maxlen*/);
/*
 * Bounded string length.
 * cstr: string to measure (may be NULL, which yields zero).
 * maxlen: maximum number of characters to inspect.
 * Returns the number of characters preceding the terminator, but at most maxlen.
 * At most maxlen characters of cstr are read, i.e., cstr[maxlen] is never accessed.
 */
LIBXSMM_API_INTERN size_t internal_strlen(const char* cstr, size_t maxlen)
{
  size_t result = 0;
  if (NULL != cstr) {
    /* test the bound before dereferencing to never read beyond maxlen characters */
    while (result < maxlen && 0 != cstr[result]) ++result;
  }
  return result;
}
839
840
841 LIBXSMM_API_INTERN size_t internal_parse_nbytes(const char* /*nbytes*/, size_t /*ndefault*/);
internal_parse_nbytes(const char * nbytes,size_t ndefault)842 LIBXSMM_API_INTERN size_t internal_parse_nbytes(const char* nbytes, size_t ndefault)
843 {
844 size_t result = ndefault;
845 if (NULL != nbytes && 0 != *nbytes) {
846 size_t u = internal_strlen(nbytes, 32) - 1;
847 const char unit[] = "kmgKMG", *const hit = strchr(unit, nbytes[u]);
848 const long long int ibytes = atol(nbytes); /* take with increased type-width */
849 result = (size_t)ibytes;
850 if ((size_t)LIBXSMM_UNLIMITED != result) {
851 u = (0 != hit ? ((hit - unit) % 3) : 3);
852 if (u < 3) {
853 result <<= (u + 1) * 10;
854 }
855 }
856 }
857 return result;
858 }
859
860
861 LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE void internal_init(void);
internal_init(void)862 LIBXSMM_API_INTERN void internal_init(void)
863 {
864 int i;
865 #if (0 != LIBXSMM_SYNC) /* setup the locks in a thread-safe fashion */
866 LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, &libxsmm_lock_global);
867 # if (1 < INTERNAL_REGLOCK_MAXN)
868 for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, &internal_reglock[i].state);
869 # elif !defined(LIBXSMM_UNIFY_LOCKS)
870 LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, internal_reglock_ptr);
871 # endif
872 #endif
873 if (NULL == internal_registry) { /* double-check after acquiring the lock(s) */
874 #if defined(LIBXSMM_INTERCEPT_DYNAMIC) && defined(LIBXSMM_AUTOPIN)
875 /* clear error status (dummy condition: it does not matter if MPI_Init or MPI_Abort) */
876 const char* const dlsymname = (NULL == dlerror() ? "MPI_Init" : "MPI_Abort");
877 const void* const dlsymbol = dlsym(LIBXSMM_RTLD_NEXT, dlsymname);
878 const void* const dlmpi = (NULL == dlerror() ? dlsymbol : NULL);
879 #endif
880 const char* const env_verbose = getenv("LIBXSMM_VERBOSE");
881 void* new_registry = NULL, * new_keys = NULL;
882 #if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
883 # if defined(LIBXSMM_NTHREADS_USE)
884 void* new_cache = NULL;
885 # endif
886 const char* const env_cache = getenv("LIBXSMM_CACHE");
887 if (NULL != env_cache && 0 != *env_cache) {
888 const int cache_size = atoi(env_cache), cache_size2 = LIBXSMM_UP2POT(cache_size);
889 internal_cache_size = LIBXSMM_MIN(cache_size2, LIBXSMM_CACHE_MAXSIZE);
890 }
891 else {
892 internal_cache_size = LIBXSMM_CACHE_MAXSIZE;
893 }
894 #endif
895 /* setup verbosity as early as possible since below code may rely on verbose output */
896 if (NULL != env_verbose && 0 != *env_verbose) {
897 libxsmm_verbosity = atoi(env_verbose);
898 }
899 #if !defined(NDEBUG)
900 else {
901 libxsmm_verbosity = INT_MAX; /* quiet -> verbose */
902 }
903 #endif
904 #if (0 == LIBXSMM_JIT)
905 if (2 > libxsmm_ninit && (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity)) {
906 fprintf(stderr, "LIBXSMM: JIT-code generation was disabled at compile-time.\n");
907 }
908 #endif
909 #if defined(LIBXSMM_AUTOPIN)
910 # if defined(LIBXSMM_INTERCEPT_DYNAMIC)
911 /* MPI: unwanted affinity can slow-down unrelated jobs (over-subscription), e.g., CP2K regtests */
912 if (NULL == dlmpi)
913 # endif
914 { /* setup some viable affinity if nothing else is present */
915 const char *const gomp_cpu_affinity = getenv("GOMP_CPU_AFFINITY");
916 const char *const kmp_affinity = getenv("KMP_AFFINITY");
917 const char *const omp_proc_bind = getenv("OMP_PROC_BIND");
918 if ((NULL == gomp_cpu_affinity || 0 == *gomp_cpu_affinity)
919 && (NULL == kmp_affinity || 0 == *kmp_affinity)
920 && (NULL == omp_proc_bind || 0 == *omp_proc_bind))
921 {
922 static char affinity[] = "OMP_PROC_BIND=TRUE";
923 LIBXSMM_EXPECT(EXIT_SUCCESS, LIBXSMM_PUTENV(affinity));
924 if (LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) { /* library code is expected to be mute */
925 fprintf(stderr, "LIBXSMM: prepared to pin threads.\n");
926 }
927 }
928 }
929 # if defined(LIBXSMM_INTERCEPT_DYNAMIC) && defined(LIBXSMM_MALLOC)
930 else if (NULL == getenv("I_MPI_SHM_HEAP")) {
931 static char shmheap[] = "I_MPI_SHM_HEAP=1";
932 LIBXSMM_EXPECT(EXIT_SUCCESS, LIBXSMM_PUTENV(shmheap));
933 }
934 # endif
935 #endif
936 #if !defined(_WIN32) && 0
937 umask(S_IRUSR | S_IWUSR); /* setup default/secure file mask */
938 #endif
939 #if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))
940 { const char *const env = getenv("LIBXSMM_SCRATCH_POOLS");
941 if (NULL == env || 0 == *env) {
942 libxsmm_scratch_pools = LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS;
943 }
944 else {
945 libxsmm_scratch_pools = LIBXSMM_CLMP(atoi(env), 0, LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS);
946 /*libxsmm_scratch_pools_locked = 1;*/
947 }
948 LIBXSMM_ASSERT(libxsmm_scratch_pools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS);
949 }
950 { const char *const env = getenv("LIBXSMM_SCRATCH_SCALE");
951 if (NULL == env || 0 == *env) {
952 libxsmm_scratch_scale = LIBXSMM_MALLOC_SCRATCH_SCALE;
953 }
954 else {
955 libxsmm_scratch_scale = LIBXSMM_CLMP(atof(env), 1.0, 10.0);
956 /*libxsmm_scratch_scale_locked = 1;*/
957 }
958 LIBXSMM_ASSERT(1 <= libxsmm_scratch_scale);
959 }
960 libxsmm_set_scratch_limit(internal_parse_nbytes(getenv("LIBXSMM_SCRATCH_LIMIT"), LIBXSMM_SCRATCH_DEFAULT));
961 #endif /*defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/
962 { /* setup malloc-interception after internal allocations */
963 const libxsmm_malloc_function null_malloc_fn = { 0 };
964 const libxsmm_free_function null_free_fn = { 0 };
965 const char *const env_k = getenv("LIBXSMM_MALLOC");
966 char *const env_t = getenv("LIBXSMM_MALLOC_LIMIT");
967 const char* env_i = (NULL != env_t ? strtok(env_t, INTERNAL_DELIMS) : NULL);
968 const size_t malloc_lo = internal_parse_nbytes(env_i, LIBXSMM_MALLOC_LIMIT);
969 const size_t malloc_hi = (NULL != env_i ? internal_parse_nbytes(
970 strtok(NULL, INTERNAL_DELIMS), LIBXSMM_SCRATCH_UNLIMITED) : LIBXSMM_SCRATCH_UNLIMITED);
971 const int malloc_kind = ((NULL == env_k || 0 == *env_k) ? 0/*disabled*/ : atoi(env_k));
972 libxsmm_xset_default_allocator(NULL/*lock*/, NULL/*context*/, null_malloc_fn, null_free_fn);
973 libxsmm_xset_scratch_allocator(NULL/*lock*/, NULL/*context*/, null_malloc_fn, null_free_fn);
974 libxsmm_set_malloc(malloc_kind, &malloc_lo, &malloc_hi); /* implies libxsmm_malloc_init */
975 }
976 #if defined(LIBXSMM_MAXTARGET)
977 libxsmm_set_target_arch(LIBXSMM_STRINGIFY(LIBXSMM_MAXTARGET));
978 #else /* attempt to set libxsmm_target_archid per environment variable */
979 libxsmm_set_target_arch(getenv("LIBXSMM_TARGET"));
980 #endif
981 { const char *const env = getenv("LIBXSMM_SYNC");
982 libxsmm_nosync = (NULL == env || 0 == *env) ? 0/*default*/ : atoi(env);
983 }
984 /* clear internal counters/statistic */
985 for (i = 0; i < 4/*sml/med/big/xxx*/; ++i) {
986 LIBXSMM_MEMZERO127(&internal_statistic[0/*DP*/][i]);
987 LIBXSMM_MEMZERO127(&internal_statistic[1/*SP*/][i]);
988 }
989 internal_statistic_mnk = LIBXSMM_MAX_DIM;
990 internal_statistic_sml = 13;
991 internal_statistic_med = 23;
992 LIBXSMM_ASSERT(LIBXSMM_CAPACITY_REGISTRY == LIBXSMM_UP2POT(LIBXSMM_CAPACITY_REGISTRY));
993 libxsmm_hash_init(libxsmm_target_archid); /* used by debug memory allocation (checksum) */
994 libxsmm_memory_init(libxsmm_target_archid);
995 if (
996 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
997 (EXIT_SUCCESS == libxsmm_xmalloc(&new_cache, /* if internal_cache_size is zero, allocation must still happen (later control-flow too expensive) */
998 sizeof(internal_cache_type) * (LIBXSMM_NTHREADS_MAX), LIBXSMM_CACHELINE/*alignment*/,
999 LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_cache) &&
1000 #endif
1001 (EXIT_SUCCESS == libxsmm_xmalloc(&new_keys, (LIBXSMM_CAPACITY_REGISTRY) * sizeof(libxsmm_descriptor), 0/*auto-align*/,
1002 LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_keys) &&
1003 (EXIT_SUCCESS == libxsmm_xmalloc(&new_registry, (LIBXSMM_CAPACITY_REGISTRY) * sizeof(libxsmm_code_pointer), 0/*auto-align*/,
1004 LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_registry))
1005 {
1006 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1007 LIBXSMM_ASSERT(NULL != new_cache); /* SA: suppress false positive */
1008 memset(new_cache, 0, (LIBXSMM_NTHREADS_MAX) * sizeof(internal_cache_type));
1009 #endif
1010 libxsmm_xcopy_init(libxsmm_target_archid);
1011 libxsmm_dnn_init(libxsmm_target_archid);
1012 #if defined(LIBXSMM_PERF)
1013 libxsmm_perf_init();
1014 #endif
1015 { const char *const env = getenv("LIBXSMM_GEMM_PREFETCH");
1016 #if defined(_WIN32) || defined(__CYGWIN__)
1017 libxsmm_gemm_auto_prefetch_default = INTERNAL_PREFETCH;
1018 #else
1019 libxsmm_gemm_auto_prefetch_default = (0 == internal_statistic_ntry(0/*DP*/) && 0 == internal_statistic_ntry(1/*SP*/))
1020 /* avoid special prefetch if static code is present, since such code uses INTERNAL_PREFETCH */
1021 ? (((LIBXSMM_X86_AVX512 >= libxsmm_target_archid || LIBXSMM_X86_AVX512_CORE <= libxsmm_target_archid))
1022 ? LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C : LIBXSMM_GEMM_PREFETCH_BL2_VIA_C)
1023 : INTERNAL_PREFETCH;
1024 #endif
1025 libxsmm_gemm_auto_prefetch = INTERNAL_PREFETCH;
1026 if (NULL != env && 0 != *env) { /* user input beyond auto-prefetch is always considered */
1027 const int uid = atoi(env);
1028 if (0 <= uid) {
1029 libxsmm_gemm_auto_prefetch_default = libxsmm_gemm_uid2prefetch(uid);
1030 libxsmm_gemm_auto_prefetch = libxsmm_gemm_auto_prefetch_default;
1031 internal_gemm_auto_prefetch_locked = 1;
1032 }
1033 }
1034 }
1035 for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) ((libxsmm_code_pointer*)new_registry)[i].ptr = NULL;
1036 LIBXSMM_ASSERT(NULL == internal_registry && NULL == internal_registry_keys);
1037 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1038 LIBXSMM_ASSERT(NULL == internal_cache_buffer);
1039 internal_cache_buffer = (internal_cache_type*)new_cache;
1040 #endif
1041 internal_registry_keys = (libxsmm_descriptor*)new_keys; /* prior to registering static kernels */
1042 #if defined(LIBXSMM_BUILD) && !defined(LIBXSMM_DEFAULT_CONFIG)
1043 # include <libxsmm_dispatch.h>
1044 #endif
1045 libxsmm_gemm_init(libxsmm_target_archid);
1046 #if defined(LIBXSMM_TRACE)
1047 { int filter_threadid = 0/*only main-thread*/, filter_mindepth = 0, filter_maxnsyms = 0;
1048 const int init_code = libxsmm_trace_init(filter_threadid, filter_mindepth, filter_maxnsyms);
1049 if (EXIT_SUCCESS != init_code && 0 != libxsmm_verbosity) { /* library code is expected to be mute */
1050 fprintf(stderr, "LIBXSMM ERROR: failed to initialize TRACE (error #%i)!\n", init_code);
1051 }
1052 }
1053 #endif
1054 { /* commit the registry buffer and enable global visibility */
1055 void *const pv_registry = &internal_registry;
1056 LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)((void**)pv_registry, (void*)new_registry, LIBXSMM_ATOMIC_SEQ_CST);
1057 }
1058 }
1059 else {
1060 if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
1061 fprintf(stderr, "LIBXSMM ERROR: failed to allocate internal buffers!\n");
1062 }
1063 libxsmm_xfree(new_registry, 0/*no check*/);
1064 libxsmm_xfree(new_keys, 0/*no check*/);
1065 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1066 libxsmm_xfree(new_cache, 0/*no check*/);
1067 #endif
1068 }
1069 }
1070 #if (0 != LIBXSMM_SYNC) /* release locks */
1071 # if (1 < INTERNAL_REGLOCK_MAXN)
1072 for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[i].state);
1073 # elif !defined(LIBXSMM_UNIFY_LOCKS)
1074 LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr);
1075 # endif
1076 LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, &libxsmm_lock_global);
1077 #endif
1078 }
1079
1080
libxsmm_init(void)1081 LIBXSMM_API LIBXSMM_ATTRIBUTE_CTOR void libxsmm_init(void)
1082 {
1083 if (0 == LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_RELAXED)) {
1084 static unsigned int ninit = 0, gid = 0;
1085 const unsigned int tid = LIBXSMM_ATOMIC_ADD_FETCH(&ninit, 1, LIBXSMM_ATOMIC_SEQ_CST);
1086 LIBXSMM_ASSERT(0 < tid);
1087 /* libxsmm_ninit (1: initialization started, 2: library initialized, higher: to invalidate code-TLS) */
1088 if (1 == tid) {
1089 libxsmm_timer_tickint s0 = libxsmm_timer_tick_rtc(); /* warm-up */
1090 libxsmm_timer_tickint t0 = libxsmm_timer_tick_tsc(); /* warm-up */
1091 s0 = libxsmm_timer_tick_rtc(); t0 = libxsmm_timer_tick_tsc(); /* start timing */
1092 assert(0 == LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_SEQ_CST)); /* !LIBXSMM_ASSERT */
1093 /* coverity[check_return] */
1094 LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_SEQ_CST);
1095 gid = tid; /* protect initialization */
1096 #if (0 != LIBXSMM_SYNC)
1097 /* coverity[check_return] */
1098 LIBXSMM_TLS_CREATE(&libxsmm_tlskey);
1099 { /* construct and initialize locks */
1100 # if defined(LIBXSMM_REGLOCK_TRY)
1101 const char *const env_trylock = getenv("LIBXSMM_TRYLOCK");
1102 # endif
1103 LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_LOCK) attr_global;
1104 # if (1 < INTERNAL_REGLOCK_MAXN)
1105 int i;
1106 LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_REGLOCK) attr;
1107 LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_REGLOCK, &attr);
1108 # elif defined(LIBXSMM_UNIFY_LOCKS)
1109 internal_reglock_ptr = &libxsmm_lock_global;
1110 # else
1111 static LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK) internal_reglock;
1112 internal_reglock_ptr = &internal_reglock;
1113 LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_REGLOCK) attr;
1114 LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_REGLOCK, &attr);
1115 LIBXSMM_LOCK_INIT(LIBXSMM_REGLOCK, internal_reglock_ptr, &attr);
1116 LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_REGLOCK, &attr);
1117 # endif
1118 LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_LOCK, &attr_global);
1119 LIBXSMM_LOCK_INIT(LIBXSMM_LOCK, &libxsmm_lock_global, &attr_global);
1120 LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_LOCK, &attr_global);
1121 /* control number of locks needed; LIBXSMM_TRYLOCK implies only 1 lock */
1122 # if defined(LIBXSMM_REGLOCK_TRY)
1123 if (NULL == env_trylock || 0 == *env_trylock)
1124 # endif
1125 { /* no LIBXSMM_TRYLOCK */
1126 # if defined(LIBXSMM_VTUNE)
1127 internal_reglock_count = 1; /* avoid duplicated kernels */
1128 # elif (1 < INTERNAL_REGLOCK_MAXN)
1129 const char *const env_nlocks = getenv("LIBXSMM_NLOCKS");
1130 const int reglock_count = (NULL == env_nlocks || 0 == *env_nlocks || 1 > atoi(env_nlocks))
1131 ? (INTERNAL_REGLOCK_MAXN) : LIBXSMM_MIN(atoi(env_nlocks), INTERNAL_REGLOCK_MAXN);
1132 internal_reglock_count = LIBXSMM_LO2POT(reglock_count);
1133 # else
1134 internal_reglock_count = 0;
1135 # endif
1136 }
1137 # if defined(LIBXSMM_REGLOCK_TRY)
1138 else { /* LIBXSMM_TRYLOCK environment variable specified */
1139 internal_reglock_count = (0 != atoi(env_trylock) ? 1
1140 # if (1 < INTERNAL_REGLOCK_MAXN)
1141 : INTERNAL_REGLOCK_MAXN);
1142 # else
1143 : 0);
1144 # endif
1145 }
1146 # endif
1147 # if (1 < INTERNAL_REGLOCK_MAXN)
1148 LIBXSMM_ASSERT(1 <= internal_reglock_count);
1149 for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_INIT(LIBXSMM_REGLOCK, &internal_reglock[i].state, &attr);
1150 LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_REGLOCK, &attr);
1151 # endif
1152 }
1153 #endif
1154 { /* determine whether this instance is unique or not */
1155 #if defined(_WIN32)
1156 internal_singleton_handle = CreateMutex(NULL, TRUE, "GlobalLIBXSMM");
1157 #else
1158 const int result = LIBXSMM_SNPRINTF(internal_singleton_fname, sizeof(internal_singleton_fname), "/tmp/.libxsmm.%u",
1159 /*rely on user id to avoid permission issues in case of left-over files*/(unsigned int)getuid());
1160 struct flock singleton_flock;
1161 int singleton_handle;
1162 singleton_flock.l_start = 0;
1163 singleton_flock.l_len = 0; /* entire file */
1164 singleton_flock.l_type = F_WRLCK; /* exclusive across PIDs */
1165 singleton_flock.l_whence = SEEK_SET;
1166 singleton_handle = ((0 < result && (int)sizeof(internal_singleton_fname) > result) ? open(
1167 internal_singleton_fname, O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR) : -1);
1168 internal_singleton_handle = fcntl(singleton_handle, F_SETLK, &singleton_flock);
1169 if (0 > internal_singleton_handle && 0 <= singleton_handle) close(singleton_handle);
1170 #endif /* coverity[leaked_handle] */
1171 }
1172 { /* calibrate timer */
1173 int register_termination_proc;
1174 libxsmm_timer_tickint s1, t1;
1175 internal_init(); /* must be first to initialize verbosity, etc. */
1176 if (INTERNAL_SINGLETON(internal_singleton_handle)) { /* after internal_init */
1177 internal_dump(stdout, 1/*urgent*/);
1178 }
1179 s1 = libxsmm_timer_tick_rtc(); t1 = libxsmm_timer_tick_tsc(); /* mid-timing */
1180 libxsmm_cpuid_x86(&internal_cpuid_info);
1181 if (0 != internal_cpuid_info.constant_tsc && t0 < t1) {
1182 libxsmm_timer_scale = libxsmm_timer_duration_rtc(s0, s1) / (t1 - t0);
1183 }
1184 register_termination_proc = atexit(internal_finalize);
1185 s1 = libxsmm_timer_tick_rtc(); t1 = libxsmm_timer_tick_tsc(); /* final timing */
1186 /* set timer-scale and determine start of the "uptime" (shown at termination) */
1187 if (t0 < t1 && 0.0 < libxsmm_timer_scale) {
1188 const double scale = libxsmm_timer_duration_rtc(s0, s1) / (t1 - t0);
1189 const double diff = LIBXSMM_DELTA(libxsmm_timer_scale, scale) / scale;
1190 if (5E-5 > diff) {
1191 libxsmm_timer_scale = scale;
1192 internal_timer_start = t0;
1193 }
1194 else {
1195 libxsmm_timer_scale = 0;
1196 internal_timer_start = s0;
1197 #if !defined(NDEBUG)
1198 libxsmm_se = 1;
1199 #endif
1200 }
1201 }
1202 else {
1203 internal_timer_start = s0;
1204 libxsmm_timer_scale = 0;
1205 }
1206 if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
1207 if (EXIT_SUCCESS != register_termination_proc) {
1208 fprintf(stderr, "LIBXSMM ERROR: failed to register termination procedure!\n");
1209 }
1210 if (0 == libxsmm_timer_scale) {
1211 fprintf(stderr, "LIBXSMM WARNING: timer is maybe not cycle-accurate!\n");
1212 }
1213 }
1214 }
1215 assert(1 == LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_SEQ_CST)); /* !LIBXSMM_ASSERT */
1216 /* coverity[check_return] */
1217 LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_SEQ_CST);
1218 }
1219 else /*if (gid != tid)*/ { /* avoid recursion */
1220 LIBXSMM_ASSERT(gid != tid);
1221 while (2 > LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_RELAXED)) LIBXSMM_SYNC_YIELD;
1222 internal_init();
1223 }
1224 }
1225 LIBXSMM_ASSERT(1 < libxsmm_ninit);
1226 }
1227
1228
1229 LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE void libxsmm_finalize(void);
libxsmm_finalize(void)1230 LIBXSMM_API LIBXSMM_ATTRIBUTE_DTOR void libxsmm_finalize(void)
1231 {
1232 void *const regaddr = &internal_registry;
1233 uintptr_t regptr = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_RELAXED);
1234 libxsmm_code_pointer* registry = (libxsmm_code_pointer*)regptr;
1235 if (NULL != registry) {
1236 int i;
1237 #if (0 != LIBXSMM_SYNC)
1238 LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, &libxsmm_lock_global);
1239 # if (1 < INTERNAL_REGLOCK_MAXN)
1240 { /* acquire locks and thereby shortcut lazy initialization later on */
1241 int ntry = 0, n;
1242 do {
1243 for (i = 0, n = 0; i < internal_reglock_count; ++i) {
1244 if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) == LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, &internal_reglock[i].state)) ++n;
1245 }
1246 ntry += (0 == n ? 1 : 0);
1247 } while (n < internal_reglock_count && ntry < LIBXSMM_CLEANUP_NTRY);
1248 }
1249 # elif !defined(LIBXSMM_UNIFY_LOCKS)
1250 LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, internal_reglock_ptr);
1251 # endif
1252 #endif
1253 regptr = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_RELAXED);
1254 registry = (libxsmm_code_pointer*)regptr;
1255 if (NULL != registry) {
1256 libxsmm_descriptor *const registry_keys = internal_registry_keys;
1257 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1258 internal_cache_type *const cache_buffer = internal_cache_buffer;
1259 #endif
1260 unsigned int rest = 0, errors = 0;
1261 #if defined(LIBXSMM_TRACE)
1262 i = libxsmm_trace_finalize();
1263 if (EXIT_SUCCESS != i && 0 != libxsmm_verbosity) { /* library code is expected to be mute */
1264 fprintf(stderr, "LIBXSMM ERROR: failed to finalize trace (error #%i)!\n", i);
1265 }
1266 #endif
1267 #if defined(LIBXSMM_PERF)
1268 libxsmm_perf_finalize();
1269 #endif
1270 libxsmm_xcopy_finalize();
1271 libxsmm_gemm_finalize();
1272 libxsmm_dnn_finalize();
1273 /* coverity[check_return] */
1274 LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_RELAXED); /* invalidate code cache (TLS) */
1275 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1276 internal_cache_buffer = NULL;
1277 #endif
1278 internal_registry_keys = NULL; /* make registry keys unavailable */
1279 LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE_ZERO, LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_SEQ_CST);
1280 internal_registry_nbytes = 0; internal_registry_nleaks = 0;
1281 for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) {
1282 /*const*/ libxsmm_code_pointer code = registry[i];
1283 if (NULL != code.ptr_const) {
1284 /* check if the registered entity is a GEMM kernel */
1285 switch (registry_keys[i].kind) {
1286 case LIBXSMM_KERNEL_KIND_MATMUL: {
1287 const libxsmm_gemm_descriptor *const desc = ®istry_keys[i].gemm.desc;
1288 if (1 < desc->m && 1 < desc->n) {
1289 const unsigned int njit = (0 == (LIBXSMM_CODE_STATIC & code.uval) ? 1 : 0);
1290 const unsigned int nsta = (0 != (LIBXSMM_CODE_STATIC & code.uval) ? 1 : 0);
1291 /* count whether kernel is static or JIT-code */
1292 internal_update_mmstatistic(desc, 0, 0, njit, nsta);
1293 }
1294 else {
1295 ++internal_statistic_num_gemv;
1296 }
1297 ++rest;
1298 } break;
1299 case LIBXSMM_KERNEL_KIND_MCOPY: {
1300 ++internal_statistic_num_mcopy;
1301 } break;
1302 case LIBXSMM_KERNEL_KIND_MELTW: {
1303 ++internal_statistic_num_meltw;
1304 } break;
1305 case LIBXSMM_KERNEL_KIND_TRANS: {
1306 ++internal_statistic_num_tcopy;
1307 } break;
1308 case LIBXSMM_KERNEL_KIND_TRSM: {
1309 ++internal_statistic_num_trsm;
1310 } break;
1311 case LIBXSMM_KERNEL_KIND_TRMM: {
1312 ++internal_statistic_num_trmm;
1313 } break;
1314 case LIBXSMM_KERNEL_KIND_USER: {
1315 ++internal_statistic_num_user;
1316 } break;
1317 default: if (LIBXSMM_KERNEL_UNREGISTERED <= registry_keys[i].kind) {
1318 ++errors;
1319 }
1320 else {
1321 ++rest;
1322 }
1323 }
1324 if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
1325 if (0 != errors) {
1326 fprintf(stderr, "LIBXSMM ERROR: code registry is corrupted!\n");
1327 }
1328 if (LIBXSMM_CAPACITY_REGISTRY == (rest + errors + internal_statistic_num_gemv +
1329 internal_statistic_num_mcopy + internal_statistic_num_meltw +
1330 internal_statistic_num_tcopy + internal_statistic_num_trsm +
1331 internal_statistic_num_trmm + internal_statistic_num_user))
1332 {
1333 fprintf(stderr, "LIBXSMM WARNING: code registry was exhausted!\n");
1334 }
1335 }
1336 if (0 == (LIBXSMM_CODE_STATIC & code.uval)) { /* check for allocated/generated JIT-code */
1337 void* buffer = NULL;
1338 size_t size = 0;
1339 #if defined(LIBXSMM_HASH_COLLISION)
1340 code.uval &= ~LIBXSMM_HASH_COLLISION; /* clear collision flag */
1341 #endif
1342 if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(code.ptr_const, &size, NULL/*flags*/, &buffer)) {
1343 #if !defined(NDEBUG)
1344 registry[i].ptr = NULL;
1345 #endif
1346 libxsmm_xfree(code.ptr_const, 0/*no check*/);
1347 /* round-up size (it is fine to assume 4 KB pages since it is likely more accurate than not rounding up) */
1348 internal_registry_nbytes += LIBXSMM_UP2(size + (((char*)code.ptr_const) - (char*)buffer), LIBXSMM_PAGE_MINSIZE);
1349 }
1350 else ++internal_registry_nleaks;
1351 }
1352 }
1353 }
1354 /* release buffers (registry, keys, cache) */
1355 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1356 libxsmm_xfree(cache_buffer, 0/*no check*/);
1357 #endif
1358 libxsmm_xfree(registry_keys, 0/*no check*/);
1359 libxsmm_xfree(registry, 0/*no check*/);
1360 }
1361 #if (0 != LIBXSMM_SYNC) /* LIBXSMM_LOCK_RELEASE, but no LIBXSMM_LOCK_DESTROY */
1362 # if (1 < INTERNAL_REGLOCK_MAXN)
1363 for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[i].state);
1364 # elif !defined(LIBXSMM_UNIFY_LOCKS)
1365 LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr);
1366 # endif
1367 LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, &libxsmm_lock_global);
1368 /* coverity[check_return] */
1369 LIBXSMM_TLS_DESTROY(libxsmm_tlskey);
1370 #endif
1371 }
1372 }
1373
1374
libxsmm_sink(LIBXSMM_VARIADIC)1375 LIBXSMM_API void libxsmm_sink(LIBXSMM_VARIADIC)
1376 {
1377 /* does nothing else but sinking given arguments */
1378 }
1379
1380
libxsmm_get_target_archid(void)1381 LIBXSMM_API int libxsmm_get_target_archid(void)
1382 {
1383 LIBXSMM_INIT
1384 #if !defined(__MIC__)
1385 return libxsmm_target_archid;
1386 #else /* no JIT support */
1387 return LIBXSMM_MIN(libxsmm_target_archid, LIBXSMM_X86_SSE3);
1388 #endif
1389 }
1390
1391
libxsmm_set_target_archid(int id)1392 LIBXSMM_API void libxsmm_set_target_archid(int id)
1393 {
1394 int target_archid = LIBXSMM_TARGET_ARCH_UNKNOWN;
1395 switch (id) {
1396 case LIBXSMM_X86_AVX512_CPX:
1397 case LIBXSMM_X86_AVX512_CLX:
1398 case LIBXSMM_X86_AVX512_CORE:
1399 case LIBXSMM_X86_AVX512_KNM:
1400 case LIBXSMM_X86_AVX512_MIC:
1401 case LIBXSMM_X86_AVX512:
1402 case LIBXSMM_X86_AVX2:
1403 case LIBXSMM_X86_AVX:
1404 case LIBXSMM_X86_SSE4:
1405 case LIBXSMM_X86_SSE3:
1406 case LIBXSMM_TARGET_ARCH_GENERIC: {
1407 target_archid = id;
1408 } break;
1409 default: if (LIBXSMM_X86_GENERIC <= id) {
1410 target_archid = LIBXSMM_X86_GENERIC;
1411 }
1412 else {
1413 target_archid = libxsmm_cpuid();
1414 }
1415 }
1416 LIBXSMM_ATOMIC_STORE(&libxsmm_target_archid, target_archid, LIBXSMM_ATOMIC_RELAXED);
1417 if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
1418 const int cpuid = libxsmm_cpuid();
1419 if (cpuid < target_archid) {
1420 const char *const target_arch = libxsmm_cpuid_name(target_archid);
1421 fprintf(stderr, "LIBXSMM WARNING: \"%s\" code may fail to run on \"%s\"!\n",
1422 target_arch, libxsmm_cpuid_name(cpuid));
1423 }
1424 }
1425 }
1426
1427
libxsmm_get_target_arch(void)1428 LIBXSMM_API const char* libxsmm_get_target_arch(void)
1429 {
1430 LIBXSMM_INIT
1431 return libxsmm_cpuid_name(libxsmm_target_archid);
1432 }
1433
1434
1435 /* function serves as a helper for implementing the Fortran interface */
1436 LIBXSMM_API const char* libxsmmf_get_target_arch(int* length);
libxsmmf_get_target_arch(int * length)1437 LIBXSMM_API const char* libxsmmf_get_target_arch(int* length)
1438 {
1439 const char *const arch = libxsmm_get_target_arch();
1440 /* valid here since function is not in the public interface */
1441 LIBXSMM_ASSERT(NULL != arch && 0 != length);
1442 *length = (int)strlen(arch);
1443 return arch;
1444 }
1445
1446
libxsmm_set_target_arch(const char * arch)1447 LIBXSMM_API void libxsmm_set_target_arch(const char* arch)
1448 {
1449 const int cpuid = libxsmm_cpuid();
1450 int target_archid;
1451 if (NULL != arch && 0 != *arch) {
1452 const int jit = atoi(arch);
1453 if (0 == strcmp("0", arch)) {
1454 target_archid = LIBXSMM_X86_SSE3;
1455 }
1456 else if (0 < jit) {
1457 target_archid = LIBXSMM_X86_GENERIC + jit;
1458 }
1459 else if (0 == strcmp("cpx", arch)) {
1460 target_archid = LIBXSMM_X86_AVX512_CPX;
1461 }
1462 else if (0 == strcmp("clx", arch)) {
1463 target_archid = LIBXSMM_X86_AVX512_CLX;
1464 }
1465 else if (0 == strcmp("skx", arch) || 0 == strcmp("skl", arch)
1466 /* "avx3"/"avx512" previously enabled LIBXSMM_X86_AVX512 */
1467 || 0 == strcmp("avx3", arch) || 0 == strcmp("avx512", arch))
1468 {
1469 target_archid = LIBXSMM_X86_AVX512_CORE;
1470 }
1471 else if (0 == strcmp("knm", arch)) {
1472 target_archid = LIBXSMM_X86_AVX512_KNM;
1473 }
1474 else if (0 == strcmp("knl", arch) || 0 == strcmp("mic", arch)) {
1475 target_archid = LIBXSMM_X86_AVX512_MIC;
1476 }
1477 else if (0 == strcmp("hsw", arch) || 0 == strcmp("avx2", arch)) {
1478 target_archid = LIBXSMM_X86_AVX2;
1479 }
1480 else if (0 == strcmp("snb", arch) || 0 == strcmp("avx", arch)) {
1481 target_archid = LIBXSMM_X86_AVX;
1482 }
1483 else if (0 == strcmp("wsm", arch) || 0 == strcmp("nhm", arch) || 0 == strcmp("sse4", arch)
1484 || 0 == strcmp("sse4_1", arch) || 0 == strcmp("sse4.1", arch)
1485 || 0 == strcmp("sse4_2", arch) || 0 == strcmp("sse4.2", arch))
1486 {
1487 target_archid = LIBXSMM_X86_SSE4;
1488 }
1489 else if (0 == strcmp("sse", arch) || 0 == strcmp("sse3", arch)
1490 || 0 == strcmp("ssse3", arch) || 0 == strcmp("ssse", arch))
1491 {
1492 target_archid = LIBXSMM_X86_SSE3;
1493 }
1494 else if (0 == strcmp("x86", arch) || 0 == strcmp("x64", arch) || 0 == strcmp("sse2", arch)) {
1495 target_archid = LIBXSMM_X86_GENERIC;
1496 }
1497 else if (0 == strcmp("generic", arch) || 0 == strcmp("none", arch)) {
1498 target_archid = LIBXSMM_TARGET_ARCH_GENERIC;
1499 }
1500 else {
1501 target_archid = cpuid;
1502 }
1503 }
1504 else {
1505 target_archid = cpuid;
1506 }
1507 if (cpuid < target_archid) { /* warn about code path if beyond CPUID */
1508 static int error_once = 0;
1509 if ( 0 != libxsmm_verbosity /* library code is expected to be mute */
1510 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
1511 {
1512 const char *const target_arch = libxsmm_cpuid_name(target_archid);
1513 fprintf(stderr, "LIBXSMM WARNING: \"%s\" code will fail to run on \"%s\"!\n",
1514 target_arch, libxsmm_cpuid_name(cpuid));
1515 }
1516 #if 0 /* limit code path to confirmed features */
1517 target_archid = cpuid;
1518 #endif
1519 }
1520 LIBXSMM_ATOMIC_STORE(&libxsmm_target_archid, target_archid, LIBXSMM_ATOMIC_RELAXED);
1521 }
1522
1523
libxsmm_get_verbosity(void)1524 LIBXSMM_API int libxsmm_get_verbosity(void)
1525 {
1526 LIBXSMM_INIT
1527 return libxsmm_verbosity;
1528 }
1529
1530
libxsmm_set_verbosity(int level)1531 LIBXSMM_API void libxsmm_set_verbosity(int level)
1532 {
1533 LIBXSMM_INIT
1534 LIBXSMM_ATOMIC_STORE(&libxsmm_verbosity, level, LIBXSMM_ATOMIC_RELAXED);
1535 }
1536
1537
libxsmm_get_gemm_auto_prefetch(void)1538 LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_auto_prefetch(void)
1539 {
1540 return (libxsmm_gemm_prefetch_type)libxsmm_gemm_auto_prefetch;
1541 }
1542
1543
libxsmm_set_gemm_auto_prefetch(libxsmm_gemm_prefetch_type strategy)1544 LIBXSMM_API void libxsmm_set_gemm_auto_prefetch(libxsmm_gemm_prefetch_type strategy)
1545 {
1546 if (0 == internal_gemm_auto_prefetch_locked) { /* LIBXSMM_GEMM_PREFETCH environment takes precedence */
1547 LIBXSMM_ATOMIC_STORE(&libxsmm_gemm_auto_prefetch_default, strategy, LIBXSMM_ATOMIC_RELAXED);
1548 LIBXSMM_ATOMIC_STORE(&libxsmm_gemm_auto_prefetch, strategy, LIBXSMM_ATOMIC_RELAXED);
1549 }
1550 }
1551
1552
libxsmm_typesize(libxsmm_datatype datatype)1553 LIBXSMM_API unsigned char libxsmm_typesize(libxsmm_datatype datatype)
1554 {
1555 switch (datatype) {
1556 case LIBXSMM_DATATYPE_F64: return 8;
1557 case LIBXSMM_DATATYPE_F32: return 4;
1558 case LIBXSMM_DATATYPE_BF16: return 2;
1559 case LIBXSMM_DATATYPE_I64: return 8;
1560 case LIBXSMM_DATATYPE_I32: return 4;
1561 case LIBXSMM_DATATYPE_I16: return 2;
1562 case LIBXSMM_DATATYPE_I8: return 1;
1563 case LIBXSMM_DATATYPE_UNSUPPORTED: {
1564 static int error_once = 0;
1565 if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) {
1566 fprintf(stderr, "LIBXSMM ERROR: unsupported data type!\n");
1567 }
1568 } break;
1569 }
1570 LIBXSMM_ASSERT_MSG(0, "unsupported data type");
1571 return 1; /* avoid to return 0 to avoid div-by-zero in static analysis of depending code */
1572 }
1573
1574
libxsmm_dvalue(libxsmm_datatype datatype,const void * value,double * dvalue)1575 LIBXSMM_API_INTERN int libxsmm_dvalue(libxsmm_datatype datatype, const void* value, double* dvalue)
1576 {
1577 int result = EXIT_SUCCESS;
1578 if (NULL != value && NULL != dvalue) {
1579 switch (datatype) {
1580 case LIBXSMM_DATATYPE_F64: *dvalue = (*(const double*)value); break;
1581 case LIBXSMM_DATATYPE_F32: *dvalue = (double)(*(const float *)value); break;
1582 case LIBXSMM_DATATYPE_I32: *dvalue = (double)(*(const int *)value); break;
1583 case LIBXSMM_DATATYPE_I16: *dvalue = (double)(*(const short *)value); break;
1584 case LIBXSMM_DATATYPE_I8: *dvalue = (double)(*(const char *)value); break;
1585 default: result = EXIT_FAILURE;
1586 }
1587 }
1588 else {
1589 result = EXIT_FAILURE;
1590 }
1591 return result;
1592 }
1593
1594
libxsmm_typename(libxsmm_datatype datatype)1595 LIBXSMM_API_INTERN const char* libxsmm_typename(libxsmm_datatype datatype)
1596 {
1597 switch (datatype) {
1598 case LIBXSMM_DATATYPE_F64: return "f64";
1599 case LIBXSMM_DATATYPE_F32: return "f32";
1600 case LIBXSMM_DATATYPE_BF16: return "bf16";
1601 case LIBXSMM_DATATYPE_I64: return "i64";
1602 case LIBXSMM_DATATYPE_I32: return "i32";
1603 case LIBXSMM_DATATYPE_I16: return "i16";
1604 case LIBXSMM_DATATYPE_I8: return "i8";
1605 default: {
1606 if (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP(datatype) &&
1607 LIBXSMM_GEMM_PRECISION_I32 == LIBXSMM_GETENUM_OUT(datatype))
1608 {
1609 return "i16i32";
1610 }
1611 else if (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP(datatype) &&
1612 LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT(datatype))
1613 {
1614 return "i16f32";
1615 }
1616 else if (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP(datatype) &&
1617 LIBXSMM_GEMM_PRECISION_I32 == LIBXSMM_GETENUM_OUT(datatype))
1618 {
1619 return "i8i32";
1620 }
1621 else if (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP(datatype) &&
1622 LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT(datatype))
1623 {
1624 return "bf16f32";
1625 }
1626 else {
1627 return "void";
1628 }
1629 }
1630 }
1631 }
1632
1633
internal_get_typesize_string(char buffer[4],int buffer_size,size_t typesize)1634 LIBXSMM_API_INLINE void internal_get_typesize_string(char buffer[4], int buffer_size, size_t typesize)
1635 {
1636 LIBXSMM_ASSERT(256 > typesize && 4 <= buffer_size);
1637 if (10 > typesize) {
1638 buffer[0] = (char)('0' + typesize);
1639 buffer[1] = 0;
1640 }
1641 else {
1642 LIBXSMM_SNPRINTF(buffer, buffer_size, "%i", (int)typesize);
1643 }
1644 }
1645
1646
/**
 * Builds (JIT-generates) the kernel described by the given request and commits
 * the result to *code.
 *
 * Outline (if __MIC__ is not defined):
 * 1. The generator matching request->kind emits the kernel into a temporary buffer
 *    on the stack (LIBXSMM_CODE_MAXSIZE Bytes). A kernel name (jit_name) is only
 *    composed if LIBXSMM_VTUNE is defined or if libxsmm_verbosity is negative.
 * 2. If the generator raised no error and emitted code, a buffer is allocated by
 *    libxsmm_xmalloc (LIBXSMM_MALLOC_FLAG_RWX), the code is copied, and the buffer is
 *    re-attributed by libxsmm_malloc_attrib (LIBXSMM_MALLOC_FLAG_X) before code->ptr
 *    is set. If re-attributing fails, the buffer is released and code->ptr is not set.
 * 3. For LIBXSMM_BUILD_KIND_USER (with non-NULL descriptor pointer), no code is
 *    generated: request->user_size Bytes are allocated (LIBXSMM_MALLOC_FLAG_PRIVATE)
 *    and stored into code->ptr, or code->ptr is set to NULL if user_size is zero.
 *
 * @param request  Kind of kernel and the respective descriptor (must not be NULL; asserted).
 * @param regindex Stored as extra.registered, which is handed to libxsmm_xmalloc
 *                 together with the buffer (along with extra.nflops).
 * @param code     Receives the result (must not be NULL, and code->ptr_const is
 *                 expected to be NULL on entry; asserted).
 * @return EXIT_SUCCESS, or the error of the failed step: the result of libxsmm_xmalloc or
 *         libxsmm_malloc_attrib, the generator's last_error, or EXIT_FAILURE if nothing
 *         was generated (e.g., unsupported precision or type size for the requested kind).
 *         If __MIC__ is defined, nothing is built and EXIT_FAILURE is returned if
 *         libxsmm_target_archid is at least LIBXSMM_X86_SSE3 (EXIT_SUCCESS otherwise).
 */
LIBXSMM_API_INTERN int libxsmm_build(const libxsmm_build_request* request, unsigned int regindex, libxsmm_code_pointer* code)
{
  int result = EXIT_SUCCESS;
#if !defined(__MIC__)
  const char * /*const*/ target_arch = libxsmm_cpuid_name(libxsmm_target_archid);
  /* large enough temporary buffer for generated code */
  char jit_buffer[LIBXSMM_CODE_MAXSIZE], jit_name[256] = { 0 };
  libxsmm_generated_code generated_code;
  libxsmm_kernel_xinfo extra;

  LIBXSMM_MEMZERO127(&generated_code);
  generated_code.generated_code = jit_buffer;
  generated_code.buffer_size = sizeof(jit_buffer);
  /* setup code generation */
  generated_code.arch = libxsmm_target_archid;
  generated_code.code_type = 2;

# if !defined(NDEBUG) /* should not be needed (all members will be initialized below) */
  LIBXSMM_MEMZERO127(&extra);
# endif
  extra.registered = regindex;
  extra.nflops = 0;

  LIBXSMM_ASSERT(NULL != generated_code.generated_code || 0 == generated_code.buffer_size);
  LIBXSMM_ASSERT(NULL != request && 0 != libxsmm_target_archid);
  LIBXSMM_ASSERT(NULL != code && NULL == code->ptr_const);

  switch (request->kind) { /* generate kernel */
    case LIBXSMM_BUILD_KIND_GEMM: { /* small MxM kernel */
      LIBXSMM_ASSERT(NULL != request->descriptor.gemm);
# if 0 /* dummy kernel for an empty shape is desired */
      if (0 < request->descriptor.gemm->m && 0 < request->descriptor.gemm->n && 0 < request->descriptor.gemm->k &&
        0 < request->descriptor.gemm->lda && 0 < request->descriptor.gemm->ldb && 0 < request->descriptor.gemm->ldc)
# endif
      {
        const unsigned int m = request->descriptor.gemm->m, n = request->descriptor.gemm->n, k = request->descriptor.gemm->k;
        extra.nflops = 2 * m * n * k;
# if !defined(LIBXSMM_DENY_RETARGET) /* disable: ECFLAGS=-DLIBXSMM_DENY_RETARGET */
        /* F64/F32 kernels with a tiny operand (at most 16 elements) are generated for AVX2 if the target is beyond AVX2 */
        if (LIBXSMM_X86_AVX2 < libxsmm_target_archid &&
          (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.gemm->datatype) ||
           LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.gemm->datatype)) &&
          (16 >= (m * k) || 16 >= (k * n) || 16 >= (m * n)))
        {
          /* TODO: shall we update variable "target_arch" (name)? */
          generated_code.arch = LIBXSMM_X86_AVX2;
        }
# endif
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_gemm_kernel, &generated_code, request->descriptor.gemm);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.gemm->prefetch);
          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.gemm->datatype);
          int typesigns = 0, br = 0;
          /* query batch reduce variant */
          /* NOTE(review): testing "> 1" (rather than "!= 0") presumably relies on these flag values being greater than one - confirm */
          if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS & request->descriptor.gemm->flags) > 1 ) {
            br = 1;
          } else if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET & request->descriptor.gemm->flags) > 1 ) {
            br = 2;
          } else if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE & request->descriptor.gemm->flags) > 1 ) {
            br = 3;
          } else {
            br = 0;
          }
          /* query A/B sign combinations */
          /* NOTE(review): if LIBXSMM_GEMM_FLAG_AB_UNSIGNED combines the A and B bits, then the first branch
           * already matches and "typesigns = 3" is unreachable (only affects the kernel name) - confirm */
          if ( (LIBXSMM_GEMM_FLAG_A_UNSIGNED & request->descriptor.gemm->flags) > 1 ) {
            typesigns = 1;
          } else if ( (LIBXSMM_GEMM_FLAG_B_UNSIGNED & request->descriptor.gemm->flags) > 1 ) {
            typesigns = 2;
          } else if ( (LIBXSMM_GEMM_FLAG_AB_UNSIGNED & request->descriptor.gemm->flags) > 1 ) {
            typesigns = 3;
          } else {
            typesigns = 0;
          }
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_a%i_b%i_p%i_br%i_uh%u_si%i.mxm", target_arch, tname,
            0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.gemm->flags) ? 'n' : 't',
            0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.gemm->flags) ? 'n' : 't', m, n, k,
            request->descriptor.gemm->lda, request->descriptor.gemm->ldb, request->descriptor.gemm->ldc,
            /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.gemm->flags) ? 0 : */1,
            0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.gemm->flags) ? 0 : 1, uid, br, (unsigned int)request->descriptor.gemm->c3, typesigns);
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_SRSOA: { /* sparse SOA kernel, CSR format */
      LIBXSMM_ASSERT(NULL != request->descriptor.srsoa && 0 != request->descriptor.srsoa->gemm);
      LIBXSMM_ASSERT(NULL != request->descriptor.srsoa->row_ptr && 0 != request->descriptor.srsoa->column_idx && 0 != request->descriptor.srsoa->values);
      /* only floating point */
      if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.srsoa->gemm->datatype) ||
          LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.srsoa->gemm->datatype))
      {
        /* lda == 0 selects row_ptr[m] (and n as factor); otherwise row_ptr[k] (and m as factor) is used */
        const unsigned int nnz = (request->descriptor.srsoa->gemm->lda == 0) ?
          request->descriptor.srsoa->row_ptr[request->descriptor.srsoa->gemm->m] : request->descriptor.srsoa->row_ptr[request->descriptor.srsoa->gemm->k];
        const unsigned int simdw = (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.srsoa->gemm->datatype)) ?
          libxsmm_cpuid_vlen32(libxsmm_target_archid)/2 : libxsmm_cpuid_vlen32(libxsmm_target_archid);
        const unsigned int gemm_factor = (request->descriptor.srsoa->gemm->lda == 0) ? request->descriptor.srsoa->gemm->n : request->descriptor.srsoa->gemm->m;
        extra.nflops = 2 * nnz * gemm_factor * simdw;
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_spgemm_csr_soa_kernel, &generated_code, request->descriptor.srsoa->gemm, target_arch,
          request->descriptor.srsoa->row_ptr, request->descriptor.srsoa->column_idx, request->descriptor.srsoa->values, request->descriptor.srsoa->packed_width);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.srsoa->gemm->prefetch);
          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.srsoa->gemm->datatype);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i_nnz%u.srsoa", target_arch, tname,
            0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.srsoa->gemm->flags) ? 'n' : 't',
            0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.srsoa->gemm->flags) ? 'n' : 't',
            request->descriptor.srsoa->gemm->m, request->descriptor.srsoa->gemm->n, request->descriptor.srsoa->gemm->k,
            request->descriptor.srsoa->gemm->lda, request->descriptor.srsoa->gemm->ldb, request->descriptor.srsoa->gemm->ldc,
            request->descriptor.srsoa->packed_width,
            /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.srsoa->gemm->flags) ? 0 : */1,
            0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.srsoa->gemm->flags) ? 0 : 1,
            uid, nnz);
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_SCSOA: { /* sparse SOA kernel, CSC format */
      LIBXSMM_ASSERT(NULL != request->descriptor.scsoa && 0 != request->descriptor.scsoa->gemm);
      LIBXSMM_ASSERT(NULL != request->descriptor.scsoa->row_idx && 0 != request->descriptor.scsoa->column_ptr && 0 != request->descriptor.scsoa->values);
      /* only floating point */
      if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.scsoa->gemm->datatype) ||
          LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.scsoa->gemm->datatype))
      {
        /* lda == 0 selects column_ptr[k] (and n as factor); otherwise column_ptr[n] (and m as factor) is used */
        const unsigned int nnz = (request->descriptor.scsoa->gemm->lda == 0) ?
          request->descriptor.scsoa->column_ptr[request->descriptor.scsoa->gemm->k] : request->descriptor.scsoa->column_ptr[request->descriptor.scsoa->gemm->n];
        const unsigned int simdw = (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.scsoa->gemm->datatype)) ?
          libxsmm_cpuid_vlen32(libxsmm_target_archid)/2 : libxsmm_cpuid_vlen32(libxsmm_target_archid);
        const unsigned int gemm_factor = (request->descriptor.scsoa->gemm->lda == 0) ? request->descriptor.scsoa->gemm->n : request->descriptor.scsoa->gemm->m;
        extra.nflops = 2 * nnz * gemm_factor * simdw;
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_spgemm_csc_soa_kernel, &generated_code, request->descriptor.scsoa->gemm, target_arch,
          request->descriptor.scsoa->row_idx, request->descriptor.scsoa->column_ptr, request->descriptor.scsoa->values, request->descriptor.scsoa->packed_width);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.scsoa->gemm->prefetch);
          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.scsoa->gemm->datatype);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i_nnz%u.scsoa", target_arch, tname,
            0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.scsoa->gemm->flags) ? 'n' : 't',
            0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.scsoa->gemm->flags) ? 'n' : 't',
            request->descriptor.scsoa->gemm->m, request->descriptor.scsoa->gemm->n, request->descriptor.scsoa->gemm->k,
            request->descriptor.scsoa->gemm->lda, request->descriptor.scsoa->gemm->ldb, request->descriptor.scsoa->gemm->ldc,
            request->descriptor.scsoa->packed_width,
            /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.scsoa->gemm->flags) ? 0 : */1,
            0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.scsoa->gemm->flags) ? 0 : 1,
            uid, nnz);
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_PGEMMRMAC: { /* packed GEMM, B regular matrix, row-major */
      LIBXSMM_ASSERT(NULL != request->descriptor.pgemmacrm && 0 != request->descriptor.pgemmacrm->gemm);
      /* only floating point */
      if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmacrm->gemm->datatype) ||
          LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmacrm->gemm->datatype))
      {
        extra.nflops = 2 * request->descriptor.pgemmacrm->packed_width * request->descriptor.pgemmacrm->gemm->m * request->descriptor.pgemmacrm->gemm->n * request->descriptor.pgemmacrm->gemm->k;
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_packed_gemm_ac_rm, &generated_code, request->descriptor.pgemmacrm->gemm, request->descriptor.pgemmacrm->packed_width, target_arch);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.pgemmacrm->gemm->prefetch);
          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.pgemmacrm->gemm->datatype);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i.pgemmacrm", target_arch, tname,
            0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.pgemmacrm->gemm->flags) ? 'n' : 't',
            0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.pgemmacrm->gemm->flags) ? 'n' : 't',
            request->descriptor.pgemmacrm->gemm->m, request->descriptor.pgemmacrm->gemm->n, request->descriptor.pgemmacrm->gemm->k,
            request->descriptor.pgemmacrm->gemm->lda, request->descriptor.pgemmacrm->gemm->ldb, request->descriptor.pgemmacrm->gemm->ldc,
            request->descriptor.pgemmacrm->packed_width,
            /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.pgemmacrm->gemm->flags) ? 0 : */1,
            0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.pgemmacrm->gemm->flags) ? 0 : 1,
            uid);
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_PGEMMRMBC: { /* packed GEMM, A regular matrix, row-major */
      LIBXSMM_ASSERT(NULL != request->descriptor.pgemmbcrm && 0 != request->descriptor.pgemmbcrm->gemm);
      /* only floating point */
      if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmbcrm->gemm->datatype) ||
          LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmbcrm->gemm->datatype))
      {
        extra.nflops = 2 * request->descriptor.pgemmbcrm->packed_width * request->descriptor.pgemmbcrm->gemm->m * request->descriptor.pgemmbcrm->gemm->n * request->descriptor.pgemmbcrm->gemm->k;
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_packed_gemm_bc_rm, &generated_code, request->descriptor.pgemmbcrm->gemm, request->descriptor.pgemmbcrm->packed_width, target_arch);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.pgemmbcrm->gemm->prefetch);
          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.pgemmbcrm->gemm->datatype);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i.pgemmbcrm", target_arch, tname,
            0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.pgemmbcrm->gemm->flags) ? 'n' : 't',
            0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.pgemmbcrm->gemm->flags) ? 'n' : 't',
            request->descriptor.pgemmbcrm->gemm->m, request->descriptor.pgemmbcrm->gemm->n, request->descriptor.pgemmbcrm->gemm->k,
            request->descriptor.pgemmbcrm->gemm->lda, request->descriptor.pgemmbcrm->gemm->ldb, request->descriptor.pgemmbcrm->gemm->ldc,
            request->descriptor.pgemmbcrm->packed_width,
            /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.pgemmbcrm->gemm->flags) ? 0 : */1,
            0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.pgemmbcrm->gemm->flags) ? 0 : 1,
            uid);
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_SREG: { /* sparse register kernel */
      LIBXSMM_ASSERT(NULL != request->descriptor.sreg && 0 != request->descriptor.sreg->gemm);
      LIBXSMM_ASSERT(NULL != request->descriptor.sreg->row_ptr && 0 != request->descriptor.sreg->column_idx && 0 != request->descriptor.sreg->values);
      /* only floating point */
      /* NOTE(review): F32 is admitted here, but the values are handed over as "const double*" and the
       * flop count always uses half of the 32-bit vector length - presumably only F64 is intended; confirm */
      if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.sreg->gemm->datatype) ||
          LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.sreg->gemm->datatype))
      {
        const unsigned int nnz = request->descriptor.sreg->row_ptr[request->descriptor.sreg->gemm->m];
        extra.nflops = 2 * libxsmm_cpuid_vlen32(libxsmm_target_archid)/2 * request->descriptor.sreg->gemm->n * nnz;
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_spgemm_csr_reg_kernel, &generated_code, request->descriptor.sreg->gemm, target_arch,
          request->descriptor.sreg->row_ptr, request->descriptor.sreg->column_idx,
          (const double*)request->descriptor.sreg->values);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.sreg->gemm->prefetch);
          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.sreg->gemm->datatype);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_a%i_b%i_p%i.sreg", target_arch, tname,
            0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.sreg->gemm->flags) ? 'n' : 't',
            0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.sreg->gemm->flags) ? 'n' : 't',
            request->descriptor.sreg->gemm->m, request->descriptor.sreg->gemm->n, request->descriptor.sreg->gemm->k,
            request->descriptor.sreg->gemm->lda, request->descriptor.sreg->gemm->ldb, request->descriptor.sreg->gemm->ldc,
            /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.sreg->gemm->flags) ? 0 : */1,
            0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.sreg->gemm->flags) ? 0 : 1,
            uid);
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_MCOPY: { /* matcopy kernel */
      LIBXSMM_ASSERT(NULL != request->descriptor.mcopy);
# if 0 /* TODO: backend supports typesize <= 4, but kernels for typesize < 4 are incorrect */
      if (4 == request->descriptor.mcopy->typesize)
# endif
      {
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_matcopy_kernel, &generated_code, request->descriptor.mcopy, target_arch);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          char tsizename[4];
          internal_get_typesize_string(tsizename, sizeof(tsizename), request->descriptor.mcopy->typesize);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%ux%u_%ux%u_p%u.mcopy", target_arch, tsizename,
            request->descriptor.mcopy->m, request->descriptor.mcopy->n, request->descriptor.mcopy->ldi, request->descriptor.mcopy->ldo,
            (unsigned int)request->descriptor.mcopy->prefetch);
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_MELTW: { /* matrix element-wise kernel (libxsmm_generator_mateltwise_kernel) */
      LIBXSMM_ASSERT(NULL != request->descriptor.meltw);
# if 0 /* TODO: backend supports typesize <= 4, but kernels for typesize < 4 are incorrect */
      if (4 == request->descriptor.meltw->typesize)
# endif
      {
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_mateltwise_kernel, &generated_code, request->descriptor.meltw);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          char tsizename[4];
          /* NOTE(review): the "datatype" member (not a size in Bytes) is formatted as "tsize" - presumably the raw enumeration value; confirm intent */
          internal_get_typesize_string(tsizename, sizeof(tsizename), request->descriptor.meltw->datatype);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%ux%u_%ux%u_opcode%u_flags%u.meltw", target_arch, tsizename,
            request->descriptor.meltw->m, request->descriptor.meltw->n, request->descriptor.meltw->ldi, request->descriptor.meltw->ldo,
            (unsigned int)request->descriptor.meltw->operation, (unsigned int)request->descriptor.meltw->flags);
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_TRANS: { /* transpose kernel */
      LIBXSMM_ASSERT(NULL != request->descriptor.trans);
      /* only element sizes of 4 and 8 Bytes are generated */
      if (4 == request->descriptor.trans->typesize || 8 == request->descriptor.trans->typesize) {
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_transpose_kernel, &generated_code, request->descriptor.trans, libxsmm_target_archid);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          char tsizename[4];
          internal_get_typesize_string(tsizename, sizeof(tsizename), request->descriptor.trans->typesize);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%ux%u_%u.trans", target_arch, tsizename,
            request->descriptor.trans->m, request->descriptor.trans->n, request->descriptor.trans->ldo);
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_PGEMM: { /* compact P/GEMM-kernel (packed) */
      unsigned int tsize;
      LIBXSMM_ASSERT(NULL != request->descriptor.pgemm);
      tsize = (unsigned int)request->descriptor.pgemm->typesize;
      if (4 == tsize || 8 == tsize) {
        extra.nflops = 0; /* TODO */
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_pgemm_kernel, &generated_code, request->descriptor.pgemm, libxsmm_target_archid);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          char tsizename[4];
          internal_get_typesize_string(tsizename, sizeof(tsizename), tsize);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c%c%c_%ux%ux%u_%u_%u_%u_%i.pgemm", target_arch, tsizename,
            request->descriptor.pgemm->transa, request->descriptor.pgemm->transb, request->descriptor.pgemm->layout,
            request->descriptor.pgemm->m, request->descriptor.pgemm->n, request->descriptor.pgemm->k,
            request->descriptor.pgemm->lda, request->descriptor.pgemm->ldb, request->descriptor.pgemm->ldc,
            (int)request->descriptor.pgemm->alpha_val);
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_GETRF: { /* compact GETRF kernel (packed) */
      unsigned int tsize;
      LIBXSMM_ASSERT(NULL != request->descriptor.getrf);
      tsize = (unsigned int)request->descriptor.getrf->typesize;
      if (4 == tsize || 8 == tsize) {
        extra.nflops = 0; /* TODO */
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_getrf_kernel, &generated_code, request->descriptor.getrf, libxsmm_target_archid);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          char tsizename[4];
          internal_get_typesize_string(tsizename, sizeof(tsizename), tsize);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c_%ux%u_%u.getrf", target_arch, tsizename,
            request->descriptor.getrf->layout, request->descriptor.getrf->m, request->descriptor.getrf->n, request->descriptor.getrf->lda);
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_TRMM: { /* compact TRMM kernel (packed) */
      unsigned int tsize;
      LIBXSMM_ASSERT(NULL != request->descriptor.trmm);
      tsize = (unsigned int)request->descriptor.trmm->typesize;
      if (4 == tsize || 8 == tsize) {
        extra.nflops = 0; /* TODO */
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_trmm_kernel, &generated_code, request->descriptor.trmm, target_arch);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          char tsizename[4];
          internal_get_typesize_string(tsizename, sizeof(tsizename), tsize);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c%c%c%c_%ux%u_%u_%u.trmm", target_arch, tsizename,
            request->descriptor.trmm->transa, request->descriptor.trmm->layout, request->descriptor.trmm->side, request->descriptor.trmm->uplo,
            request->descriptor.trmm->m, request->descriptor.trmm->n, request->descriptor.trmm->lda, request->descriptor.trmm->ldb); /* TODO: alpha */
        }
      }
    } break;
    /* unlike the other kinds, a NULL descriptor is checked at runtime here (not asserted) */
    case LIBXSMM_BUILD_KIND_TRSM: if (NULL != request->descriptor.trsm) { /* compact TRSM kernel (packed) */
      const unsigned int tsize = (unsigned int)request->descriptor.trsm->typesize;
      if (4 == tsize || 8 == tsize) {
        extra.nflops = 0; /* TODO */
        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_trsm_kernel, &generated_code, request->descriptor.trsm, target_arch);
# if !defined(LIBXSMM_VTUNE)
        if (0 > libxsmm_verbosity)
# endif
        {
          char tsizename[4];
          internal_get_typesize_string(tsizename, sizeof(tsizename), tsize);
          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c%c%c%c_%ux%u_%u_%u.trsm", target_arch, tsizename,
            request->descriptor.trsm->transa, request->descriptor.trsm->layout, request->descriptor.trsm->side, request->descriptor.trsm->uplo,
            request->descriptor.trsm->m, request->descriptor.trsm->n, request->descriptor.trsm->lda, request->descriptor.trsm->ldb); /* TODO: alpha */
        }
      }
    } break;
    case LIBXSMM_BUILD_KIND_USER: break; /* nothing to generate (user-data is handled below) */
# if !defined(NDEBUG) /* library code is expected to be mute */
    default: { /* unknown kind */
      static int error_once = 0;
      if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) {
        fprintf(stderr, "LIBXSMM ERROR: invalid build request discovered!\n");
      }
      /*result = EXIT_FAILURE;*/
    }
# endif
  }

  if  (0 == generated_code.last_error /* no error raised */
    && 0 != generated_code.code_size /*check (tcopy issue?)*/)
  {
    char* code_buffer = NULL;
    /* pass the address of code_buffer through void* (rather than casting char** to void** directly) */
    void* code_buffer_result = &code_buffer;
    LIBXSMM_ASSERT(generated_code.code_size <= LIBXSMM_CODE_MAXSIZE);
    LIBXSMM_ASSERT(NULL != generated_code.generated_code);
    /* attempt to create executable buffer */
    result = libxsmm_xmalloc((void**)code_buffer_result, generated_code.code_size, 0/*auto*/,
      /* flag must be a superset of what's populated by libxsmm_malloc_attrib */
      LIBXSMM_MALLOC_FLAG_RWX, &extra, sizeof(extra));
    if (EXIT_SUCCESS == result) { /* check for success */
      LIBXSMM_ASSERT(NULL != code_buffer);
      /* copy temporary buffer into the prepared executable buffer */
# if defined(NDEBUG)
      { int i; /* precondition: jit_buffer == generated_code.generated_code */
        for (i = 0; i < (int)generated_code.code_size; ++i) code_buffer[i] = jit_buffer[i];
      }
# else
      memcpy(code_buffer, generated_code.generated_code, generated_code.code_size);
# endif
      /* attribute/protect buffer and revoke unnecessary flags */
      result = libxsmm_malloc_attrib((void**)code_buffer_result, LIBXSMM_MALLOC_FLAG_X, jit_name);
      if (EXIT_SUCCESS == result) { /* check for success */
        code->ptr = code_buffer; /* commit buffer */
        LIBXSMM_ASSERT(NULL != code->ptr && 0 == (LIBXSMM_CODE_STATIC & code->uval));
      }
      else { /* release buffer */
        libxsmm_xfree(code_buffer, 0/*no check*/);
      }
    }
  }
  else if (request->kind == LIBXSMM_BUILD_KIND_USER && NULL != request->descriptor.ptr) { /* user-data */
    if (0 != request->user_size) {
      /* allocate directly into code->ptr (no code is copied) */
      void* user_data = &code->ptr;
      result = libxsmm_xmalloc((void**)user_data, request->user_size, 0/*auto*/,
        LIBXSMM_MALLOC_FLAG_PRIVATE, &extra, sizeof(extra));
    }
    else {
      result = EXIT_SUCCESS;
      code->ptr = NULL;
    }
  }
  else { /* generator failed, or nothing was generated (unsupported or unknown request) */
    result = (0 != generated_code.last_error ? generated_code.last_error : EXIT_FAILURE);
  }
#else /* unsupported platform */
  LIBXSMM_UNUSED(request); LIBXSMM_UNUSED(regindex); LIBXSMM_UNUSED(code);
  /* libxsmm_get_target_arch also serves as a runtime check whether JIT is available or not */
  if (LIBXSMM_X86_SSE3 <= libxsmm_target_archid) result = EXIT_FAILURE;
#endif
  return result;
}
2084
2085
2086 #if defined(LIBXSMM_DESC_PAD)
internal_pad_descriptor(libxsmm_descriptor * desc,size_t size)2087 LIBXSMM_API_INLINE void internal_pad_descriptor(libxsmm_descriptor* desc, size_t size)
2088 {
2089 const signed char s = (signed char)LIBXSMM_MAX(LIBXSMM_DIFF_SIZE, LIBXSMM_HASH_SIZE); signed char i;
2090 LIBXSMM_ASSERT(NULL != desc && s <= LIBXSMM_DESCRIPTOR_MAXSIZE);
2091 for (i = (signed char)size; i < s; ++i) desc->data[i] = 0;
2092 }
2093 #endif
2094
2095
/**
 * Finds the code pointer registered for the given descriptor; if there is none and
 * LIBXSMM_JIT is enabled, a build is requested (libxsmm_build) and the outcome is
 * stored in internal_registry/internal_registry_keys.
 * desc: descriptor used as key; it is written to if LIBXSMM_DESC_PAD is defined (padding).
 * desc_size: size of the descriptor payload; sizeof(libxsmm_descriptor_kind) is added
 *   and the sum is limited to LIBXSMM_DIFF_SIZE.
 * user_size: forwarded to the build request (request.user_size).
 * Returns the code pointer with LIBXSMM_CODE_STATIC (and LIBXSMM_HASH_COLLISION, if
 * defined) cleared; the pointer is NULL if nothing was found or built.
 */
internal_find_code(libxsmm_descriptor * desc,size_t desc_size,size_t user_size)2096 LIBXSMM_API_INLINE libxsmm_code_pointer internal_find_code(libxsmm_descriptor* desc, size_t desc_size, size_t user_size)
2097 {
2098 libxsmm_code_pointer flux_entry = { 0 };
/* number of key bytes taken into account: kind plus payload, limited to LIBXSMM_DIFF_SIZE */
2099 const size_t size = LIBXSMM_MIN(sizeof(libxsmm_descriptor_kind) + desc_size, LIBXSMM_DIFF_SIZE);
2100 #if !defined(NDEBUG) && (0 != LIBXSMM_JIT)
/* outcome of libxsmm_build; only consulted by the final assert of this function */
2101 int build = EXIT_SUCCESS;
2102 #endif
/* Stage 1 (optional): search the descriptor among the keys of a small cache */
2103 #if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
2104 # if defined(LIBXSMM_NTHREADS_USE)
/* cache is selected by the thread id */
2105 const unsigned int tid = libxsmm_get_tid();
2106 internal_cache_type *const cache = internal_cache_buffer + tid;
2107 # else
/* cache is a function-local static qualified with LIBXSMM_TLS */
2108 static LIBXSMM_TLS internal_cache_type internal_cache_buffer;
2109 internal_cache_type *const cache = &internal_cache_buffer;
2110 # endif
2111 unsigned char cache_index;
2112 # if defined(LIBXSMM_DESC_PAD)
2113 # if defined(LIBXSMM_DESC_INLINE)
2114 LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
2115 internal_pad_descriptor(desc, size);
2116 LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
2117 LIBXSMM_DIFF_N(unsigned char, cache_index, LIBXSMM_DIFF(LIBXSMM_DIFF_SIZE), xdesc, cache->entry.keys,
2118 LIBXSMM_DIFF_SIZE, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
2119 # else
2120 internal_pad_descriptor(desc, size);
2121 cache_index = (unsigned char)libxsmm_diff_n(desc, cache->entry.keys,
2122 LIBXSMM_DIFF_SIZE, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
2123 # endif
2124 # elif defined(LIBXSMM_DESC_INLINE)
2125 LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
2126 LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
2127 LIBXSMM_DIFF_N(unsigned char, cache_index, LIBXSMM_DIFF(LIBXSMM_DIFF_SIZE), xdesc, cache->entry.keys,
2128 size, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
2129 # else
2130 LIBXSMM_ASSERT(NULL != desc);
2131 cache_index = (unsigned char)libxsmm_diff_n(desc, cache->entry.keys,
2132 size, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
2133 # endif
/* the cache content only counts if cache->entry.id matches the current libxsmm_ninit */
2134 if (cache->entry.id == libxsmm_ninit && cache_index < cache->entry.size) { /* valid hit */
2135 flux_entry = cache->entry.code[cache_index];
2136 cache->entry.hit = cache_index;
2137 }
2138 else
/* cache miss: the else-branch is the braced block that follows the #endif below */
2139 #else
2140 LIBXSMM_ASSERT(NULL != desc);
2141 # if defined(LIBXSMM_DESC_PAD)
2142 # if defined(LIBXSMM_DESC_INLINE)
2143 LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
2144 internal_pad_descriptor(desc, size);
2145 LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
2146 # else
2147 internal_pad_descriptor(desc, size);
2148 # endif
2149 # endif
2150 #endif
2151 {
/* Stage 2: hash the descriptor (CRC32) to determine the initial registry slot */
2152 #if defined(LIBXSMM_DESC_PAD)
2153 unsigned int i = LIBXSMM_CRC32(LIBXSMM_HASH_SIZE)(LIBXSMM_HASH_SEED, desc);
2154 #else
2155 unsigned int i = libxsmm_crc32(LIBXSMM_HASH_SEED, desc, LIBXSMM_MIN(size, LIBXSMM_HASH_SIZE));
2156 #endif
/* i: current slot, i0: slot kept on record, diff: non-zero keeps the loop below running.
 * mode: 0 = initial, 1 = search for an existing code version, 2 = enter code generation,
 * 3 = enter code generation and collision fix-up (only with LIBXSMM_HASH_COLLISION). */
2157 unsigned int i0 = i = LIBXSMM_MOD2(i, LIBXSMM_CAPACITY_REGISTRY), mode = 0, diff = 1;
2158 LIBXSMM_ASSERT(NULL != internal_registry);
2159 LIBXSMM_ASSERT(&desc->kind == &desc->gemm.pad && desc->kind == desc->gemm.pad);
2160 do { /* use calculated location and check if the requested code is already JITted */
2161 #if (1 < INTERNAL_REGLOCK_MAXN) || !LIBXSMM_LOCK_TYPE_ISRW(LIBXSMM_REGLOCK) /* read registered code */
2162 # if 1 /* omitting an atomic load is safe but avoids race-detectors to highlight this location */
2163 uintptr_t *const fluxaddr = &internal_registry[i].uval;
2164 flux_entry.uval = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)(fluxaddr, LIBXSMM_ATOMIC_RELAXED);
2165 # else
2166 flux_entry = internal_registry[i];
2167 # endif
2168 #else
2169 LIBXSMM_LOCK_ACQREAD(LIBXSMM_REGLOCK, internal_reglock_ptr);
2170 flux_entry = internal_registry[i]; /* read registered code */
2171 LIBXSMM_LOCK_RELREAD(LIBXSMM_REGLOCK, internal_reglock_ptr);
2172 #endif
/* taken for an occupied slot (mode 0 or 1) or for any slot while searching (mode 1) */
2173 if ((NULL != flux_entry.ptr_const || 1 == mode) && 2 > mode) { /* check existing entry further */
2174 if (NULL != flux_entry.ptr_const) {
/* compare the requested descriptor with the key stored for slot i (zero means equal) */
2175 #if defined(LIBXSMM_DESC_PAD)
2176 # if defined(LIBXSMM_DIFF_INLINE)
2177 # if !defined(LIBXSMM_DESC_INLINE)
2178 LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
2179 LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
2180 # endif
2181 diff = LIBXSMM_DIFF(LIBXSMM_DIFF_SIZE)(xdesc, internal_registry_keys + i, 0/*dummy*/);
2182 # else
2183 diff = libxsmm_diff(desc, internal_registry_keys + i, LIBXSMM_DIFF_SIZE);
2184 # endif
2185 #else
2186 diff = libxsmm_diff(desc, internal_registry_keys + i, size);
2187 #endif
2188 }
2189 #if !defined(NDEBUG)
2190 else LIBXSMM_ASSERT(0 != diff);
2191 #endif
2192 if (0 != diff) { /* search for code version */
2193 if (0 == mode) { /* transition to higher mode */
2194 i0 = i; /* keep current position on record */
2195 #if defined(LIBXSMM_HASH_COLLISION)
2196 /* enter code generation, and collision fix-up */
2197 if (0 == (LIBXSMM_HASH_COLLISION & flux_entry.uval)) {
2198 LIBXSMM_ASSERT(NULL != flux_entry.ptr_const); /* collision */
2199 mode = 3;
2200 }
2201 else
2202 #endif /* search for an existing code version */
2203 mode = 1; /* else */
2204 }
/* proceed with the next slot; arriving at i0 again means all slots were visited */
2205 i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY);
2206 if (i == i0) { /* search finished, no code version exists */
2207 #if defined(LIBXSMM_HASH_COLLISION)
2208 mode = 3; /* enter code generation, and collision fix-up */
2209 #else
2210 mode = 2; /* enter code generation */
2211 #endif
2212 if (LIBXSMM_KERNEL_KIND_MATMUL == desc->kind) {
2213 internal_update_mmstatistic(&desc->gemm.desc, 0, 1/*collision*/, 0, 0);
2214 }
2215 }
2216 LIBXSMM_ASSERT(0 != diff); /* continue */
2217 }
2218 }
2219 else { /* enter code generation (there is no code version yet) */
2220 LIBXSMM_ASSERT(0 == mode || 1 < mode);
2221 #if (0 == LIBXSMM_JIT)
2222 LIBXSMM_UNUSED(user_size);
2223 #else
2224 if (LIBXSMM_X86_AVX <= libxsmm_target_archid || /* check if JIT is supported (CPUID) */
2225 (LIBXSMM_X86_SSE3 <= libxsmm_target_archid && LIBXSMM_KERNEL_KIND_MATMUL == desc->kind) ||
2226 (LIBXSMM_KERNEL_KIND_USER == desc->kind))
2227 {
2228 LIBXSMM_ASSERT(0 != mode || NULL == flux_entry.ptr_const/*code version does not exist*/);
2229 INTERNAL_FIND_CODE_LOCK(lock, i, diff, flux_entry.ptr); /* lock the registry entry */
2230 if (NULL == internal_registry[i].ptr_const) { /* double-check registry after acquiring the lock */
2231 libxsmm_build_request request; /* setup the code build request */
2232 LIBXSMM_ASSERT(desc->kind < LIBXSMM_KERNEL_UNREGISTERED);
2233 request.kind = (libxsmm_build_kind)desc->kind;
2234 request.descriptor.ptr = &desc->gemm.desc;
2235 request.user_size = user_size;
2236 # if defined(NDEBUG)
2237 if (EXIT_SUCCESS == libxsmm_build(&request, i, &flux_entry) && NULL != flux_entry.ptr_const)
2238 # else
2239 build = libxsmm_build(&request, i, &flux_entry);
2240 if (EXIT_SUCCESS == build && NULL != flux_entry.ptr_const)
2241 # endif
2242 {
/* register the result: the key is stored first, the code pointer afterwards */
2243 LIBXSMM_ASSIGN127(internal_registry_keys + i, desc);
2244 # if (1 < INTERNAL_REGLOCK_MAXN)
2245 LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)(&internal_registry[i].ptr, flux_entry.ptr, LIBXSMM_ATOMIC_SEQ_CST);
2246 # else
2247 internal_registry[i] = flux_entry;
2248 # endif
2249 # if defined(LIBXSMM_HASH_COLLISION)
2250 if (2 < mode) { /* arrived from collision state; now mark as collision */
2251 libxsmm_code_pointer fix_entry;
2252 # if (1 < INTERNAL_REGLOCK_MAXN)
2253 fix_entry.ptr = LIBXSMM_ATOMIC_LOAD(&internal_registry[i0].ptr, LIBXSMM_ATOMIC_RELAXED);
2254 # else
2255 fix_entry = internal_registry[i0];
2256 # endif
2257 LIBXSMM_ASSERT(NULL != fix_entry.ptr_const);
2258 if (0 == (LIBXSMM_HASH_COLLISION & fix_entry.uval)) {
2259 fix_entry.uval |= LIBXSMM_HASH_COLLISION; /* mark current entry as collision */
2260 # if (1 < INTERNAL_REGLOCK_MAXN)
2261 LIBXSMM_ATOMIC_STORE(&internal_registry[i0].ptr, fix_entry.ptr, LIBXSMM_ATOMIC_RELAXED);
2262 # else
2263 internal_registry[i0] = fix_entry;
2264 # endif
2265 }
2266 }
2267 # endif
2268 }
2269 if (((int)LIBXSMM_KERNEL_KIND_MATMUL) == desc->kind) {
2270 internal_update_mmstatistic(&desc->gemm.desc, 1/*try*/, 0, 0, 0);
2271 }
2272 /* leave here even in case of a build-error; do not use break (inside of locked region) */
2273 diff = 0;
2274 }
2275 INTERNAL_FIND_CODE_UNLOCK(lock);
/* diff is still non-zero if the slot was found occupied after the lock was acquired */
2276 if (0 != diff) { /* acquire registry slot */
2277 if (0 == mode) { /* initial condition */
2278 mode = 2; /* continue to linearly search for an empty slot */
2279 i0 = i; /* keep current position on record */
2280 }
2281 do { /* continue to linearly search for an available slot */
2282 i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY);
2283 if (NULL == internal_registry[i].ptr_const) break;
2284 } while (i != i0);
2285 if (i == i0) { /* out of capacity (no registry slot available) */
2286 diff = 0; /* do not use break if inside of locked region */
2287 }
2288 flux_entry.ptr = NULL; /* no result */
2289 }
2290 }
2291 else /* JIT-code generation not available */
2292 #endif
2293 { /* leave the dispatch loop */
2294 if (((int)LIBXSMM_KERNEL_KIND_MATMUL) == desc->kind) {
2295 internal_update_mmstatistic(&desc->gemm.desc, 1/*try*/, 0, 0, 0);
2296 }
2297 #if !defined(NDEBUG) && (0 != LIBXSMM_JIT)
2298 build = EXIT_FAILURE;
2299 #endif
2300 flux_entry.ptr = NULL;
2301 diff = 0;
2302 }
2303 }
2304 } while (0 != diff);
/* Stage 3 (optional): record the result in the cache (grow, evict, or reset the cache) */
2305 #if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
2306 if (NULL != flux_entry.ptr_const) { /* keep code version on record (cache) */
2307 LIBXSMM_ASSERT(0 == diff);
2308 if (cache->entry.id == libxsmm_ninit) { /* maintain cache */
2309 if (cache->entry.size < internal_cache_size) { /* grow */
2310 INTERNAL_FIND_CODE_CACHE_GROW(cache_index, cache->entry.size);
2311 LIBXSMM_ASSERT(cache->entry.size <= internal_cache_size);
2312 }
2313 else { /* evict */
2314 LIBXSMM_ASSERT(cache->entry.hit < cache->entry.size);
2315 INTERNAL_FIND_CODE_CACHE_EVICT(cache_index, cache->entry.size, cache->entry.hit);
2316 }
2317 }
2318 else if (0 != internal_cache_size) { /* reset cache */
2319 # if !defined(NDEBUG)
2320 LIBXSMM_MEMZERO127(cache->entry.keys);
2321 # endif
2322 cache->entry.id = libxsmm_ninit;
2323 cache->entry.size = 1;
2324 cache_index = 0;
2325 }
/* NOTE(review): if the cache id is stale and internal_cache_size is zero, cache_index
 * is used as produced by the lookup above - confirm that this case cannot occur */
2326 LIBXSMM_ASSIGN127(cache->entry.keys + cache_index, desc);
2327 cache->entry.code[cache_index] = flux_entry;
2328 cache->entry.hit = cache_index;
2329 }
2330 #endif
2331 }
2332 #if defined(LIBXSMM_HASH_COLLISION)
2333 flux_entry.uval &= ~(LIBXSMM_CODE_STATIC | LIBXSMM_HASH_COLLISION); /* clear non-JIT and collision flag */
2334 #else
2335 flux_entry.uval &= ~LIBXSMM_CODE_STATIC; /* clear non-JIT flag */
2336 #endif
2337 #if (0 != LIBXSMM_JIT)
2338 assert(LIBXSMM_KERNEL_KIND_MATMUL != desc->kind || NULL != flux_entry.ptr_const || EXIT_SUCCESS != build || 1 == internal_reglock_count); /*!LIBXSMM_ASSERT*/
2339 #endif
2340 return flux_entry;
2341 }
2342
2343
libxsmm_get_kernel_xinfo(libxsmm_code_pointer code,const libxsmm_descriptor ** desc,size_t * code_size)2344 LIBXSMM_API_INTERN const libxsmm_kernel_xinfo* libxsmm_get_kernel_xinfo(libxsmm_code_pointer code, const libxsmm_descriptor** desc, size_t* code_size)
2345 {
2346 libxsmm_kernel_xinfo* result = NULL;
2347 void *const result_address = &result;
2348 int flags = LIBXSMM_MALLOC_FLAG_X;
2349 if (NULL != code.ptr_const && EXIT_SUCCESS == libxsmm_get_malloc_xinfo(code.ptr_const, code_size, &flags, (void**)result_address) && NULL != result) {
2350 if (NULL != desc) {
2351 if (NULL != internal_registry && NULL != internal_registry_keys && result->registered < (LIBXSMM_CAPACITY_REGISTRY)
2352 #if defined(LIBXSMM_HASH_COLLISION)
2353 && code.uval == (~LIBXSMM_HASH_COLLISION & internal_registry[result->registered].uval)
2354 #else
2355 && code.ptr_const == internal_registry[result->registered].ptr_const
2356 #endif
2357 && internal_registry_keys[result->registered].kind < LIBXSMM_KERNEL_UNREGISTERED)
2358 {
2359 *desc = internal_registry_keys + result->registered;
2360 }
2361 else *desc = NULL;
2362 }
2363 }
2364 else {
2365 LIBXSMM_ASSERT(NULL == result);
2366 if (NULL != code_size) *code_size = 0;
2367 if (NULL != desc) *desc = NULL;
2368 }
2369 return result;
2370 }
2371
2372
libxsmm_get_kernel_info(const void * kernel,libxsmm_kernel_info * info)2373 LIBXSMM_API int libxsmm_get_kernel_info(const void* kernel, libxsmm_kernel_info* info)
2374 {
2375 int result;
2376 const libxsmm_kernel_xinfo* xinfo;
2377 libxsmm_kernel_info result_info;
2378 const libxsmm_descriptor* desc;
2379 libxsmm_code_pointer code;
2380 code.ptr_const = kernel;
2381 LIBXSMM_MEMZERO127(&result_info);
2382 xinfo = libxsmm_get_kernel_xinfo(code, &desc, &result_info.code_size);
2383 if (NULL != xinfo) {
2384 if (NULL != desc) {
2385 const libxsmm_kernel_kind kind = (libxsmm_kernel_kind)desc->kind;
2386 result_info.kind = kind;
2387 if (LIBXSMM_KERNEL_KIND_USER == kind) {
2388 result_info.code_size = 0; /* invalid */
2389 }
2390 }
2391 else {
2392 result_info.kind = LIBXSMM_KERNEL_UNREGISTERED;
2393 }
2394 result_info.nflops = xinfo->nflops;
2395 LIBXSMM_ASSIGN127(info, &result_info);
2396 result = EXIT_SUCCESS;
2397 }
2398 else {
2399 LIBXSMM_ASSERT(NULL == desc);
2400 if (NULL != info) {
2401 LIBXSMM_ASSIGN127(info, &result_info);
2402 result = EXIT_FAILURE;
2403 }
2404 else {
2405 result = EXIT_SUCCESS;
2406 }
2407 }
2408 return result;
2409 }
2410
2411
libxsmm_get_mmkernel_info(libxsmm_xmmfunction kernel,libxsmm_mmkernel_info * info)2412 LIBXSMM_API int libxsmm_get_mmkernel_info(libxsmm_xmmfunction kernel, libxsmm_mmkernel_info* info)
2413 {
2414 libxsmm_code_pointer code;
2415 static int error_once = 0;
2416 int result;
2417 code.xgemm = kernel;
2418 if (NULL != info) {
2419 const libxsmm_descriptor* desc;
2420 if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) &&
2421 NULL != desc && LIBXSMM_KERNEL_KIND_MATMUL == desc->kind)
2422 {
2423 info->iprecision = (libxsmm_gemm_precision)LIBXSMM_GETENUM_INP(desc->gemm.desc.datatype);
2424 info->oprecision = (libxsmm_gemm_precision)LIBXSMM_GETENUM_OUT(desc->gemm.desc.datatype);
2425 info->prefetch = (libxsmm_gemm_prefetch_type)desc->gemm.desc.prefetch;
2426 info->flags = desc->gemm.desc.flags;
2427 info->lda = desc->gemm.desc.lda;
2428 info->ldb = desc->gemm.desc.ldb;
2429 info->ldc = desc->gemm.desc.ldc;
2430 info->m = desc->gemm.desc.m;
2431 info->n = desc->gemm.desc.n;
2432 info->k = desc->gemm.desc.k;
2433 result = EXIT_SUCCESS;
2434 }
2435 else {
2436 if ( 0 != libxsmm_verbosity /* library code is expected to be mute */
2437 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2438 {
2439 if (NULL == code.ptr_const) {
2440 fprintf(stderr, "LIBXSMM ERROR: NULL-kernel cannot be inspected!\n");
2441 }
2442 else {
2443 fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n");
2444 }
2445 }
2446 result = EXIT_FAILURE;
2447 }
2448 }
2449 else {
2450 if (0 != libxsmm_verbosity /* library code is expected to be mute */
2451 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2452 {
2453 fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n");
2454 }
2455 result = EXIT_FAILURE;
2456 }
2457 return result;
2458 }
2459
2460
libxsmm_get_transkernel_info(libxsmm_xtransfunction kernel,libxsmm_transkernel_info * info)2461 LIBXSMM_API int libxsmm_get_transkernel_info(libxsmm_xtransfunction kernel, libxsmm_transkernel_info* info)
2462 {
2463 libxsmm_code_pointer code;
2464 static int error_once = 0;
2465 int result;
2466 code.xtrans = kernel;
2467 if (NULL != info) {
2468 const libxsmm_descriptor* desc;
2469 if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) &&
2470 NULL != desc && LIBXSMM_KERNEL_KIND_TRANS == desc->kind)
2471 {
2472 info->typesize = desc->trans.desc.typesize;
2473 info->ldo = desc->trans.desc.ldo;
2474 info->m = desc->trans.desc.m;
2475 info->n = desc->trans.desc.n;
2476 result = EXIT_SUCCESS;
2477 }
2478 else {
2479 if (0 != libxsmm_verbosity /* library code is expected to be mute */
2480 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2481 {
2482 fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n");
2483 }
2484 result = EXIT_FAILURE;
2485 }
2486 }
2487 else {
2488 if (0 != libxsmm_verbosity /* library code is expected to be mute */
2489 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2490 {
2491 fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n");
2492 }
2493 result = EXIT_FAILURE;
2494 }
2495 return result;
2496 }
2497
2498
libxsmm_get_mcopykernel_info(libxsmm_xmcopyfunction kernel,libxsmm_mcopykernel_info * info)2499 LIBXSMM_API int libxsmm_get_mcopykernel_info(libxsmm_xmcopyfunction kernel, libxsmm_mcopykernel_info* info)
2500 {
2501 libxsmm_code_pointer code;
2502 static int error_once = 0;
2503 int result;
2504 code.xmatcopy = kernel;
2505 if (NULL != info) {
2506 const libxsmm_descriptor* desc;
2507 if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) &&
2508 NULL != desc && LIBXSMM_KERNEL_KIND_MCOPY == desc->kind)
2509 {
2510 info->typesize = desc->mcopy.desc.typesize;
2511 info->prefetch = desc->mcopy.desc.prefetch;
2512 info->flags = desc->mcopy.desc.flags;
2513 info->ldi = desc->mcopy.desc.ldi;
2514 info->ldo = desc->mcopy.desc.ldo;
2515 info->m = desc->mcopy.desc.m;
2516 info->n = desc->mcopy.desc.n;
2517 result = EXIT_SUCCESS;
2518 }
2519 else {
2520 if (0 != libxsmm_verbosity /* library code is expected to be mute */
2521 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2522 {
2523 fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n");
2524 }
2525 result = EXIT_FAILURE;
2526 }
2527 }
2528 else {
2529 if (0 != libxsmm_verbosity /* library code is expected to be mute */
2530 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2531 {
2532 fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n");
2533 }
2534 result = EXIT_FAILURE;
2535 }
2536 return result;
2537 }
2538
2539
libxsmm_get_meltwkernel_info(libxsmm_xmeltwfunction kernel,libxsmm_meltwkernel_info * info)2540 LIBXSMM_API int libxsmm_get_meltwkernel_info(libxsmm_xmeltwfunction kernel, libxsmm_meltwkernel_info* info)
2541 {
2542 libxsmm_code_pointer code;
2543 static int error_once = 0;
2544 int result;
2545 code.xmateltw = kernel;
2546 if (NULL != info) {
2547 const libxsmm_descriptor* desc;
2548 if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) &&
2549 NULL != desc && LIBXSMM_KERNEL_KIND_MELTW == desc->kind)
2550 {
2551 info->datatype = desc->meltw.desc.datatype;
2552 info->operation = desc->meltw.desc.operation;
2553 info->flags = desc->meltw.desc.flags;
2554 info->ldi = desc->meltw.desc.ldi;
2555 info->ldo = desc->meltw.desc.ldo;
2556 info->m = desc->meltw.desc.m;
2557 info->n = desc->meltw.desc.n;
2558 result = EXIT_SUCCESS;
2559 }
2560 else {
2561 if (0 != libxsmm_verbosity /* library code is expected to be mute */
2562 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2563 {
2564 fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n");
2565 }
2566 result = EXIT_FAILURE;
2567 }
2568 }
2569 else {
2570 if (0 != libxsmm_verbosity /* library code is expected to be mute */
2571 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2572 {
2573 fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n");
2574 }
2575 result = EXIT_FAILURE;
2576 }
2577 return result;
2578 }
2579
libxsmm_get_registry_info(libxsmm_registry_info * info)2580 LIBXSMM_API int libxsmm_get_registry_info(libxsmm_registry_info* info)
2581 {
2582 int result = EXIT_SUCCESS;
2583 LIBXSMM_INIT /* verbosity */
2584 if (0 != info && 0 != internal_registry) {
2585 size_t i;
2586 LIBXSMM_MEMZERO127(info); /* info->nstatic = 0; info->size = 0; */
2587 info->nbytes = (LIBXSMM_CAPACITY_REGISTRY) * (sizeof(libxsmm_code_pointer) + sizeof(libxsmm_descriptor));
2588 info->capacity = LIBXSMM_CAPACITY_REGISTRY;
2589 #if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
2590 info->ncache = internal_cache_size;
2591 #else
2592 info->ncache = 0;
2593 #endif
2594 for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) {
2595 libxsmm_code_pointer code = internal_registry[i];
2596 if (0 != code.ptr_const && EXIT_SUCCESS == result) {
2597 if (0 == (LIBXSMM_CODE_STATIC & code.uval)) { /* check for allocated/generated JIT-code */
2598 size_t buffer_size = 0;
2599 void* buffer = 0;
2600 #if defined(LIBXSMM_HASH_COLLISION)
2601 code.uval &= ~LIBXSMM_HASH_COLLISION; /* clear collision flag */
2602 #endif
2603 result = libxsmm_get_malloc_xinfo(code.ptr_const, &buffer_size, NULL/*flags*/, &buffer);
2604 if (EXIT_SUCCESS == result) {
2605 info->nbytes += LIBXSMM_UP2(buffer_size + (((char*)code.ptr_const) - (char*)buffer), LIBXSMM_PAGE_MINSIZE);
2606 }
2607 }
2608 else {
2609 ++info->nstatic;
2610 }
2611 ++info->size;
2612 }
2613 }
2614 }
2615 else {
2616 result = EXIT_FAILURE;
2617 }
2618 return result;
2619 }
2620
2621
libxsmm_xregister(const void * key,size_t key_size,size_t value_size,const void * value_init)2622 LIBXSMM_API void* libxsmm_xregister(const void* key, size_t key_size, size_t value_size, const void* value_init)
2623 {
2624 static int error_once = 0;
2625 void* result;
2626 LIBXSMM_INIT /* verbosity */
2627 if (NULL != key && 0 < key_size && LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size) {
2628 libxsmm_descriptor wrap;
2629 void* dst;
2630 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
2631 LIBXSMM_MEMSET127(&wrap, 0, key_size);
2632 #endif
2633 LIBXSMM_MEMCPY127(wrap.user.desc, key, key_size);
2634 wrap.kind = LIBXSMM_KERNEL_KIND_USER;
2635 dst = internal_find_code(&wrap, key_size, value_size).ptr;
2636 if (NULL != dst) {
2637 size_t size;
2638 if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(dst, &size, NULL/*flags*/, NULL/*extra*/)
2639 && value_size <= size)
2640 {
2641 if (NULL != value_init) memcpy(dst, value_init, value_size);
2642 result = dst;
2643 }
2644 else {
2645 if (0 != libxsmm_verbosity /* library code is expected to be mute */
2646 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2647 {
2648 fprintf(stderr, "LIBXSMM ERROR: value too large for previously registered key!\n");
2649 }
2650 result = NULL;
2651 }
2652 }
2653 else result = NULL;
2654 }
2655 else {
2656 if (0 != libxsmm_verbosity /* library code is expected to be mute */
2657 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2658 {
2659 if (LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size) {
2660 fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xregister specified!\n");
2661 }
2662 else {
2663 fprintf(stderr, "LIBXSMM ERROR: libxsmm_xregister has maximum key-size of %i Byte!\n",
2664 LIBXSMM_DESCRIPTOR_MAXSIZE);
2665 }
2666 }
2667 result = NULL;
2668 }
2669 return result;
2670 }
2671
2672
libxsmm_xdispatch(const void * key,size_t key_size)2673 LIBXSMM_API void* libxsmm_xdispatch(const void* key, size_t key_size)
2674 {
2675 void* result;
2676 LIBXSMM_INIT /* verbosity */
2677 #if !defined(NDEBUG)
2678 if (NULL != key && 0 < key_size && LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size)
2679 #endif
2680 {
2681 libxsmm_descriptor wrap;
2682 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
2683 LIBXSMM_MEMSET127(&wrap, 0, key_size);
2684 #endif
2685 LIBXSMM_MEMCPY127(wrap.user.desc, key, key_size);
2686 wrap.kind = LIBXSMM_KERNEL_KIND_USER;
2687 result = internal_find_code(&wrap, key_size, 0/*user_size*/).ptr;
2688 }
2689 #if !defined(NDEBUG)
2690 else {
2691 static int error_once = 0;
2692 if (0 != libxsmm_verbosity /* library code is expected to be mute */
2693 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2694 {
2695 fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xdispatch specified!\n");
2696 }
2697 result = NULL;
2698 }
2699 #endif
2700 return result;
2701 }
2702
2703
libxsmm_xrelease(const void * key,size_t key_size)2704 LIBXSMM_API void libxsmm_xrelease(const void* key, size_t key_size)
2705 {
2706 libxsmm_release_kernel(libxsmm_xdispatch(key, key_size));
2707 }
2708
2709
libxsmm_xmmdispatch(const libxsmm_gemm_descriptor * descriptor)2710 LIBXSMM_API libxsmm_xmmfunction libxsmm_xmmdispatch(const libxsmm_gemm_descriptor* descriptor)
2711 {
2712 libxsmm_xmmfunction result;
2713 LIBXSMM_INIT /* verbosity */
2714 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
2715 LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
2716 #endif
2717 if (NULL != descriptor) {
2718 libxsmm_descriptor wrap;
2719 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
2720 LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
2721 #endif
2722 LIBXSMM_ASSIGN127(&wrap.gemm.desc, descriptor);
2723 wrap.kind = LIBXSMM_KERNEL_KIND_MATMUL;
2724 if (0 != (0x80 & descriptor->prefetch)) { /* "sign"-bit of byte-value is set */
2725 wrap.gemm.desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
2726 }
2727 result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xgemm;
2728 #if defined(_DEBUG)
2729 if (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity && INT_MAX != libxsmm_verbosity && NULL != result.xmm) {
2730 LIBXSMM_STDIO_ACQUIRE();
2731 fprintf(stderr, "\nLIBXSMM: ");
2732 libxsmm_gemm_xprint(stderr, result, NULL/*a*/, NULL/*b*/, NULL/*c*/);
2733 LIBXSMM_STDIO_RELEASE();
2734 }
2735 #endif
2736 }
2737 else { /* quietly accept NULL-descriptor */
2738 result.xmm = NULL;
2739 }
2740 return result;
2741 }
2742
2743
libxsmm_dmmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)2744 LIBXSMM_API libxsmm_dmmfunction libxsmm_dmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2745 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2746 const double* alpha, const double* beta, const int* flags, const int* prefetch)
2747 {
2748 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
2749 libxsmm_descriptor_blob blob;
2750 const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
2751 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2752 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2753 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2754 gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2755 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2756 return result.dmm;
2757 }
2758
2759
libxsmm_smmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2760 LIBXSMM_API libxsmm_smmfunction libxsmm_smmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2761 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2762 const float* alpha, const float* beta, const int* flags, const int* prefetch)
2763 {
2764 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
2765 libxsmm_descriptor_blob blob;
2766 const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
2767 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2768 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2769 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2770 gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2771 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2772 return result.smm;
2773 }
2774
2775
libxsmm_bsmmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2776 LIBXSMM_API libxsmm_bsmmfunction libxsmm_bsmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2777 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2778 const float* alpha, const float* beta, const int* flags, const int* prefetch)
2779 {
2780 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2781 libxsmm_descriptor_blob blob;
2782 const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
2783 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2784 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2785 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2786 gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2787 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2788 return result.bsmm;
2789 }
2790
2791
libxsmm_bmmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2792 LIBXSMM_API libxsmm_bmmfunction libxsmm_bmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2793 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2794 const float* alpha, const float* beta, const int* flags, const int* prefetch)
2795 {
2796 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2797 libxsmm_descriptor_blob blob;
2798 const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
2799 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2800 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2801 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2802 gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2803 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2804 return result.bmm;
2805 }
2806
2807
libxsmm_wimmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2808 LIBXSMM_API libxsmm_wimmfunction libxsmm_wimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2809 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2810 const int* alpha, const int* beta, const int* flags, const int* prefetch)
2811 {
2812 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2813 libxsmm_descriptor_blob blob;
2814 const libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
2815 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2816 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2817 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2818 gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2819 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2820 return result.wimm;
2821 }
2822
2823
libxsmm_ssbimmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2824 LIBXSMM_API libxsmm_ssbimmfunction libxsmm_ssbimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2825 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2826 const int* alpha, const int* beta, const int* flags, const int* prefetch)
2827 {
2828 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2829 libxsmm_descriptor_blob blob;
2830 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
2831 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2832 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2833 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2834 gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2835 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2836 return result.ssbimm;
2837 }
2838
2839
libxsmm_usbimmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2840 LIBXSMM_API libxsmm_usbimmfunction libxsmm_usbimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2841 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2842 const int* alpha, const int* beta, const int* flags, const int* prefetch)
2843 {
2844 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2845 libxsmm_descriptor_blob blob;
2846 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
2847 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2848 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2849 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2850 gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch));
2851 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2852 return result.usbimm;
2853 }
2854
2855
libxsmm_subimmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2856 LIBXSMM_API libxsmm_subimmfunction libxsmm_subimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2857 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2858 const int* alpha, const int* beta, const int* flags, const int* prefetch)
2859 {
2860 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2861 libxsmm_descriptor_blob blob;
2862 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
2863 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2864 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2865 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2866 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch));
2867 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2868 return result.subimm;
2869 }
2870
2871
libxsmm_uubimmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2872 LIBXSMM_API libxsmm_uubimmfunction libxsmm_uubimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2873 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2874 const int* alpha, const int* beta, const int* flags, const int* prefetch)
2875 {
2876 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2877 libxsmm_descriptor_blob blob;
2878 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
2879 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2880 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2881 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2882 gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch));
2883 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2884 return result.uubimm;
2885 }
2886
2887
libxsmm_sububmmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2888 LIBXSMM_API libxsmm_sububmmfunction libxsmm_sububmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2889 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2890 const int* alpha, const int* beta, const int* flags, const int* prefetch)
2891 {
2892 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2893 libxsmm_descriptor_blob blob;
2894 const libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
2895 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2896 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2897 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2898 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch));
2899 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2900 return result.sububmm;
2901 }
2902
2903
libxsmm_dmmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)2904 LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2905 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2906 const double* alpha, const double* beta, const int* flags, const int* prefetch)
2907 {
2908 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
2909 libxsmm_descriptor_blob blob;
2910 const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
2911 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2912 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2913 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2914 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2915 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2916 return result.dmra;
2917 }
2918
2919
libxsmm_smmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2920 LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2921 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2922 const float* alpha, const float* beta, const int* flags, const int* prefetch)
2923 {
2924 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
2925 libxsmm_descriptor_blob blob;
2926 const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
2927 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2928 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2929 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2930 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2931 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2932 return result.smra;
2933 }
2934
2935
libxsmm_bsmmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2936 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2937 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2938 const float* alpha, const float* beta, const int* flags, const int* prefetch)
2939 {
2940 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2941 libxsmm_descriptor_blob blob;
2942 const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
2943 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2944 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2945 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2946 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2947 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2948 return result.bsmra;
2949 }
2950
2951
libxsmm_bmmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2952 LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2953 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2954 const float* alpha, const float* beta, const int* flags, const int* prefetch)
2955 {
2956 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2957 libxsmm_descriptor_blob blob;
2958 const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
2959 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2960 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2961 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2962 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2963 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2964 return result.bmra;
2965 }
2966
2967
libxsmm_wimmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2968 LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2969 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2970 const int* alpha, const int* beta, const int* flags, const int* prefetch)
2971 {
2972 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2973 libxsmm_descriptor_blob blob;
2974 const libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
2975 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2976 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2977 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2978 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2979 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2980 return result.wimra;
2981 }
2982
2983
libxsmm_ssbimmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2984 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2985 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2986 const int* alpha, const int* beta, const int* flags, const int* prefetch)
2987 {
2988 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2989 libxsmm_descriptor_blob blob;
2990 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
2991 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2992 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2993 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2994 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2995 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2996 return result.ssbimra;
2997 }
2998
2999
libxsmm_usbimmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3000 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3001 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3002 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3003 {
3004 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3005 libxsmm_descriptor_blob blob;
3006 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3007 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3008 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3009 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3010 gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3011 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3012 return result.usbimra;
3013 }
3014
3015
libxsmm_subimmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3016 LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3017 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3018 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3019 {
3020 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3021 libxsmm_descriptor_blob blob;
3022 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3023 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3024 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3025 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3026 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3027 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3028 return result.subimra;
3029 }
3030
3031
libxsmm_uubimmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3032 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3033 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3034 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3035 {
3036 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3037 libxsmm_descriptor_blob blob;
3038 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3039 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3040 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3041 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3042 gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3043 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3044 return result.uubimra;
3045 }
3046
3047
libxsmm_sububmmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3048 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3049 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3050 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3051 {
3052 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3053 libxsmm_descriptor_blob blob;
3054 const libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
3055 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3056 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3057 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3058 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3059 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3060 return result.sububmra;
3061 }
3062
3063
libxsmm_dmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)3064 LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3065 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3066 const double* alpha, const double* beta, const int* flags, const int* prefetch)
3067 {
3068 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3069 libxsmm_descriptor_blob blob;
3070 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
3071 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3072 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3073 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3074 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3075 /*const*/ libxsmm_xmmfunction result;
3076 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3077 result = libxsmm_xmmdispatch(desc);
3078 return result.dmra;
3079 }
3080
3081
libxsmm_smmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3082 LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3083 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3084 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3085 {
3086 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3087 libxsmm_descriptor_blob blob;
3088 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
3089 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3090 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3091 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3092 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3093 /*const*/ libxsmm_xmmfunction result;
3094 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3095 result = libxsmm_xmmdispatch(desc);
3096 return result.smra;
3097 }
3098
3099
libxsmm_bsmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3100 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3101 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3102 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3103 {
3104 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3105 libxsmm_descriptor_blob blob;
3106 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
3107 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3108 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3109 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3110 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3111 /*const*/ libxsmm_xmmfunction result;
3112 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3113 result = libxsmm_xmmdispatch(desc);
3114 return result.bsmra;
3115 }
3116
3117
libxsmm_bmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3118 LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3119 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3120 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3121 {
3122 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3123 libxsmm_descriptor_blob blob;
3124 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
3125 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3126 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3127 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3128 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3129 /*const*/ libxsmm_xmmfunction result;
3130 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3131 result = libxsmm_xmmdispatch(desc);
3132 return result.bmra;
3133 }
3134
3135
libxsmm_wimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3136 LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3137 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3138 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3139 {
3140 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3141 libxsmm_descriptor_blob blob;
3142 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
3143 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3144 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3145 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3146 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3147 /*const*/ libxsmm_xmmfunction result;
3148 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3149 result = libxsmm_xmmdispatch(desc);
3150 return result.wimra;
3151 }
3152
3153
libxsmm_ssbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3154 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3155 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3156 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3157 {
3158 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3159 libxsmm_descriptor_blob blob;
3160 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3161 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3162 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3163 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3164 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3165 /*const*/ libxsmm_xmmfunction result;
3166 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3167 result = libxsmm_xmmdispatch(desc);
3168 return result.ssbimra;
3169 }
3170
3171
libxsmm_usbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3172 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3173 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3174 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3175 {
3176 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3177 libxsmm_descriptor_blob blob;
3178 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3179 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3180 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3181 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3182 gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3183 /*const*/ libxsmm_xmmfunction result;
3184 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3185 result = libxsmm_xmmdispatch(desc);
3186 return result.usbimra;
3187 }
3188
3189
libxsmm_subimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3190 LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3191 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3192 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3193 {
3194 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3195 libxsmm_descriptor_blob blob;
3196 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3197 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3198 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3199 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3200 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3201 /*const*/ libxsmm_xmmfunction result;
3202 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3203 result = libxsmm_xmmdispatch(desc);
3204 return result.subimra;
3205 }
3206
3207
libxsmm_uubimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3208 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3209 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3210 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3211 {
3212 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3213 libxsmm_descriptor_blob blob;
3214 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3215 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3216 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3217 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3218 gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3219 /*const*/ libxsmm_xmmfunction result;
3220 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3221 result = libxsmm_xmmdispatch(desc);
3222 return result.uubimra;
3223 }
3224
3225
libxsmm_sububmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3226 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3227 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3228 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3229 {
3230 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3231 libxsmm_descriptor_blob blob;
3232 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
3233 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3234 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3235 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3236 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3237 /*const*/ libxsmm_xmmfunction result;
3238 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3239 result = libxsmm_xmmdispatch(desc);
3240 return result.sububmra;
3241 }
3242
3243
libxsmm_dmmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)3244 LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3245 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3246 const double* alpha, const double* beta, const int* flags, const int* prefetch)
3247 {
3248 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3249 libxsmm_descriptor_blob blob;
3250 const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
3251 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3252 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3253 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3254 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3255 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3256 return result.dmro;
3257 }
3258
3259
libxsmm_smmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3260 LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3261 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3262 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3263 {
3264 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3265 libxsmm_descriptor_blob blob;
3266 const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
3267 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3268 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3269 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3270 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3271 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3272 return result.smro;
3273 }
3274
3275
libxsmm_bsmmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3276 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3277 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3278 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3279 {
3280 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3281 libxsmm_descriptor_blob blob;
3282 const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
3283 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3284 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3285 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3286 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3287 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3288 return result.bsmro;
3289 }
3290
3291
libxsmm_bmmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3292 LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3293 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3294 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3295 {
3296 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3297 libxsmm_descriptor_blob blob;
3298 const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
3299 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3300 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3301 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3302 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3303 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3304 return result.bmro;
3305 }
3306
3307
libxsmm_wimmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3308 LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3309 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3310 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3311 {
3312 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3313 libxsmm_descriptor_blob blob;
3314 const libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
3315 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3316 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3317 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3318 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3319 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3320 return result.wimro;
3321 }
3322
3323
libxsmm_ssbimmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3324 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3325 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3326 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3327 {
3328 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3329 libxsmm_descriptor_blob blob;
3330 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3331 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3332 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3333 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3334 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3335 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3336 return result.ssbimro;
3337 }
3338
3339
libxsmm_usbimmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3340 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3341 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3342 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3343 {
3344 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3345 libxsmm_descriptor_blob blob;
3346 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3347 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3348 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3349 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3350 gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3351 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3352 return result.usbimro;
3353 }
3354
3355
libxsmm_subimmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3356 LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3357 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3358 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3359 {
3360 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3361 libxsmm_descriptor_blob blob;
3362 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3363 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3364 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3365 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3366 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3367 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3368 return result.subimro;
3369 }
3370
3371
libxsmm_uubimmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3372 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3373 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3374 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3375 {
3376 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3377 libxsmm_descriptor_blob blob;
3378 const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3379 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3380 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3381 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3382 gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3383 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3384 return result.uubimro;
3385 }
3386
3387
libxsmm_sububmmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3388 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3389 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3390 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3391 {
3392 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3393 libxsmm_descriptor_blob blob;
3394 const libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
3395 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3396 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3397 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3398 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3399 /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3400 return result.sububmro;
3401 }
3402
3403
libxsmm_dmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)3404 LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3405 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3406 const double* alpha, const double* beta, const int* flags, const int* prefetch)
3407 {
3408 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3409 libxsmm_descriptor_blob blob;
3410 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
3411 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3412 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3413 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3414 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3415 /*const*/ libxsmm_xmmfunction result;
3416 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3417 result = libxsmm_xmmdispatch(desc);
3418 return result.dmro;
3419 }
3420
3421
libxsmm_smmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3422 LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3423 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3424 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3425 {
3426 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3427 libxsmm_descriptor_blob blob;
3428 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
3429 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3430 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3431 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3432 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3433 /*const*/ libxsmm_xmmfunction result;
3434 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3435 result = libxsmm_xmmdispatch(desc);
3436 return result.smro;
3437 }
3438
3439
libxsmm_bsmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3440 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3441 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3442 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3443 {
3444 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3445 libxsmm_descriptor_blob blob;
3446 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
3447 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3448 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3449 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3450 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3451 /*const*/ libxsmm_xmmfunction result;
3452 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3453 result = libxsmm_xmmdispatch(desc);
3454 return result.bsmro;
3455 }
3456
3457
libxsmm_bmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3458 LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3459 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3460 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3461 {
3462 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3463 libxsmm_descriptor_blob blob;
3464 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
3465 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3466 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3467 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3468 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3469 /*const*/ libxsmm_xmmfunction result;
3470 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3471 result = libxsmm_xmmdispatch(desc);
3472 return result.bmro;
3473 }
3474
3475
libxsmm_wimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3476 LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3477 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3478 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3479 {
3480 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3481 libxsmm_descriptor_blob blob;
3482 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
3483 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3484 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3485 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3486 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3487 /*const*/ libxsmm_xmmfunction result;
3488 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3489 result = libxsmm_xmmdispatch(desc);
3490 return result.wimro;
3491 }
3492
3493
libxsmm_ssbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3494 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3495 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3496 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3497 {
3498 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3499 libxsmm_descriptor_blob blob;
3500 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3501 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3502 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3503 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3504 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3505 /*const*/ libxsmm_xmmfunction result;
3506 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3507 result = libxsmm_xmmdispatch(desc);
3508 return result.ssbimro;
3509 }
3510
3511
libxsmm_usbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3512 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3513 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3514 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3515 {
3516 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3517 libxsmm_descriptor_blob blob;
3518 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3519 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3520 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3521 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3522 gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3523 /*const*/ libxsmm_xmmfunction result;
3524 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3525 result = libxsmm_xmmdispatch(desc);
3526 return result.usbimro;
3527 }
3528
3529
libxsmm_subimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3530 LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3531 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3532 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3533 {
3534 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3535 libxsmm_descriptor_blob blob;
3536 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3537 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3538 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3539 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3540 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3541 /*const*/ libxsmm_xmmfunction result;
3542 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3543 result = libxsmm_xmmdispatch(desc);
3544 return result.subimro;
3545 }
3546
3547
libxsmm_uubimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3548 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3549 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3550 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3551 {
3552 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3553 libxsmm_descriptor_blob blob;
3554 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3555 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3556 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3557 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3558 gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3559 /*const*/ libxsmm_xmmfunction result;
3560 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3561 result = libxsmm_xmmdispatch(desc);
3562 return result.uubimro;
3563 }
3564
3565
libxsmm_sububmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3566 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3567 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3568 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3569 {
3570 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3571 libxsmm_descriptor_blob blob;
3572 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
3573 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3574 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3575 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3576 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3577 /*const*/ libxsmm_xmmfunction result;
3578 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3579 result = libxsmm_xmmdispatch(desc);
3580 return result.sububmro;
3581 }
3582
3583
libxsmm_dmmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)3584 LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3585 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3586 const double* alpha, const double* beta, const int* flags, const int* prefetch)
3587 {
3588 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3589 libxsmm_descriptor_blob blob;
3590 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
3591 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3592 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3593 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3594 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3595 /*const*/ libxsmm_xmmfunction result;
3596 desc->c1 = (unsigned long long)stride_a;
3597 desc->c2 = (unsigned long long)stride_b;
3598 if ( (stride_a < 0) || (stride_b < 0) ) {
3599 return NULL;
3600 }
3601 result = libxsmm_xmmdispatch(desc);
3602 return result.dmrs;
3603 }
3604
3605
libxsmm_smmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3606 LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3607 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3608 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3609 {
3610 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3611 libxsmm_descriptor_blob blob;
3612 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
3613 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3614 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3615 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3616 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3617 /*const*/ libxsmm_xmmfunction result;
3618 desc->c1 = (unsigned long long)stride_a;
3619 desc->c2 = (unsigned long long)stride_b;
3620 if ( (stride_a < 0) || (stride_b < 0) ) {
3621 return NULL;
3622 }
3623 result = libxsmm_xmmdispatch(desc);
3624 return result.smrs;
3625 }
3626
3627
libxsmm_bsmmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3628 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3629 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3630 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3631 {
3632 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3633 libxsmm_descriptor_blob blob;
3634 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
3635 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3636 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3637 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3638 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3639 /*const*/ libxsmm_xmmfunction result;
3640 desc->c1 = (unsigned long long)stride_a;
3641 desc->c2 = (unsigned long long)stride_b;
3642 if ( (stride_a < 0) || (stride_b < 0) ) {
3643 return NULL;
3644 }
3645 result = libxsmm_xmmdispatch(desc);
3646 return result.bsmrs;
3647 }
3648
3649
libxsmm_bmmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3650 LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3651 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3652 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3653 {
3654 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3655 libxsmm_descriptor_blob blob;
3656 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
3657 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3658 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3659 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3660 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3661 /*const*/ libxsmm_xmmfunction result;
3662 desc->c1 = (unsigned long long)stride_a;
3663 desc->c2 = (unsigned long long)stride_b;
3664 if ( (stride_a < 0) || (stride_b < 0) ) {
3665 return NULL;
3666 }
3667 result = libxsmm_xmmdispatch(desc);
3668 return result.bmrs;
3669 }
3670
3671
libxsmm_wimmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3672 LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3673 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3674 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3675 {
3676 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3677 libxsmm_descriptor_blob blob;
3678 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
3679 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3680 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3681 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3682 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3683 /*const*/ libxsmm_xmmfunction result;
3684 desc->c1 = (unsigned long long)stride_a;
3685 desc->c2 = (unsigned long long)stride_b;
3686 if ( (stride_a < 0) || (stride_b < 0) ) {
3687 return NULL;
3688 }
3689 result = libxsmm_xmmdispatch(desc);
3690 return result.wimrs;
3691 }
3692
3693
libxsmm_ssbimmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3694 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3695 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3696 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3697 {
3698 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3699 libxsmm_descriptor_blob blob;
3700 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3701 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3702 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3703 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3704 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3705 /*const*/ libxsmm_xmmfunction result;
3706 desc->c1 = (unsigned long long)stride_a;
3707 desc->c2 = (unsigned long long)stride_b;
3708 if ( (stride_a < 0) || (stride_b < 0) ) {
3709 return NULL;
3710 }
3711 result = libxsmm_xmmdispatch(desc);
3712 return result.ssbimrs;
3713 }
3714
3715
libxsmm_usbimmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3716 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3717 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3718 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3719 {
3720 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3721 libxsmm_descriptor_blob blob;
3722 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3723 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3724 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3725 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3726 gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3727 /*const*/ libxsmm_xmmfunction result;
3728 desc->c1 = (unsigned long long)stride_a;
3729 desc->c2 = (unsigned long long)stride_b;
3730 if ( (stride_a < 0) || (stride_b < 0) ) {
3731 return NULL;
3732 }
3733 result = libxsmm_xmmdispatch(desc);
3734 return result.usbimrs;
3735 }
3736
3737
libxsmm_subimmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3738 LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3739 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3740 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3741 {
3742 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3743 libxsmm_descriptor_blob blob;
3744 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3745 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3746 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3747 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3748 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3749 /*const*/ libxsmm_xmmfunction result;
3750 desc->c1 = (unsigned long long)stride_a;
3751 desc->c2 = (unsigned long long)stride_b;
3752 if ( (stride_a < 0) || (stride_b < 0) ) {
3753 return NULL;
3754 }
3755 result = libxsmm_xmmdispatch(desc);
3756 return result.subimrs;
3757 }
3758
3759
libxsmm_uubimmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3760 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3761 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3762 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3763 {
3764 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3765 libxsmm_descriptor_blob blob;
3766 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3767 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3768 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3769 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3770 gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3771 /*const*/ libxsmm_xmmfunction result;
3772 desc->c1 = (unsigned long long)stride_a;
3773 desc->c2 = (unsigned long long)stride_b;
3774 if ( (stride_a < 0) || (stride_b < 0) ) {
3775 return NULL;
3776 }
3777 result = libxsmm_xmmdispatch(desc);
3778 return result.uubimrs;
3779 }
3780
3781
libxsmm_sububmmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3782 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3783 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3784 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3785 {
3786 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3787 libxsmm_descriptor_blob blob;
3788 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
3789 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3790 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3791 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3792 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3793 /*const*/ libxsmm_xmmfunction result;
3794 desc->c1 = (unsigned long long)stride_a;
3795 desc->c2 = (unsigned long long)stride_b;
3796 if ( (stride_a < 0) || (stride_b < 0) ) {
3797 return NULL;
3798 }
3799 result = libxsmm_xmmdispatch(desc);
3800 return result.sububmrs;
3801 }
3802
3803
libxsmm_dmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)3804 LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3805 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3806 const double* alpha, const double* beta, const int* flags, const int* prefetch)
3807 {
3808 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3809 libxsmm_descriptor_blob blob;
3810 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
3811 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3812 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3813 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3814 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3815 /*const*/ libxsmm_xmmfunction result;
3816 desc->c1 = (unsigned long long)stride_a;
3817 desc->c2 = (unsigned long long)stride_b;
3818 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3819 if ( (stride_a < 0) || (stride_b < 0) ) {
3820 return NULL;
3821 }
3822 result = libxsmm_xmmdispatch(desc);
3823 return result.dmrs;
3824 }
3825
3826
libxsmm_smmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3827 LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3828 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3829 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3830 {
3831 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3832 libxsmm_descriptor_blob blob;
3833 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
3834 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3835 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3836 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3837 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3838 /*const*/ libxsmm_xmmfunction result;
3839 desc->c1 = (unsigned long long)stride_a;
3840 desc->c2 = (unsigned long long)stride_b;
3841 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3842 if ( (stride_a < 0) || (stride_b < 0) ) {
3843 return NULL;
3844 }
3845 result = libxsmm_xmmdispatch(desc);
3846 return result.smrs;
3847 }
3848
3849
libxsmm_bsmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3850 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3851 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3852 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3853 {
3854 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3855 libxsmm_descriptor_blob blob;
3856 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
3857 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3858 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3859 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3860 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3861 /*const*/ libxsmm_xmmfunction result;
3862 desc->c1 = (unsigned long long)stride_a;
3863 desc->c2 = (unsigned long long)stride_b;
3864 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3865 if ( (stride_a < 0) || (stride_b < 0) ) {
3866 return NULL;
3867 }
3868 result = libxsmm_xmmdispatch(desc);
3869 return result.bsmrs;
3870 }
3871
3872
libxsmm_bmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3873 LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3874 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3875 const float* alpha, const float* beta, const int* flags, const int* prefetch)
3876 {
3877 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3878 libxsmm_descriptor_blob blob;
3879 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
3880 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3881 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3882 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3883 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3884 /*const*/ libxsmm_xmmfunction result;
3885 desc->c1 = (unsigned long long)stride_a;
3886 desc->c2 = (unsigned long long)stride_b;
3887 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3888 if ( (stride_a < 0) || (stride_b < 0) ) {
3889 return NULL;
3890 }
3891 result = libxsmm_xmmdispatch(desc);
3892 return result.bmrs;
3893 }
3894
3895
libxsmm_wimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3896 LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3897 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3898 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3899 {
3900 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3901 libxsmm_descriptor_blob blob;
3902 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
3903 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3904 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3905 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3906 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3907 /*const*/ libxsmm_xmmfunction result;
3908 desc->c1 = (unsigned long long)stride_a;
3909 desc->c2 = (unsigned long long)stride_b;
3910 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3911 if ( (stride_a < 0) || (stride_b < 0) ) {
3912 return NULL;
3913 }
3914 result = libxsmm_xmmdispatch(desc);
3915 return result.wimrs;
3916 }
3917
3918
libxsmm_ssbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3919 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3920 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3921 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3922 {
3923 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3924 libxsmm_descriptor_blob blob;
3925 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3926 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3927 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3928 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3929 gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3930 /*const*/ libxsmm_xmmfunction result;
3931 desc->c1 = (unsigned long long)stride_a;
3932 desc->c2 = (unsigned long long)stride_b;
3933 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3934 if ( (stride_a < 0) || (stride_b < 0) ) {
3935 return NULL;
3936 }
3937 result = libxsmm_xmmdispatch(desc);
3938 return result.ssbimrs;
3939 }
3940
3941
libxsmm_usbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3942 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3943 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3944 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3945 {
3946 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3947 libxsmm_descriptor_blob blob;
3948 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3949 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3950 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3951 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3952 gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3953 /*const*/ libxsmm_xmmfunction result;
3954 desc->c1 = (unsigned long long)stride_a;
3955 desc->c2 = (unsigned long long)stride_b;
3956 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3957 if ( (stride_a < 0) || (stride_b < 0) ) {
3958 return NULL;
3959 }
3960 result = libxsmm_xmmdispatch(desc);
3961 return result.usbimrs;
3962 }
3963
3964
libxsmm_subimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3965 LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3966 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3967 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3968 {
3969 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3970 libxsmm_descriptor_blob blob;
3971 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3972 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3973 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3974 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3975 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3976 /*const*/ libxsmm_xmmfunction result;
3977 desc->c1 = (unsigned long long)stride_a;
3978 desc->c2 = (unsigned long long)stride_b;
3979 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3980 if ( (stride_a < 0) || (stride_b < 0) ) {
3981 return NULL;
3982 }
3983 result = libxsmm_xmmdispatch(desc);
3984 return result.subimrs;
3985 }
3986
3987
libxsmm_uubimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3988 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3989 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3990 const int* alpha, const int* beta, const int* flags, const int* prefetch)
3991 {
3992 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3993 libxsmm_descriptor_blob blob;
3994 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3995 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3996 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3997 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3998 gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3999 /*const*/ libxsmm_xmmfunction result;
4000 desc->c1 = (unsigned long long)stride_a;
4001 desc->c2 = (unsigned long long)stride_b;
4002 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
4003 if ( (stride_a < 0) || (stride_b < 0) ) {
4004 return NULL;
4005 }
4006 result = libxsmm_xmmdispatch(desc);
4007 return result.uubimrs;
4008 }
4009
4010
libxsmm_sububmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)4011 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
4012 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
4013 const int* alpha, const int* beta, const int* flags, const int* prefetch)
4014 {
4015 const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
4016 libxsmm_descriptor_blob blob;
4017 /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
4018 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
4019 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
4020 NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
4021 gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
4022 /*const*/ libxsmm_xmmfunction result;
4023 desc->c1 = (unsigned long long)stride_a;
4024 desc->c2 = (unsigned long long)stride_b;
4025 desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
4026 if ( (stride_a < 0) || (stride_b < 0) ) {
4027 return NULL;
4028 }
4029 result = libxsmm_xmmdispatch(desc);
4030 return result.sububmrs;
4031 }
4032
4033
libxsmm_dispatch_mcopy(const libxsmm_mcopy_descriptor * descriptor)4034 LIBXSMM_API libxsmm_xmcopyfunction libxsmm_dispatch_mcopy(const libxsmm_mcopy_descriptor* descriptor)
4035 {
4036 libxsmm_xmcopyfunction result;
4037 LIBXSMM_INIT /* verbosity */
4038 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4039 LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4040 #endif
4041 if (NULL != descriptor) {
4042 libxsmm_descriptor wrap;
4043 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4044 LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4045 #endif
4046 LIBXSMM_ASSIGN127(&wrap.mcopy.desc, descriptor);
4047 wrap.kind = LIBXSMM_KERNEL_KIND_MCOPY;
4048 #if defined(_WIN32) || defined(__CYGWIN__)
4049 wrap.mcopy.desc.prefetch = 0;
4050 #endif
4051 result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xmatcopy;
4052 }
4053 else {
4054 result = NULL;
4055 }
4056 return result;
4057 }
4058
4059
libxsmm_dispatch_meltw(const libxsmm_meltw_descriptor * descriptor)4060 LIBXSMM_API libxsmm_xmeltwfunction libxsmm_dispatch_meltw(const libxsmm_meltw_descriptor* descriptor)
4061 {
4062 libxsmm_xmeltwfunction result;
4063 LIBXSMM_INIT /* verbosity */
4064 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4065 LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4066 #endif
4067 if (NULL != descriptor) {
4068 libxsmm_descriptor wrap;
4069 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4070 LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4071 #endif
4072 LIBXSMM_ASSIGN127(&wrap.meltw.desc, descriptor);
4073 wrap.kind = LIBXSMM_KERNEL_KIND_MELTW;
4074 result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xmateltw;
4075 }
4076 else {
4077 result.xmeltw = NULL;
4078 }
4079 return result;
4080 }
4081
4082
libxsmm_dispatch_meltw_copy(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4083 LIBXSMM_API libxsmm_meltwfunction_copy libxsmm_dispatch_meltw_copy(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4084 libxsmm_descriptor_blob blob;
4085 const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4086 in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4087 0, LIBXSMM_MELTW_OPERATION_COPY);
4088
4089 libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4090
4091 return result.meltw_copy;
4092 }
4093
4094
libxsmm_dispatch_meltw_zero(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4095 LIBXSMM_API libxsmm_meltwfunction_zero libxsmm_dispatch_meltw_zero(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4096 libxsmm_descriptor_blob blob;
4097 const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4098 in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4099 0, LIBXSMM_MELTW_OPERATION_ZERO);
4100
4101 libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4102
4103 return result.meltw_zero;
4104 }
4105
4106
libxsmm_dispatch_meltw_add(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4107 LIBXSMM_API libxsmm_meltwfunction_add libxsmm_dispatch_meltw_add(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4108 libxsmm_descriptor_blob blob;
4109 const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4110 in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4111 0, LIBXSMM_MELTW_OPERATION_ADD);
4112
4113 libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4114
4115 return result.meltw_add;
4116 }
4117
4118
libxsmm_dispatch_meltw_mul(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4119 LIBXSMM_API libxsmm_meltwfunction_mul libxsmm_dispatch_meltw_mul(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4120 libxsmm_descriptor_blob blob;
4121 const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4122 in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4123 0, LIBXSMM_MELTW_OPERATION_MUL);
4124
4125 libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4126
4127 return result.meltw_mul;
4128 }
4129
4130
libxsmm_dispatch_meltw_relu(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4131 LIBXSMM_API libxsmm_meltwfunction_relu libxsmm_dispatch_meltw_relu(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4132 libxsmm_descriptor_blob blob;
4133 const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4134 in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4135 0, LIBXSMM_MELTW_OPERATION_RELU);
4136
4137 libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4138
4139 return result.meltw_relu;
4140 }
4141
4142
libxsmm_dispatch_meltw_cvtfp32bf16(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4143 LIBXSMM_API libxsmm_meltwfunction_cvtfp32bf16 libxsmm_dispatch_meltw_cvtfp32bf16(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4144 libxsmm_descriptor_blob blob;
4145 const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4146 in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4147 0, LIBXSMM_MELTW_OPERATION_CVTFP32BF16);
4148
4149 libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4150
4151 return result.meltw_cvtfp32bf16;
4152 }
4153
4154
libxsmm_dispatch_meltw_cvtfp32bf16_act(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type,libxsmm_meltw_cvta_flags flags)4155 LIBXSMM_API libxsmm_meltwfunction_cvtfp32bf16_act libxsmm_dispatch_meltw_cvtfp32bf16_act(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_cvta_flags flags) {
4156 libxsmm_descriptor_blob blob;
4157 const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4158 in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4159 libxsmm_get_meltw_comp_cvta_flags( flags ), LIBXSMM_MELTW_OPERATION_CVTFP32BF16_ACT);
4160
4161 libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4162
4163 return result.meltw_cvtfp32bf16_act;
4164 }
4165
libxsmm_dispatch_meltw_act_cvtfp32bf16(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type,libxsmm_meltw_acvt_flags flags)4166 LIBXSMM_API libxsmm_meltwfunction_act_cvtfp32bf16 libxsmm_dispatch_meltw_act_cvtfp32bf16(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_acvt_flags flags) {
4167 libxsmm_descriptor_blob blob;
4168 const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4169 in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4170 libxsmm_get_meltw_comp_acvt_flags( flags ), LIBXSMM_MELTW_OPERATION_ACT_CVTFP32BF16);
4171
4172 libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4173
4174 return result.meltw_act_cvtfp32bf16;
4175 }
4176
libxsmm_dispatch_meltw_reduce(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type,libxsmm_meltw_redu_flags flags)4177 LIBXSMM_API libxsmm_meltwfunction_reduce libxsmm_dispatch_meltw_reduce(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_redu_flags flags) {
4178 libxsmm_descriptor_blob blob;
4179 const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4180 in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4181 libxsmm_get_meltw_comp_redu_flags( flags ), LIBXSMM_MELTW_OPERATION_REDUCE);
4182
4183 libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4184
4185 return result.meltw_reduce;
4186 }
4187
4188
libxsmm_dispatch_meltw_scale(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type,libxsmm_meltw_scal_flags flags)4189 LIBXSMM_API libxsmm_meltwfunction_scale libxsmm_dispatch_meltw_scale(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_scal_flags flags) {
4190 libxsmm_descriptor_blob blob;
4191 const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4192 in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4193 libxsmm_get_meltw_comp_scal_flags( flags ), LIBXSMM_MELTW_OPERATION_SCALE);
4194
4195 libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4196
4197 return result.meltw_scale;
4198 }
4199
4200
libxsmm_dispatch_trans(const libxsmm_trans_descriptor * descriptor)4201 LIBXSMM_API libxsmm_xtransfunction libxsmm_dispatch_trans(const libxsmm_trans_descriptor* descriptor)
4202 {
4203 libxsmm_xtransfunction result;
4204 LIBXSMM_INIT /* verbosity */
4205 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4206 LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4207 #endif
4208 if (NULL != descriptor) {
4209 libxsmm_descriptor wrap;
4210 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4211 LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4212 #endif
4213 LIBXSMM_ASSIGN127(&wrap.trans.desc, descriptor);
4214 wrap.kind = LIBXSMM_KERNEL_KIND_TRANS;
4215 result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xtrans;
4216 }
4217 else {
4218 result = NULL;
4219 }
4220 return result;
4221 }
4222
4223
libxsmm_dispatch_pgemm(const libxsmm_pgemm_descriptor * descriptor)4224 LIBXSMM_API libxsmm_pgemm_xfunction libxsmm_dispatch_pgemm(const libxsmm_pgemm_descriptor* descriptor)
4225 {
4226 libxsmm_trmm_xfunction result;
4227 LIBXSMM_INIT /* verbosity */
4228 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4229 LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4230 #endif
4231 if (NULL != descriptor) {
4232 libxsmm_descriptor wrap;
4233 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4234 LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4235 #endif
4236 LIBXSMM_ASSIGN127(&wrap.pgemm.desc, descriptor);
4237 wrap.kind = LIBXSMM_KERNEL_KIND_PGEMM;
4238 result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xpgemm;
4239 }
4240 else {
4241 result = NULL;
4242 }
4243 return result;
4244 }
4245
4246
libxsmm_dispatch_getrf(const libxsmm_getrf_descriptor * descriptor)4247 LIBXSMM_API libxsmm_getrf_xfunction libxsmm_dispatch_getrf(const libxsmm_getrf_descriptor* descriptor)
4248 {
4249 libxsmm_trmm_xfunction result;
4250 LIBXSMM_INIT /* verbosity */
4251 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4252 LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4253 #endif
4254 if (NULL != descriptor) {
4255 libxsmm_descriptor wrap;
4256 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4257 LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4258 #endif
4259 LIBXSMM_ASSIGN127(&wrap.getrf.desc, descriptor);
4260 wrap.kind = LIBXSMM_KERNEL_KIND_GETRF;
4261 result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xgetrf;
4262 }
4263 else {
4264 result = NULL;
4265 }
4266 return result;
4267 }
4268
4269
libxsmm_dispatch_trmm(const libxsmm_trmm_descriptor * descriptor)4270 LIBXSMM_API libxsmm_trmm_xfunction libxsmm_dispatch_trmm(const libxsmm_trmm_descriptor* descriptor)
4271 {
4272 libxsmm_trmm_xfunction result;
4273 LIBXSMM_INIT /* verbosity */
4274 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4275 LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4276 #endif
4277 if (NULL != descriptor) {
4278 libxsmm_descriptor wrap;
4279 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4280 LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4281 #endif
4282 LIBXSMM_ASSIGN127(&wrap.trmm.desc, descriptor);
4283 wrap.kind = LIBXSMM_KERNEL_KIND_TRMM;
4284 result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xtrmm;
4285 }
4286 else {
4287 result = NULL;
4288 }
4289 return result;
4290 }
4291
4292
libxsmm_dispatch_trsm(const libxsmm_trsm_descriptor * descriptor)4293 LIBXSMM_API libxsmm_trsm_xfunction libxsmm_dispatch_trsm(const libxsmm_trsm_descriptor* descriptor)
4294 {
4295 libxsmm_trsm_xfunction result;
4296 LIBXSMM_INIT /* verbosity */
4297 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4298 LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4299 #endif
4300 if (NULL != descriptor) {
4301 libxsmm_descriptor wrap;
4302 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4303 LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4304 #endif
4305 LIBXSMM_ASSIGN127(&wrap.trsm.desc, descriptor);
4306 wrap.kind = LIBXSMM_KERNEL_KIND_TRSM;
4307 result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xtrsm;
4308 }
4309 else {
4310 result = NULL;
4311 }
4312 return result;
4313 }
4314
4315
libxsmm_create_xcsr_soa(const libxsmm_gemm_descriptor * descriptor,const unsigned int * row_ptr,const unsigned int * column_idx,const void * values,unsigned int packed_width)4316 LIBXSMM_API libxsmm_xmmfunction libxsmm_create_xcsr_soa(const libxsmm_gemm_descriptor* descriptor,
4317 const unsigned int* row_ptr, const unsigned int* column_idx, const void* values, unsigned int packed_width)
4318 {
4319 libxsmm_code_pointer result = { 0 };
4320 LIBXSMM_INIT
4321 if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) {
4322 libxsmm_csr_soa_descriptor srsoa;
4323 libxsmm_build_request request;
4324 libxsmm_gemm_descriptor desc;
4325 if (0 == (0x80 & descriptor->prefetch)) {
4326 srsoa.gemm = descriptor;
4327 }
4328 else { /* "sign"-bit of byte-value is set */
4329 LIBXSMM_ASSIGN127(&desc, descriptor);
4330 desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4331 srsoa.gemm = &desc;
4332 }
4333 srsoa.row_ptr = row_ptr;
4334 srsoa.column_idx = column_idx;
4335 srsoa.values = values;
4336 srsoa.packed_width = packed_width;
4337 request.descriptor.srsoa = &srsoa;
4338 request.kind = LIBXSMM_BUILD_KIND_SRSOA;
4339 libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4340 }
4341 return result.xgemm;
4342 }
4343
4344
libxsmm_create_xcsc_soa(const libxsmm_gemm_descriptor * descriptor,const unsigned int * column_ptr,const unsigned int * row_idx,const void * values,unsigned int packed_width)4345 LIBXSMM_API libxsmm_xmmfunction libxsmm_create_xcsc_soa(const libxsmm_gemm_descriptor* descriptor,
4346 const unsigned int* column_ptr, const unsigned int* row_idx, const void* values, unsigned int packed_width)
4347 {
4348 libxsmm_code_pointer result = { 0 };
4349 LIBXSMM_INIT
4350 if (NULL != descriptor && NULL != column_ptr && NULL != row_idx && NULL != values) {
4351 libxsmm_csc_soa_descriptor scsoa;
4352 libxsmm_build_request request;
4353 libxsmm_gemm_descriptor desc;
4354 if (0 == (0x80 & descriptor->prefetch)) {
4355 scsoa.gemm = descriptor;
4356 }
4357 else { /* "sign"-bit of byte-value is set */
4358 LIBXSMM_ASSIGN127(&desc, descriptor);
4359 desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4360 scsoa.gemm = &desc;
4361 }
4362 scsoa.column_ptr = column_ptr;
4363 scsoa.row_idx = row_idx;
4364 scsoa.values = values;
4365 scsoa.packed_width = packed_width;
4366 request.descriptor.scsoa = &scsoa;
4367 request.kind = LIBXSMM_BUILD_KIND_SCSOA;
4368 libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4369 }
4370 return result.xgemm;
4371 }
4372
4373
libxsmm_create_pgemm_ac_rm(const libxsmm_gemm_descriptor * descriptor,unsigned int packed_width)4374 LIBXSMM_API libxsmm_xmmfunction libxsmm_create_pgemm_ac_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width)
4375 {
4376 libxsmm_code_pointer result = { 0 };
4377 LIBXSMM_INIT
4378 if (NULL != descriptor) {
4379 libxsmm_pgemm_ac_rm_descriptor pgemmacrm;
4380 libxsmm_build_request request;
4381 libxsmm_gemm_descriptor desc;
4382 if (0 == (0x80 & descriptor->prefetch)) {
4383 pgemmacrm.gemm = descriptor;
4384 }
4385 else { /* "sign"-bit of byte-value is set */
4386 LIBXSMM_ASSIGN127(&desc, descriptor);
4387 desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4388 pgemmacrm.gemm = &desc;
4389 }
4390 pgemmacrm.packed_width = packed_width;
4391 request.descriptor.pgemmacrm = &pgemmacrm;
4392 request.kind = LIBXSMM_BUILD_KIND_PGEMMRMAC;
4393 libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4394 }
4395 return result.xgemm;
4396 }
4397
4398
libxsmm_create_pgemm_bc_rm(const libxsmm_gemm_descriptor * descriptor,unsigned int packed_width)4399 LIBXSMM_API libxsmm_xmmfunction libxsmm_create_pgemm_bc_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width)
4400 {
4401 libxsmm_code_pointer result = { 0 };
4402 LIBXSMM_INIT
4403 if (NULL != descriptor) {
4404 libxsmm_pgemm_bc_rm_descriptor pgemmbcrm;
4405 libxsmm_build_request request;
4406 libxsmm_gemm_descriptor desc;
4407 if (0 == (0x80 & descriptor->prefetch)) {
4408 pgemmbcrm.gemm = descriptor;
4409 }
4410 else { /* "sign"-bit of byte-value is set */
4411 LIBXSMM_ASSIGN127(&desc, descriptor);
4412 desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4413 pgemmbcrm.gemm = &desc;
4414 }
4415 pgemmbcrm.packed_width = packed_width;
4416 request.descriptor.pgemmbcrm = &pgemmbcrm;
4417 request.kind = LIBXSMM_BUILD_KIND_PGEMMRMBC;
4418 libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4419 }
4420 return result.xgemm;
4421 }
4422
4423
libxsmm_create_dcsr_reg(const libxsmm_gemm_descriptor * descriptor,const unsigned int * row_ptr,const unsigned int * column_idx,const double * values)4424 LIBXSMM_API libxsmm_dmmfunction libxsmm_create_dcsr_reg(const libxsmm_gemm_descriptor* descriptor,
4425 const unsigned int* row_ptr, const unsigned int* column_idx, const double* values)
4426 {
4427 libxsmm_code_pointer result = { 0 };
4428 LIBXSMM_INIT
4429 if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) {
4430 libxsmm_csr_reg_descriptor sreg;
4431 libxsmm_build_request request;
4432 libxsmm_gemm_descriptor desc;
4433 if (0 == (0x80 & descriptor->prefetch)) {
4434 sreg.gemm = descriptor;
4435 }
4436 else { /* "sign"-bit of byte-value is set */
4437 LIBXSMM_ASSIGN127(&desc, descriptor);
4438 desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4439 sreg.gemm = &desc;
4440 }
4441 sreg.row_ptr = row_ptr;
4442 sreg.column_idx = column_idx;
4443 sreg.values = values;
4444 request.descriptor.sreg = &sreg;
4445 request.kind = LIBXSMM_BUILD_KIND_SREG;
4446 libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4447 }
4448 return result.xgemm.dmm;
4449 }
4450
4451
libxsmm_create_scsr_reg(const libxsmm_gemm_descriptor * descriptor,const unsigned int * row_ptr,const unsigned int * column_idx,const float * values)4452 LIBXSMM_API libxsmm_smmfunction libxsmm_create_scsr_reg(const libxsmm_gemm_descriptor* descriptor,
4453 const unsigned int* row_ptr, const unsigned int* column_idx, const float* values)
4454 {
4455 libxsmm_code_pointer result = { 0 };
4456 LIBXSMM_INIT
4457 if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) {
4458 libxsmm_csr_reg_descriptor sreg;
4459 libxsmm_build_request request;
4460 const unsigned int n = row_ptr[descriptor->m];
4461 double *const d_values = (double*)(0 != n ? malloc(n * sizeof(double)) : NULL);
4462 if (NULL != d_values) {
4463 libxsmm_gemm_descriptor desc;
4464 unsigned int i;
4465 /* we need to copy the values into a double precision buffer */
4466 for (i = 0; i < n; ++i) d_values[i] = (double)values[i];
4467 if (0 == (0x80 & descriptor->prefetch)) {
4468 sreg.gemm = descriptor;
4469 }
4470 else { /* "sign"-bit of byte-value is set */
4471 LIBXSMM_ASSIGN127(&desc, descriptor);
4472 desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4473 sreg.gemm = &desc;
4474 }
4475 sreg.row_ptr = row_ptr;
4476 sreg.column_idx = column_idx;
4477 sreg.values = d_values;
4478 request.descriptor.sreg = &sreg;
4479 request.kind = LIBXSMM_BUILD_KIND_SREG;
4480 libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4481 free(d_values);
4482 }
4483 }
4484 return result.xgemm.smm;
4485 }
4486
4487
libxsmm_release_kernel(const void * kernel)4488 LIBXSMM_API void libxsmm_release_kernel(const void* kernel)
4489 {
4490 if (NULL != kernel) {
4491 static int error_once = 0;
4492 libxsmm_kernel_xinfo* extra = NULL;
4493 void *const extra_address = &extra;
4494 LIBXSMM_INIT
4495 if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(kernel, NULL/*size*/, NULL/*flags*/, (void**)extra_address) && NULL != extra) {
4496 const unsigned int regindex = extra->registered;
4497 if ((LIBXSMM_CAPACITY_REGISTRY) <= regindex) {
4498 libxsmm_xfree(kernel, 0/*no check*/);
4499 }
4500 else { /* attempt to unregister kernel */
4501 libxsmm_kernel_info info;
4502 #if !defined(LIBXSMM_ENABLE_DEREG)
4503 if (EXIT_SUCCESS == libxsmm_get_kernel_info(kernel, &info)
4504 && LIBXSMM_KERNEL_KIND_USER == info.kind)
4505 #endif
4506 {
4507 LIBXSMM_ASSERT(LIBXSMM_KERNEL_UNREGISTERED > info.kind);
4508 /* coverity[check_return] */
4509 LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_RELAXED); /* invalidate code cache (TLS) */
4510 internal_registry[regindex].ptr = NULL;
4511 #if !defined(NDEBUG)
4512 LIBXSMM_MEMZERO127(internal_registry_keys + regindex);
4513 #endif
4514 libxsmm_xfree(kernel, 0/*no check*/);
4515 }
4516 #if !defined(LIBXSMM_ENABLE_DEREG)
4517 else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4518 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4519 {
4520 fprintf(stderr, "LIBXSMM WARNING: attempt to unregister JIT-kernel!\n");
4521 }
4522 #endif
4523 }
4524 }
4525 else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4526 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4527 {
4528 fprintf(stderr, "LIBXSMM ERROR: failed to release kernel!\n");
4529 }
4530 }
4531 }
4532
4533
4534 #if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))
4535
4536 /* implementation provided for Fortran 77 compatibility */
4537 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_init)(void);
LIBXSMM_FSYMBOL(libxsmm_init)4538 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_init)(void)
4539 {
4540 libxsmm_init();
4541 }
4542
4543
4544 /* implementation provided for Fortran 77 compatibility */
4545 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_finalize)(void);
LIBXSMM_FSYMBOL(libxsmm_finalize)4546 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_finalize)(void)
4547 {
4548 libxsmm_finalize();
4549 }
4550
4551
4552 /* implementation provided for Fortran 77 compatibility */
4553 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_release_kernel)(const void** /*kernel*/);
LIBXSMM_FSYMBOL(libxsmm_release_kernel)4554 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_release_kernel)(const void** kernel)
4555 {
4556 #if !defined(NDEBUG)
4557 if (NULL != kernel)
4558 #endif
4559 {
4560 libxsmm_release_kernel(*kernel);
4561 }
4562 #if !defined(NDEBUG)
4563 else {
4564 static int error_once = 0;
4565 if (0 != libxsmm_verbosity /* library code is expected to be mute */
4566 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4567 {
4568 fprintf(stderr, "LIBXSMM ERROR: invalid argument passed into libxsmm_release_kernel!\n");
4569 }
4570 }
4571 #endif
4572 }
4573
4574
4575 /* implementation provided for Fortran 77 compatibility */
4576 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(intptr_t* /*fn*/, const int* /*iprec*/, const int* /*oprec*/,
4577 const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*k*/,
4578 const libxsmm_blasint* /*lda*/, const libxsmm_blasint* /*ldb*/, const libxsmm_blasint* /*ldc*/,
4579 const void* /*alpha*/, const void* /*beta*/, const int* /*flags*/, const int* /*prefetch*/);
LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)4580 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(intptr_t* fn, const int* iprec, const int* oprec,
4581 const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
4582 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
4583 const void* alpha, const void* beta, const int* flags, const int* prefetch)
4584 {
4585 #if !defined(NDEBUG)
4586 if (NULL != fn && NULL != m
4587 && (NULL == iprec || (0 <= *iprec && *iprec < LIBXSMM_DATATYPE_UNSUPPORTED))
4588 && (NULL == oprec || (0 <= *oprec && *oprec < LIBXSMM_DATATYPE_UNSUPPORTED)))
4589 #endif
4590 {
4591 const int gemm_flags = (NULL != flags ? *flags : LIBXSMM_FLAGS);
4592 const libxsmm_gemm_descriptor* descriptor;
4593 libxsmm_gemm_prefetch_type gemm_prefetch;
4594 libxsmm_descriptor_blob blob;
4595 libxsmm_code_pointer result;
4596 #if !defined(NDEBUG)
4597 const libxsmm_gemm_precision itype = (NULL != iprec ? ((libxsmm_gemm_precision)*iprec) : LIBXSMM_GEMM_PRECISION_F64);
4598 const libxsmm_gemm_precision otype = (NULL != oprec ? ((libxsmm_gemm_precision)*oprec) : itype);
4599 const libxsmm_blasint kk = *(NULL != k ? k : m), nn = (NULL != n ? *n : kk);
4600 #else
4601 const libxsmm_gemm_precision itype = (libxsmm_gemm_precision)*iprec, otype = (libxsmm_gemm_precision)*oprec;
4602 const libxsmm_blasint kk = *k, nn = *n;
4603 #endif
4604 LIBXSMM_PRAGMA_FORCEINLINE
4605 gemm_prefetch = libxsmm_get_gemm_xprefetch(prefetch);
4606 LIBXSMM_PRAGMA_FORCEINLINE
4607 descriptor = libxsmm_gemm_descriptor_init2(&blob, itype, otype, *m, nn, kk,
4608 NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? *m : kk),
4609 NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? kk : nn),
4610 *(NULL != ldc ? ldc : m), alpha, beta, gemm_flags, gemm_prefetch);
4611 #if !defined(NDEBUG)
4612 if (NULL != descriptor)
4613 #endif
4614 {
4615 LIBXSMM_PRAGMA_FORCEINLINE
4616 result.xgemm = libxsmm_xmmdispatch(descriptor);
4617 *fn = result.ival;
4618 }
4619 #if !defined(NDEBUG)
4620 else { /* quiet */
4621 *fn = 0;
4622 }
4623 #endif
4624 }
4625 #if !defined(NDEBUG)
4626 else {
4627 static int error_once = 0;
4628 if (0 != libxsmm_verbosity /* library code is expected to be mute */
4629 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4630 {
4631 fprintf(stderr, "LIBXSMM ERROR: invalid argument passed into libxsmm_xmmdispatch!\n");
4632 }
4633 if (NULL != fn) *fn = 0;
4634 }
4635 #endif
4636 }
4637
4638
4639 /* implementation provided for Fortran 77 compatibility */
4640 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch)(intptr_t* /*fn*/, const int* /*precision*/,
4641 const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*k*/,
4642 const libxsmm_blasint* /*lda*/, const libxsmm_blasint* /*ldb*/, const libxsmm_blasint* /*ldc*/,
4643 const void* /*alpha*/, const void* /*beta*/, const int* /*flags*/, const int* /*prefetch*/);
LIBXSMM_FSYMBOL(libxsmm_xmmdispatch)4644 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch)(intptr_t* fn, const int* precision,
4645 const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
4646 const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
4647 const void* alpha, const void* beta, const int* flags, const int* prefetch)
4648 {
4649 LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(fn, precision, precision, m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch);
4650 }
4651
4652
4653 /* implementation provided for Fortran 77 compatibility */
4654 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_abc)(
4655 const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/);
LIBXSMM_FSYMBOL(libxsmm_xmmcall_abc)4656 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_abc)(
4657 const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c)
4658 {
4659 #if !defined(NDEBUG)
4660 static int error_once = 0;
4661 if (NULL != fn && NULL != a && NULL != b && NULL != c)
4662 #endif
4663 {
4664 #if !defined(NDEBUG)
4665 if (NULL != fn->xmm)
4666 #endif
4667 {
4668 fn->xmm(a, b, c);
4669 }
4670 #if !defined(NDEBUG)
4671 else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4672 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4673 {
4674 fprintf(stderr, "LIBXSMM ERROR: NULL-function passed into libxsmm_xmmcall_abc!\n");
4675 }
4676 #endif
4677 }
4678 #if !defined(NDEBUG)
4679 else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4680 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4681 {
4682 fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xmmcall_abc specified!\n");
4683 }
4684 #endif
4685 }
4686
4687
4688 /* implementation provided for Fortran 77 compatibility */
4689 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)(
4690 const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/,
4691 const void* /*pa*/, const void* /*pb*/, const void* /*pc*/);
LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)4692 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)(
4693 const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c,
4694 const void* pa, const void* pb, const void* pc)
4695 {
4696 #if !defined(NDEBUG)
4697 static int error_once = 0;
4698 if (NULL != fn && NULL != a && NULL != b && NULL != c)
4699 #endif
4700 {
4701 #if !defined(NDEBUG)
4702 if (NULL != fn->xmm)
4703 #endif
4704 {
4705 fn->xmm(a, b, c, pa, pb, pc);
4706 }
4707 #if !defined(NDEBUG)
4708 else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4709 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4710 {
4711 fprintf(stderr, "LIBXSMM ERROR: NULL-function passed into libxsmm_xmmcall_prf!\n");
4712 }
4713 #endif
4714 }
4715 #if !defined(NDEBUG)
4716 else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4717 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4718 {
4719 fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xmmcall_prf specified!\n");
4720 }
4721 #endif
4722 }
4723
4724
4725 /* implementation provided for Fortran 77 compatibility */
4726 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall)(
4727 const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/,
4728 const void* /*pa*/, const void* /*pb*/, const void* /*pc*/);
LIBXSMM_FSYMBOL(libxsmm_xmmcall)4729 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall)(
4730 const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c,
4731 const void* pa, const void* pb, const void* pc)
4732 {
4733 LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)(fn, a, b, c, pa, pb, pc);
4734 }
4735
4736
4737 /* implementation provided for Fortran 77 compatibility */
4738 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xregister)(void** /*regval*/,
4739 const void* /*key*/, const int* /*keysize*/, const int* /*valsize*/, const void* /*valinit*/);
LIBXSMM_FSYMBOL(libxsmm_xregister)4740 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xregister)(void** regval,
4741 const void* key, const int* keysize, const int* valsize, const void* valinit)
4742 {
4743 #if !defined(NDEBUG)
4744 static int error_once = 0;
4745 if (NULL != regval && NULL != key && NULL != keysize && NULL != valsize)
4746 #endif
4747 {
4748 *regval = libxsmm_xregister(key, *keysize, *valsize, valinit);
4749 }
4750 #if !defined(NDEBUG)
4751 else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4752 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4753 {
4754 fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xregister specified!\n");
4755 }
4756 #endif
4757 }
4758
4759
4760 /* implementation provided for Fortran 77 compatibility */
4761 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdispatch)(void** /*regval*/, const void* /*key*/, const int* /*keysize*/);
LIBXSMM_FSYMBOL(libxsmm_xdispatch)4762 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdispatch)(void** regval, const void* key, const int* keysize)
4763 {
4764 #if !defined(NDEBUG)
4765 static int error_once = 0;
4766 if (NULL != regval && NULL != key && NULL != keysize)
4767 #endif
4768 {
4769 *regval = libxsmm_xdispatch(key, *keysize);
4770 }
4771 #if !defined(NDEBUG)
4772 else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4773 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4774 {
4775 fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xdispatch specified!\n");
4776 }
4777 #endif
4778 }
4779
4780
4781 /* implementation provided for Fortran 77 compatibility */
4782 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xrelease)(const void* /*key*/, const int* /*keysize*/);
LIBXSMM_FSYMBOL(libxsmm_xrelease)4783 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xrelease)(const void* key, const int* keysize)
4784 {
4785 #if !defined(NDEBUG)
4786 static int error_once = 0;
4787 if (NULL != key && NULL != keysize)
4788 #endif
4789 {
4790 libxsmm_xrelease(key, *keysize);
4791 }
4792 #if !defined(NDEBUG)
4793 else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4794 && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4795 {
4796 fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xrelease specified!\n");
4797 }
4798 #endif
4799 }
4800
4801 #endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/
4802
4803