1 /* ************************************************************************ 2 * Copyright 2013 Advanced Micro Devices, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 * ************************************************************************/ 16 17 18 /* 19 * Something specific for BLAS generators 20 * 21 * NOTE: 22 * 1) All the blas kernel generators should 23 * perceive fields of the SubproblemDim 24 * structure as following: 25 * 'y' - rows of matrix A, i. e. M dimension 26 * of matrix C 27 * 'x' - columns of matrix B and C 28 * 'bwidth' - block width in K dimension 29 * 30 * 2) At generating copying functions and their calls one should 31 * keep in mind, all the matrix blocks are copied in 32 * the local memory such that sequentially accessed elements 33 * are located in memory sequentially. In this context 34 * transposing is perceived as transposing at copying 35 * to/from the local memory, not matrix storage way in 36 * the array passed to kernel. 37 */ 38 39 #ifndef BLAS_KGEN_H_ 40 #define BLAS_KGEN_H_ 41 42 #include <clBLAS.h> 43 44 #include <cltypes.h> 45 #include <kerngen.h> 46 #include <mempat.h> 47 #include <dblock_kgen.h> 48 49 #include <blas_funcs.h> 50 #include <matrix_props.h> 51 52 #include "tile.h" 53 #include "fetch.h" 54 55 #define BLAS_KGEN_FORMAT 1 56 57 #define genInternalLoopEnd(ctx) kgenEndBranch(ctx, NULL) 58 59 enum { 60 MAX_OPENCL_VECTOR_LENGTH = 16 61 }; 62 63 typedef enum TailFetch { 64 FETCH_NO_TAILS = 0, 65 FETCH_TAIL_ROW = 0x01, 66 FETCH_TAIL_COL = 0x02 67 } TailFetch; 68 69 /** 70 * @internal 71 * @brief Blas generator flags 72 * @ingroup GEN_SETTINGS 73 */ 74 typedef enum BlasGenFlags { 75 BGF_EXPLICIT_INLINE = 0x01, 76 BGF_DISTINCT_VECLEN = 0x02, 77 // TODO: replace with a flags with inverse semantics 78 BGF_WHOLE_A = 0x04, 79 /** Leading dimension are in vectors rather than in elements */ 80 BGF_LD_IN_VECTORS = 0x08, 81 /** 82 * Objects in the global memory are accessed through the unified pointers. 83 * This feature is deprecated and should be not used in new generators. 84 * It is left for backward compatibility 85 */ 86 BGF_UPTRS = 0x10 87 } BlasGenFlags; 88 89 /** 90 * @internal 91 * @brief Flags showing how problem tails are handled 92 * @ingroup TAILS_HANDLING 93 */ 94 typedef enum TailStatus { 95 /** Tail of the matrix A is raised */ 96 TAIL_A_RAISED = 0x01, 97 /** Tail of the matrix B is raised */ 98 TAIL_B_RAISED = 0x02 99 } TailStatus; 100 101 /** 102 * @internal 103 * @brief Tiles multiplier flags 104 * @ingroup BLAS_MAJOR_SUBGENS 105 */ 106 typedef enum TileMulFlags { 107 TILEMUL_NO_FLAGS = 0, /**< No flags */ 108 TILEMUL_TRA = 0x01, /**< Transposed matrix A */ 109 TILEMUL_TRB = 0x02, /**< Transposed matrix B */ 110 TILEMUL_CONJA = 0x04, /**< Conjugated elements of A */ 111 TILEMUL_CONJB = 0x08, /**< Conjugated elements of B */ 112 TILEMUL_C_COLUMN_MAJOR = 0x10, /**< Column major block for matrix C */ 113 TILEMUL_NOT_FETCH_B = 0x20, /**< Do not fetch matrix B block */ 114 TILEMUL_EXTERN_RDECL = 0x40, /**< External register tiles declaration, 115 the generator must not declare them 116 itself */ 117 118 /** 119 * Deprecated. Use the repsective mode being a part of FetchAddr mode. 120 * He is left just for backward compatibility to don't break the working 121 * code and will be removed soon 122 */ 123 TILEMUL_WRAP_AROUND_TAIL = 0x80, /**< Sizes used for column skew are 124 rounded to next vecLen bound */ 125 /** Use global cyclic along subproblem A coordinate. 126 * Deprecated. Don't use it */ 127 TILEMUL_GLOBAL_CYCLIC_A = 0x100, 128 /** Use global cyclic along subproblem B coordinate. 129 * Deprecated don't use it */ 130 TILEMUL_GLOBAL_CYCLIC_B = 0x200, 131 /* Deprecated. Don't use it */ 132 TILEMUL_GLOBAL_CYCLIC_K = 0x400, /**< Use global cyclic along K */ 133 /** Use skew along subproblem A coordinate */ 134 TILEMUL_SKEW_A = 0x800, 135 /** Use skew along subproblem B coordinate. Deprecated */ 136 TILEMUL_SKEW_B = 0x1000, 137 /* Deprecated */ 138 TILEMUL_SKEW_K = 0x2000, /**< Use skew along K */ 139 /** Use size of whole matrix for cyclic addressing. Deprecated */ 140 TILEMUL_GLOBAL_CYCLIC = TILEMUL_GLOBAL_CYCLIC_A | 141 TILEMUL_GLOBAL_CYCLIC_B | 142 TILEMUL_GLOBAL_CYCLIC_K, 143 // Deprecated 144 TILEMUL_SKEWS = TILEMUL_SKEW_A | TILEMUL_SKEW_B | TILEMUL_SKEW_K, 145 /** Optimize coordinates calculations by storing coordinates values */ 146 // Deprecated 147 TILEMUL_OPTIMIZE_COORD_CALC = 0x4000, 148 /** Use bwidth0 stride */ 149 TILEMUL_BW_STRIDE = 0x8000, 150 /** Optimize coordinates calculations by using vectors 151 * and pointer increments */ 152 // Deprecated 153 TILEMUL_OPTIMIZE_VEC_COORDS = 0x10000, 154 /** Do not increment K*/ 155 TILEMUL_NOT_INC_K = 0x20000, 156 /** 157 * Use variants with explicit vectorization. Useful on platforms with 158 * true SIMD. 159 */ 160 TILEMUL_FORCE_VECTORIZATION = 0x40000 161 } TileMulFlags; 162 163 164 /** 165 * @internal 166 * @brief Tiles multiplier core 167 * @ingroup BLAS_MAJOR_SUBGENS 168 */ 169 typedef enum TileMulCore { 170 /** Use multiplication and addition operations */ 171 TILEMUL_MULADD, 172 /** Use the 'dot' function where possible */ 173 TILEMUL_DOT, 174 /** Use the 'mad' function */ 175 TILEMUL_MAD 176 } TileMulCore; 177 178 /** 179 * @internal 180 * @brief Update result operations 181 * @ingroup BLAS_MAJOR_SUBGENS 182 */ 183 typedef enum UpdateResultOp { 184 /** Just set the values stored in a target buffer */ 185 UPRES_SET, 186 /** Summarize values stored in a target buffer with the temporary result */ 187 UPRES_SUM 188 } UpdateResultOp; 189 190 /** 191 * @internal 192 * @brief Update result generator flags 193 * @ingroup BLAS_MAJOR_SUBGENS 194 */ 195 typedef enum UpdateResultFlags { 196 /** Resulting matrix is stored in the column major form */ 197 UPRES_COLUMN_MAJOR = 0x01, 198 /** Generic version, non optimal sizes */ 199 UPRES_GENERIC = 0x02, 200 /** Multiply result on beta */ 201 UPRES_WITH_BETA = 0x04, 202 /** do not multiply on the alpha scalar */ 203 UPRES_WITHOUT_ALPHA = 0x08, 204 /** 205 * Destination is private memory; 206 * if not set destination is in the global one 207 */ 208 UPRES_PRIV_DEST = 0x10, 209 /** Use the local memory instead the global memory */ 210 UPRES_USE_LDS = 0x20, 211 /** Generate the inline version */ 212 UPRES_INLINE = 0x40, 213 /** Disable vectorization at memory access */ 214 UPRES_NO_VECTORIZATION = 0x80, 215 /** For the generic version useful data reside at the tile rows' tail */ 216 UPRES_TAIL_ROW = 0x100, 217 /** For the generic version useful data reside at the tile columns' tail */ 218 UPRES_TAIL_COL = 0x200, 219 /** Generate condition whether coordinates don't exceed problem bounds */ 220 UPRES_EXCEED_PROBLEM_CONDITION = 0x400, 221 /****/ 222 UPRES_INDEXING_WITH_CONSTANTS = 0x800, 223 /** Write result to C instead of B for functions with triangular matrix */ 224 UPRES_TRIANG_WRITE_C = 0x1000 225 } UpdateResultFlags; 226 227 typedef struct PrivateArea { 228 const char *typeName; 229 unsigned int vecLen; 230 unsigned int size; 231 } PrivateArea; 232 233 /** 234 * @internal 235 * @defgroup GEN_SETTINGS Generator settings 236 * @ingroup BLAS_GENERATORS 237 */ 238 /*@{*/ 239 240 /** 241 * @internal 242 * @brief Kernel variable and argument names 243 */ 244 typedef struct KernelVarNames { 245 const char *A; /**< Matrix A variable name */ 246 const char *B; /**< Matrix B variable name */ 247 const char *C; 248 const char *LDS; /**< LDS pointer name */ 249 const char *coordA; /**< Variable for subproblem A coordinate */ 250 const char *coordB; /**< Variable for subproblem B coordinate */ 251 const char *k; /**< Variable for incrementable K offset value*/ 252 const char *skewA; /**< Variable for skews along A */ 253 const char *skewB; /**< Variable for skews along B */ 254 const char *skewK; /**< Variable for skews along K */ 255 const char *sizeM; /**< Matrix A size M */ 256 const char *sizeN; /**< Matrix B size N */ 257 const char *sizeK; /**< Matrixes size K */ 258 const char *lda; /**< Leading dimension of matrix A */ 259 const char *ldb; /**< Leading dimension of matrix B */ 260 const char *ldc; /**< Leading dimension of matrix C, in vectors */ 261 const char *vectCoordA; /**< Vector containing indexes of tile a elements 262 in matrix A */ 263 const char *vectCoordB; /**< Vector containing indexes of tile b elements 264 in matrix B*/ 265 const char *startM; 266 const char *startN; 267 const char *startK; 268 const char *alpha; 269 const char *beta; 270 } KernelVarNames; 271 272 /** 273 * @internal 274 * @brief Blas generator settings 275 * 276 * This structure is designed to be used with most of subgenerators 277 * and generator helpers. It is assumed to be initialized once at the 278 * generator beginning and modified as few as possible over the rest of 279 * the process. 280 */ 281 typedef struct BlasGenSettings { 282 /** 283 * Subproblem dimensions: 284 * 285 * work group dimensions are at index 0 286 * work item dimensions are at index 1 287 */ 288 SubproblemDim subdims[2]; 289 const PGranularity *pgran; /**< Data parallelism granularity */ 290 const CLBLASKernExtra *kextra; /**< Kernel extra */ 291 BlasGenFlags flags; /**< Global generator flags */ 292 KernelVarNames varNames; /**< Kernel variables and argument names */ 293 Tile tileA; 294 Tile tileBX; 295 Tile tileCY; 296 } BlasGenSettings; 297 298 /*@}*/ 299 300 /** 301 * @internal 302 * @brief Variable names for the inline version of a function updating result 303 * @ingroup BLAS_MAJOR_SUBGENS 304 */ 305 typedef struct UpresVarNames { 306 const char *result; /**< Name of an output matrix */ 307 /** Leading dimension of a matrix stored in the global memory */ 308 const char *ld; 309 const char *startRow; /**< Start row to update from */ 310 const char *startCol; /**< Start column to update from */ 311 const char *nrRows; /**< Number of rows */ 312 const char *nrCols; /**< Number of columns */ 313 const char *cachedName; /**< Name of lds chached values */ 314 } UpresVarNames; 315 316 /** 317 * @internal 318 * @brief Options for matrix tiles multiplication generator 319 * @ingroup BLAS_MAJOR_SUBGENS 320 */ 321 typedef struct TileMulOpts { 322 CLMemType memA; /**< type of memory matrix A is located on */ 323 CLMemType memB; /**< type of memory matrix B is located on */ 324 TileMulFlags flags; /**< Flags on objects and computing specifics */ 325 TileMulCore core; /**< Multiply and add core */ 326 int (*postFetch)( 327 struct KgenContext *ctx, 328 MatrixRole mrole, 329 void *arg); /**< Tile post fetch callback */ 330 void *postFetchPriv; /**< Postfetch callback's private date */ 331 struct FetchContext *fctx; 332 } TileMulOpts; 333 334 typedef struct ZeroFuncs { 335 char names[MATRIX_ROLES_NUMBER][FUNC_NAME_MAXLEN]; 336 } ZeroFuncs; 337 338 /** 339 * @internal 340 * @brief Private data for fetch postprocessing callback 341 * @ingroup TAILS_HANDLING 342 */ 343 typedef struct TilePostFetchPrivate { 344 BlasFunctionID funcID; 345 const BlasGenSettings *gset; 346 const char *regName; 347 int fetchNumA; 348 int wholeA; 349 } TilePostFetchPrivate; 350 351 void 352 getPrivateAreaInfo( 353 const BlasGenSettings *gset, 354 BlasFunctionID funcID, 355 MatrixRole mrole, 356 PrivateArea *area); 357 358 void 359 declarePrivateArea( 360 struct KgenContext *ctx, 361 const PrivateArea *area, 362 const char *baseName, 363 PrivateStorageType storType); 364 365 /* 366 * Declare separately the real and imaginary part of 367 * a complex multiplier. 368 * 369 * @ctx: generator context 370 * @baseName: variable's base name matching to an existing variable 371 * with not sepated parts 372 * @typeName: variable type name 373 * 374 * Rule naming 375 * real part: <baseName>R 376 * imaginary part: <baseName>I 377 * 378 * On success returns 0, and -EOVERFLOW at source buffer 379 * overflowing 380 */ 381 int 382 declareComplexMultParts( 383 struct KgenContext *ctx, 384 const char *baseName, 385 const char *typeName); 386 387 /** 388 * @internal 389 * @defgroup CHECK_DECOMP_CACL_GRAN Checking decomposition and calculate 390 * parallelism granularity 391 * @ingroup BLAS_GENERATORS 392 */ 393 394 /*@{*/ 395 396 /** 397 * @brief Sanity check for decomposition 398 * 399 * @param[in] subdims Subproblem dimensions. 2 levels. 400 * @param[in] minSize Minimum size for any of the dimension 401 * components 402 * @param[in] maxSize Maxium size which can't be exceeded by 403 * any of the dimension components at the tile 404 * layer 405 * @param[in] maxRegs Maximum registers it's allowed to use 406 * @param[in] dtype BLAS data type 407 * @param[in] wholeA Is matrix A stored in registers entirely or 408 * partially 409 * 410 * The function rejects only decompositions that are completely invalid or lead 411 * to consumption of too many registers or just have component values at the 412 * tile layer that are out of the range [\b MinSize, \b MaxSize]. 413 * Completely invalid decompositions are those which don't allow to divide 414 * problem integrally among work items, e. g. zeroed components are wrong, 415 * the step components (x, y, bwidth) of the 0-th level not integrally 416 * divisible on respective size components (itemX, itemY, bwidth) of the 1-st 417 * level are wrong as well. The decomposition is also wrong if the size 418 * components are not integrally divisible on the step components and not equal 419 * to #SUBDIM_UNUSED. 420 * 421 * @return true if the decomposition is valid, or false otherwise 422 */ 423 bool 424 decompSanityCheck( 425 const SubproblemDim *subdims, 426 unsigned int minSize, 427 unsigned int maxSize, 428 unsigned int maxRegs, 429 DataType dtype, 430 bool wholeA); 431 432 /** 433 * @brief Calculate granularity in case when a work item is responsible 434 * for its own part of solution not overlapping with those of other 435 * items 436 * 437 * @param[out] pgran Location to store calculated granularity 438 * @pararm[in] subdims Subproblem dimensions 439 * @param[in] xdim Dimension in the OpenCL work space X component 440 * of decomposition is mapped on 441 * @param[in] level Function BLAS level. Reserved for future use. 442 * 443 * If value of \b xdim is -1, then the function assumes that OpenCL work 444 * space is single dimensional, and puts the product of granularity against 445 * X and Y component to 0-th element of \b wgSize field. If its value is 446 * 0 or 1, the function assumes that OpenCL work space is 2D and puts 447 * granularity against X component to \b xdim element of \b wgSize field 448 * of the granularity decriptor. Granularity against Y component is put to 449 * 1 - \b xdim element. Other values are invalid and forces abort in debug 450 * build. The function initializes the \b wgDim field properly. 451 * 452 * NOTE: Now, only this function is supported only for level 3 and 453 * must not be called for level 2 454 */ 455 void 456 calcPgranDedicated( 457 PGranularity *pgran, 458 const SubproblemDim *subdims, 459 int xdim, 460 int level); 461 462 /** 463 * @brief Calculate granularity in case when several items evaluate the same 464 * part of solution together 465 * 466 * @param[out] pgran Location to store calculated granularity 467 * @pararm[in] subdims Subproblem dimensions 468 * @param[in] xdim Dimension in the OpenCL work space X component 469 * of decomposition is mapped on 470 * @param[in] ydim Dimension in the OpenCL work space Y component 471 * of decomposition is mapped on 472 * @param[in] level Function BLAS level. Reserved for future use 473 * 474 * If \b xdim and \b ydim values are equal, then the function puts the product 475 * of granularity against X and Y component to \b xdim element of \b wgSize 476 * field. If not, it puts separated granularity for X and Y in \b xdim and 477 * \b ydim element respectively. Both the values must be non negative and less 478 * than 3 (since OpenCL workspace cannot have more than 3 dimensions). 479 * If some of these parameters is zero, then the other one must be zero as well. 480 * If some of these parameters is 2, then the other one must be 1. These 481 * restrictions are caused by needs in reflecting \b bwidth in granularity 482 * in case of multidimensional decomposition. For 2D and 3D decompositions 483 * granularity for bwidth is calculated as well, and it is always mapped 484 * onto 0-th workspace dimension. If some of these parameters are wrong, 485 * it forces abort in debug build. The function sets the \b wgDim field 486 * to maximum of xdim and ydim plus 1. 487 * 488 * NOTE: Now, only this function is supported only for level 3 and 489 * must not be called for level 2 490 */ 491 void 492 calcPgranCooperative( 493 PGranularity *pgran, 494 const SubproblemDim *subdims, 495 int xdim, 496 int ydim, 497 int level); 498 499 /*@}*/ 500 501 /** 502 * @internal 503 * @defgroup COMMON_MATH_OPERATIONS Constructing useful math expression 504 * @ingroup BLAS_GENERATORS 505 */ 506 /*@{*/ 507 508 /** 509 * @brief Sprintf a complex MAD operation 510 * 511 * Operations: 512 * - \f$ dst \leftarrow a * b + c \f$ 513 * - \f$ dst \leftarrow conj(a) * b + c \f$ 514 * - \f$ dst \leftarrow a * conj(b) + c \f$ 515 * - \f$ dst \leftarrow conj(a) * conj(b) + c \f$ 516 * 517 * @param[out] expr String object to hold the target expression 518 * @param[in] dst Destination argument 519 * @param[in] a The first multiplier 520 * @param[in] b The second multiplier 521 * @param[in] c Added argument 522 * @param[in] isDouble If set, the arguments have double precision 523 * @param[in] isConjA If set, the argument A should be conjugated 524 * @param[in] isConjB If set, the argument B should be conjugated 525 * @param[in] TileMulCore Multiplying core 526 * 527 * The \b c argument can be NULL. In this case it is ignored, and the function 528 * produces pure multiplication 529 */ 530 void 531 sprintfComplexMulUpdate( 532 Kstring *expr, 533 const Kstring *dst, 534 const Kstring *a, 535 const Kstring *b, 536 const Kstring *c, 537 bool isDouble, 538 bool conjA, 539 bool conjB, 540 TileMulCore core); 541 542 void 543 sprintfComplexMulUpdate_syr2k_beta0( 544 Kstring *expr, 545 const Kstring *dst, 546 const Kstring *a, 547 const Kstring *b, 548 const Kstring *c, 549 bool isDouble, 550 bool conjA, 551 bool conjB, 552 TileMulCore core); 553 554 /** 555 * @brief Sprintf expression of fast scalar mad 556 * 557 * @param[out] expr Output expression 558 * @param[in] first First multiplier 559 * @param[in] second Second multiplier 560 * @param[in] scale Scale of the second argument, i. e. it's divider. 561 * Ignored if zero. 562 * @param[in] third Added argument. Ignored if NULL. 563 * 564 * It can use mad24. So, expected result should not exceed 2^24 565 */ 566 void 567 sprintfFastScalarMad( 568 Kstring *expr, 569 const Kstring *first, 570 const Kstring *second, 571 unsigned int scale, 572 const Kstring *third); 573 574 /*@}*/ 575 576 /** 577 * @internal 578 * @defgroup BLAS_GEN_MISC_FUNCTIONS Miscellaneous functions 579 * @ingroup BLAS_GENERATORS 580 */ 581 582 /*@{*/ 583 584 /** 585 * @brief Default function prefix for the data type 586 * 587 * @param[in] dtype One of the data types supported by the library 588 */ 589 char 590 dtypeToBlasPrefix(DataType dtype); 591 592 /** 593 * @brief Convert kernel extra flags to tilemul flags 594 * 595 * @param[in] funcID BLAS function ID 596 * @param[in] kflags Kernel flags 597 */ 598 TileMulFlags 599 kextraToTilemulFlags(BlasFunctionID funcID, KernelExtraFlags kflags); 600 601 /** 602 * @brief Get vector length elements should be fetched from (stored to) 603 * the global memory 604 * 605 * @param[in] gset Generator settings 606 * @param[in] funcID BLAS function ID (deprecated) 607 * @param[in] mrole Role of the matrix to get vectorization for 608 */ 609 unsigned int 610 getVecLen(const BlasGenSettings *gset, BlasFunctionID funcID, MatrixRole mrole); 611 612 /** 613 * @brief Sprintf chunk (set of components) of an OpenCL vector type 614 * 615 * @param[out] chunk Buffer to sprintf to 616 * @param[in] vecLen Entire vector length 617 * @param[in] clen Length of the chunk 618 * @param[in] vecOff Starting component offset 619 */ 620 void 621 sprintfVecChunk( 622 char *chunk, 623 unsigned int vecLen, 624 unsigned int clen, 625 unsigned int vecOff); 626 627 /** 628 * @brief Generate code containing scaling of leading dimensions on 629 * vector size 630 * 631 * @param[out] ctx Generator context 632 * @param[in] gset Generator settings 633 * 634 * The function first checks whether the scaling is actually needed. 635 * If vector size is 1. If some of the kernel variables for 'lda', 'ldb' 636 * or 'ldc' is NULL, the function skips code generation for the dimension. 637 * Calling this function has no effect if the @ref BGF_LD_IN_VECTORS generator 638 * flag is not set. If some of the leading dimensions are not unique, only 639 * one of the instances is scaled. Originality of the dimensions is detected 640 * by values of the respective pointers being a part of @ref KernelVarNames. 641 * For example, 'lda' and 'ldb' pointers are the same, only 'lda' is scaled. 642 */ 643 void 644 genScaleLeadingDimensions(struct KgenContext *ctx, const BlasGenSettings *gset); 645 646 /*@}*/ 647 648 /** 649 * @internal 650 * @brief Generate default post processing logic after tile fetch 651 * 652 * @param[out] ctx Generator context 653 * @param[in] mrole Matrix role 654 * @priv[out] Handler's private data 655 * 656 * @ingroup TAILS_HANDLING 657 */ 658 int 659 defaultTilePostFetch( 660 struct KgenContext *ctx, 661 MatrixRole mrole, 662 void *priv); 663 664 void 665 getResultGPRsInfo( 666 DataType dtype, 667 const SubproblemDim *dims, 668 unsigned int vecLen, 669 unsigned int *nrRegs, 670 const char **typeName); 671 672 /** 673 * @internal 674 * @defgroup BLAS_MAJOR_SUBGENS Major subgenerators 675 * @ingroup BLAS_GENERATORS 676 */ 677 /*@{*/ 678 679 /** 680 * @internal 681 * @brief Tiles fetching and multiplication inlined code generator 682 * 683 * @param[out] ctx Generator context 684 * @param[in] gset Generator settings 685 * @param[in] mulOpts TileMul-specific generator settings 686 * 687 * This function generates code which fetches tiles a and b from global or local 688 * memory into private memory, multiply them storing result into tile c in 689 * private memory and increment coordinate k. Caller is responsible for loop 690 * along K.\n 691 * All combinations of tiles a and b orientations are supported. Generated 692 * code fetches tiles by vectors which size can be different for tiles a and b. 693 * Complex types and conjugated tiles are supported. Global cycling is supported 694 * for global memory fetching - this mean that if tile overlaps matrix 695 * the tail of tile will be fetched from the beginning instead of accessing 696 * memory outside the matrix.\n 697 * Second level of subdimensions is used for tiles sizes.\n 698 * Generated code will fetch tiles a, b, multiply them and add result to tile c 699 * in private memory, then increment k. By default, k is incremented by 700 * second level bwidth but it is incremented by first level bwidth if 701 * @ref TILEMUL_BW_STRIDE flag is set. It is used if whole work group goes 702 * along K loop.\n 703 * Each tile can be fetched from global memory or from local memory. 704 * If tile is fetched from local memory then leading dimensions for local 705 * memory area are taken from first level subdimensions.\n 706 * Post-fetch callback generator function can be called after fetching tiles 707 * for zeroing tails or setting diagonal elements to one. This function is 708 * provided by caller.\n 709 * If second level bwidth is not equal to first level bwidth, and 710 * @ref TILEMUL_BW_STRIDE flag is not set then TileMul generates 711 * loop from zero to first level bwidth with second level bwidth step. The 712 * most common case is second level bwidth equal to first level bwidth where 713 * single iteration of multiplication is generated.\n 714 * 715 * If the caller assume for efficient fetching from the global memory and the 716 * tilemul logic is generated within a loop, prepareFetchCycle() should be 717 * called before generation of the loop. 718 * 719 * @return 0 on success 720 * @return -EOVERFLOW on source buffer overflowing 721 * @return -EINVAL if input arguments are invalid 722 */ 723 int 724 tileMulGen( 725 struct KgenContext *ctx, 726 const BlasGenSettings *gset, 727 const TileMulOpts *mulOpts); 728 729 /** 730 * @internal 731 * @brief Tiles pure multiplication code generator 732 * 733 * @param[out] ctx Generator context 734 * @param[in] gset Generator settings 735 * @param[in] mulOpts TileMul-specific generator settings 736 * 737 * This function multiply two tiles, a and b, storing result in tile c. No 738 * additional operations are made. It just performs tiles multiplication without 739 * fetching, post-fetch processing and incrementing coordinates which can be 740 * made by caller. 741 * 742 * @return 0 on success 743 * @return -EOVERFLOW on source buffer overflowing 744 */ 745 int 746 genMulTiles( 747 struct KgenContext *ctx, 748 const BlasGenSettings *gset, 749 const TileMulOpts *mulOpts); 750 751 /** 752 * @internal 753 * @brief Update result generator 754 * 755 * @param[out] ctx Generator context 756 * @param[in] gset Generator settings 757 * @param[in] op Update operation 758 * @param[in] flags Update result flags 759 * @argNames 760 * 761 * It generates a function applying an operation to the temporary result 762 * stored in the private memory and updating the target result. 763 *\n 764 * The code can be generated as well in the form of callable function 765 * as in the inlined form. 766 *\n 767 * List of taken argument differs depending on specified flags. In general, 768 * these functions are defined as: \n 769 * @code 770 * void 771 * funcName( 772 * <input type> C, 773 * <output type> *c, 774 * <input type> alpha, 775 * size_t startRow, 776 * size_t startCol, 777 * size_t ld 778 * [,<input type> beta] 779 * [,size_t nrRows] 780 * [,size_t nrCols]) 781 * @endcode 782 * 783 * @return 0 on success, -EOVERFLOW at source buffer overflowing. 784 */ 785 int 786 updateResultGen( 787 struct KgenContext *ctx, 788 const BlasGenSettings *gset, 789 BlasFunctionID funcId, 790 UpdateResultOp op, 791 UpdateResultFlags flags, 792 const UpresVarNames *uvarNames); 793 794 /** 795 * @internal 796 * @brief Produce a code updating a single result element 797 * 798 * @param[out] ctx Generator context 799 * @param[in] dst Destination element expression 800 * @param[in] src Source element expression 801 * @param[in] gset Generator settings 802 * @param[in] op Update operation 803 * @param[in] flags Flags showing specifics of the code needed to be 804 * generated 805 * 806 * @return 0 on success, -EOVERFLOW if the source buffer is exceeded. 807 */ 808 int 809 genUpdateResultSingle( 810 struct KgenContext *ctx, 811 const char *dst, 812 const char *src, 813 const BlasGenSettings *gset, 814 UpdateResultOp op, 815 UpdateResultFlags flags); 816 817 /*@}*/ 818 819 TailFetch 820 checkForTailFetches( 821 BlasFunctionID funcID, 822 const SubproblemDim *dim, 823 const CLBLASKernExtra *kextra, 824 MatrixRole mrole, 825 bool distVect, 826 bool lowerTails); 827 828 bool 829 isNeedZeroTileTail( 830 BlasFunctionID funcID, 831 const SubproblemDim *dim, 832 const CLBLASKernExtra *kextra, 833 MatrixRole mrole, 834 bool distVect); 835 836 /** 837 * @internal 838 * @brief Generate tail coordinates adjustment if needed 839 * 840 * @param[out] ctx Generator context 841 * @param[in] funcID BLAS function ID 842 * @param[in] gset Generator settings 843 * @param[out] *error Location to store error. 844 * Ignored if NULL. 845 * 846 * Adjust coordinates if work is distributed over matrix rows so as 847 * a tile would not exceed the matrix bound. Cyclic addressing is not 848 * applicable for that since skew over rows can be used for performance goals. 849 * 850 * If it's needed, issues an expression like 851 * 852 * if (coord.y + dy > M) { 853 * coord.y -= dy - M % dy; 854 * } 855 * 856 * Return status showing if the tails have been actually adjusted or not. 857 * If \b ctx is NULL the function doesn't try to generate a code, but just 858 * return actual tail handling status 859 * 860 * @ingroup TAILS_HANDLING 861 */ 862 TailStatus 863 checkGenAdjustTailCoords( 864 struct KgenContext *ctx, 865 BlasFunctionID funcID, 866 const BlasGenSettings *gset, 867 int *error); 868 869 /** 870 * @internal 871 * @brief Generate restoring original coordinates if needed 872 * 873 * @param[out] ctx Generator context 874 * @param[in] gset Generator settings 875 * @param[in] status Tails handling status 876 * 877 * Coordinates restoring is needed to have ability to write back result to 878 * a correct location. 879 * 880 * If it's needed, issues an expression like 881 * 882 * if (coord.y + dy == M) { 883 * coord.y += dy - M % dy; 884 * } 885 * 886 * @ingroup TAILS_HANDLING 887 */ 888 int 889 checkGenRestoreTailCoords( 890 struct KgenContext *ctx, 891 const BlasGenSettings *gset, 892 TailStatus status); 893 894 /** 895 * @internal 896 * @brief Convert tail handling status to the respective flags 897 * of the update result generator 898 * 899 * @param[in] status Status of the handling to convert to 900 * the update result flags 901 * 902 * @ingroup TAILS_HANDLING 903 */ 904 UpdateResultFlags 905 tailStatusToUpresFlags(TailStatus status); 906 907 #endif /* BLAS_KGEN_H_ */ 908