1 /************************************************************************************************** 2 * * 3 * This file is part of BLASFEO. * 4 * * 5 * BLASFEO -- BLAS For Embedded Optimization. * 6 * Copyright (C) 2019 by Gianluca Frison. * 7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. * 8 * All rights reserved. * 9 * * 10 * The 2-Clause BSD License * 11 * * 12 * Redistribution and use in source and binary forms, with or without * 13 * modification, are permitted provided that the following conditions are met: * 14 * * 15 * 1. Redistributions of source code must retain the above copyright notice, this * 16 * list of conditions and the following disclaimer. * 17 * 2. Redistributions in binary form must reproduce the above copyright notice, * 18 * this list of conditions and the following disclaimer in the documentation * 19 * and/or other materials provided with the distribution. * 20 * * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * 24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * 28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * 30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * 31 * * 32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de * 33 * * 34 **************************************************************************************************/ 35 36 #ifndef BLASFEO_S_KERNEL_H_ 37 #define BLASFEO_S_KERNEL_H_ 38 39 40 41 #ifdef __cplusplus 42 extern "C" { 43 #endif 44 45 46 47 // utils 48 void blasfeo_align_64_byte(void *ptr, void **ptr_align); 49 50 51 52 // 53 // lib8 54 // 55 56 // 24x4 57 void kernel_sgemm_nt_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); 58 void kernel_sgemm_nt_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn); 59 void kernel_sgemm_nt_24x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 60 void kernel_sgemm_nn_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd); 61 void kernel_sgemm_nn_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn); 62 void kernel_sgemm_nn_24x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 63 void kernel_ssyrk_nt_l_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); 64 void kernel_ssyrk_nt_l_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn); 65 void kernel_ssyrk_nt_l_20x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); 66 void kernel_ssyrk_nt_l_20x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn); 67 void kernel_spotrf_nt_l_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D); 68 void kernel_spotrf_nt_l_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn); 69 void kernel_spotrf_nt_l_20x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D); 70 void kernel_spotrf_nt_l_20x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn); 71 void kernel_strsm_nt_rl_inv_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E); 72 void kernel_strsm_nt_rl_inv_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn); 73 void kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E); 74 void kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn); 75 void kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn); 76 void kernel_ssyrk_spotrf_nt_l_20x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D); 77 void kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn); 78 void kernel_ssyrk_spotrf_nt_l_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D); 79 void kernel_strmm_nn_rl_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd); 80 void kernel_strmm_nn_rl_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn); 81 82 // 16x8 83 void kernel_sgemm_nt_16x8_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, float *spil); 84 85 // 16x4 86 void kernel_sgemm_nt_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); 87 void kernel_sgemm_nt_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn); 88 void kernel_sgemm_nt_16x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 89 void kernel_sgemm_nn_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd); 90 void kernel_sgemm_nn_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn); 91 void kernel_sgemm_nn_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 92 void kernel_ssyrk_nt_l_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); 93 void kernel_ssyrk_nt_l_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn); 94 void kernel_ssyrk_nt_l_12x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); 95 void kernel_ssyrk_nt_l_12x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn); 96 void kernel_spotrf_nt_l_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D); 97 void kernel_spotrf_nt_l_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn); 98 void kernel_spotrf_nt_l_12x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D); 99 void kernel_spotrf_nt_l_12x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn); 100 void kernel_strsm_nt_rl_inv_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E); 101 void kernel_strsm_nt_rl_inv_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn); 102 void kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E); 103 void kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn); 104 void kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn); 105 void kernel_ssyrk_spotrf_nt_l_12x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D); 106 void kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn); 107 void kernel_ssyrk_spotrf_nt_l_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D); 108 void kernel_strmm_nn_rl_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd); 109 void kernel_strmm_nn_rl_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn); 110 void kernel_strmm_nn_rl_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 111 112 // 8x8 113 void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); 114 void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); 115 void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 116 void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D); 117 void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn); 118 void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 119 void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); 120 void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); 121 void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D); 122 void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn); 123 void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E); 124 void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 125 void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E); 126 void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 127 void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn); 128 void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D); 129 130 // 8x4 131 void kernel_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); 132 void kernel_sgemm_nt_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); 133 void kernel_sgemm_nt_8x4_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 134 void kernel_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D); 135 void kernel_sgemm_nn_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn); 136 void kernel_sgemm_nn_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 137 //void kernel_ssyrk_nt_l_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); 138 void kernel_ssyrk_nt_l_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); 139 void kernel_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D); 140 void kernel_spotrf_nt_l_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn); 141 void kernel_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E); 142 void kernel_strsm_nt_rl_inv_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 143 void kernel_sgemm_strsm_nt_rl_inv_8x4_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E); 144 void kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 145 void kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn); 146 void kernel_ssyrk_spotrf_nt_l_8x4_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D); 147 void kernel_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D); 148 void kernel_strmm_nn_rl_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int km, int kn); 149 void kernel_strmm_nn_rl_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 150 void kernel_strmm_nt_ru_8x4_lib8(int k, float *alpha, float *A, float *B, float *D); 151 void kernel_strmm_nt_ru_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *D, int km, int kn); 152 153 // 4x8 154 void kernel_sgemm_nt_4x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); 155 void kernel_sgemm_nt_4x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); 156 void kernel_sgemm_nt_4x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 157 void kernel_strsm_nt_rl_inv_4x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E); 158 void kernel_strsm_nt_rl_inv_4x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 159 160 // 8 161 void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z); 162 void kernel_sgemv_n_8_vs_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1); 163 void kernel_sgemv_n_8_gen_lib8(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1); 164 void kernel_sgemv_t_8_lib8(int k, float *alpha, int offsetA, float *A, int sda, float *x, float *beta, float *y, float *z); 165 void kernel_sgemv_t_8_vs_lib8(int k, float *alpha, int offsetA, float *A, int sda, float *x, float *beta, float *y, float *z, int k1); 166 void kernel_sgemv_t_4_lib8(int k, float *alpha, int offsetA, float *A, int sda, float *x, float *beta, float *y, float *z); 167 void kernel_sgemv_t_4_vs_lib8(int k, float *alpha, int offsetA, float *A, int sda, float *x, float *beta, float *y, float *z, int k1); 168 void kernel_strsv_ln_inv_8_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z); 169 void kernel_strsv_ln_inv_8_vs_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn); 170 void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z); 171 void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn); 172 void kernel_sgemv_nt_4_lib8(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t); 173 void kernel_sgemv_nt_4_vs_lib8(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km); 174 void kernel_ssymv_l_4l_lib8(int kmax, float *alpha, float *A, int sda, float *x, float *z); 175 void kernel_ssymv_l_4r_lib8(int kmax, float *alpha, float *A, int sda, float *x, float *z); 176 void kernel_ssymv_l_4l_gen_lib8(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *z, int km); 177 void kernel_ssymv_l_4r_gen_lib8(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *z, int km); 178 179 // -------- aux 180 181 // ---- copy 182 183 // lib4 184 // 185 void kernel_sgecpsc_4_0_lib4(int kmax, float *alpha, float *A, float *B); 186 void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B); 187 188 void kernel_sgecpsc_4_1_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 189 void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B); 190 void kernel_sgecpsc_4_2_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 191 void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B); 192 void kernel_sgecpsc_4_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 193 void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B); 194 195 void kernel_sgecpsc_3_0_lib4(int kmax, float *alpha, float *A, float *B); 196 void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B); 197 void kernel_sgecpsc_3_2_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 198 void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B); 199 void kernel_sgecpsc_3_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 200 void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B); 201 202 void kernel_sgecpsc_2_0_lib4(int kmax, float *alpha, float *A, float *B); 203 void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B); 204 void kernel_sgecpsc_2_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 205 void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B); 206 207 void kernel_sgecpsc_1_0_lib4(int kmax, float *alpha, float *A, float *B); 208 void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B); 209 210 // lib8 211 // 212 void kernel_sgecp_8_0_lib8(int m, float *A, float *B); 213 void kernel_sgecp_8_0_gen_lib8(int m, float *A, float *B, int m1); 214 void kernel_sgecp_8_0_gen_u_lib8(int m, float *A, float *B, int m1); 215 216 void kernel_sgesc_8_0_lib8(int m, float *alpha, float *A); 217 void kernel_sgesc_8_0_gen_lib8(int m, float *alpha, float *A, int m1); 218 void kernel_sgesc_8_0_gen_u_lib8(int m, float *alpha, float *A, int m1); 219 220 void kernel_sgecpsc_8_0_lib8(int m, float *alpha, float *A, float *B); 221 void kernel_sgecpsc_8_0_gen_lib8(int m, float *alpha, float *A, float *B, int m1); 222 void kernel_sgecpsc_8_0_gen_u_lib8(int m, float *alpha, float *A, float *B, int m1); 223 224 void kernel_sgecp_8_1_lib8(int m, float *A, int sda, float *B); 225 void kernel_sgecp_8_1_gen_lib8(int m, float *A, int sda, float *B, int m1); 226 void kernel_sgecpsc_8_1_lib8(int m, float *alpha, float *A, int sda, float *B); 227 void kernel_sgecpsc_8_1_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 228 229 void kernel_sgecp_8_2_lib8(int m, float *A, int sda, float *B); 230 void kernel_sgecp_8_2_gen_lib8(int m, float *A, int sda, float *B, int m1); 231 void kernel_sgecpsc_8_2_lib8(int m, float *alpha, float *A, int sda, float *B); 232 void kernel_sgecpsc_8_2_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 233 234 void kernel_sgecp_8_3_lib8(int m, float *A, int sda, float *B); 235 void kernel_sgecp_8_3_gen_lib8(int m, float *A, int sda, float *B, int m1); 236 void kernel_sgecpsc_8_3_lib8(int m, float *alpha, float *A, int sda, float *B); 237 void kernel_sgecpsc_8_3_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 238 239 void kernel_sgecp_8_4_lib8(int m, float *A, int sda, float *B); 240 void kernel_sgecp_8_4_gen_lib8(int m, float *A, int sda, float *B, int m1); 241 void kernel_sgecpsc_8_4_lib8(int m, float *alpha, float *A, int sda, float *B); 242 void kernel_sgecpsc_8_4_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 243 244 void kernel_sgecp_8_5_lib8(int m, float *A, int sda, float *B); 245 void kernel_sgecp_8_5_gen_lib8(int m, float *A, int sda, float *B, int m1); 246 void kernel_sgecpsc_8_5_lib8(int m, float *alpha, float *A, int sda, float *B); 247 void kernel_sgecpsc_8_5_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 248 249 void kernel_sgecp_8_6_lib8(int m, float *A, int sda, float *B); 250 void kernel_sgecp_8_6_gen_lib8(int m, float *A, int sda, float *B, int m1); 251 void kernel_sgecpsc_8_6_lib8(int m, float *alpha, float *A, int sda, float *B); 252 void kernel_sgecpsc_8_6_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 253 254 void kernel_sgecp_8_7_lib8(int m, float *A, int sda, float *B); 255 void kernel_sgecp_8_7_gen_lib8(int m, float *A, int sda, float *B, int m1); 256 void kernel_sgecpsc_8_7_lib8(int m, float *alpha, float *A, int sda, float *B); 257 void kernel_sgecpsc_8_7_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 258 259 // transpose 260 // 261 void kernel_sgetr_8_0_lib8(int m, float *A, int sda, float *B); 262 void kernel_sgetr_8_0_gen_lib8(int m, float *A, int sda, float *B, int m1); 263 void kernel_sgetr_8_1_lib8(int m, float *A, int sda, float *B); 264 void kernel_sgetr_8_1_gen_lib8(int m, float *A, int sda, float *B, int m1); 265 void kernel_sgetr_8_2_lib8(int m, float *A, int sda, float *B); 266 void kernel_sgetr_8_2_gen_lib8(int m, float *A, int sda, float *B, int m1); 267 void kernel_sgetr_8_3_lib8(int m, float *A, int sda, float *B); 268 void kernel_sgetr_8_3_gen_lib8(int m, float *A, int sda, float *B, int m1); 269 void kernel_sgetr_8_4_lib8(int m, float *A, int sda, float *B); 270 void kernel_sgetr_8_4_gen_lib8(int m, float *A, int sda, float *B, int m1); 271 void kernel_sgetr_8_5_lib8(int m, float *A, int sda, float *B); 272 void kernel_sgetr_8_5_gen_lib8(int m, float *A, int sda, float *B, int m1); 273 void kernel_sgetr_8_6_lib8(int m, float *A, int sda, float *B); 274 void kernel_sgetr_8_6_gen_lib8(int m, float *A, int sda, float *B, int m1); 275 void kernel_sgetr_8_7_lib8(int m, float *A, int sda, float *B); 276 void kernel_sgetr_8_7_gen_lib8(int m, float *A, int sda, float *B, int m1); 277 278 // add 279 // 280 void kernel_sgead_8_0_lib8(int m, float *alpha, float *A, float *B); 281 void kernel_sgead_8_0_gen_lib8(int m, float *alpha, float *A, float *B, int m1); 282 void kernel_sgead_8_1_lib8(int m, float *alpha, float *A, int sda, float *B); 283 void kernel_sgead_8_1_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 284 void kernel_sgead_8_2_lib8(int m, float *alpha, float *A, int sda, float *B); 285 void kernel_sgead_8_2_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 286 void kernel_sgead_8_3_lib8(int m, float *alpha, float *A, int sda, float *B); 287 void kernel_sgead_8_3_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 288 void kernel_sgead_8_4_lib8(int m, float *alpha, float *A, int sda, float *B); 289 void kernel_sgead_8_4_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 290 void kernel_sgead_8_5_lib8(int m, float *alpha, float *A, int sda, float *B); 291 void kernel_sgead_8_5_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 292 void kernel_sgead_8_6_lib8(int m, float *alpha, float *A, int sda, float *B); 293 void kernel_sgead_8_6_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 294 void kernel_sgead_8_7_lib8(int m, float *alpha, float *A, int sda, float *B); 295 void kernel_sgead_8_7_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1); 296 297 298 // 299 // lib4 300 // 301 302 303 304 // level 2 BLAS 305 // 4 306 void kernel_sgemv_n_4_lib4(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z); 307 void kernel_sgemv_n_4_vs_lib4(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1); 308 void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1); 309 void kernel_sgemv_t_4_lib4(int k, float *alpha, int offsetA, float *A, int sda, float *x, float *beta, float *y, float *z); 310 void kernel_sgemv_t_4_vs_lib4(int k, float *alpha, int offsetA, float *A, int sda, float *x, float *beta, float *y, float *z, int k1); 311 void kernel_strsv_ln_inv_4_lib4(int k, float *A, float *inv_diag_A, float *x, float *y, float *z); 312 void kernel_strsv_ln_inv_4_vs_lib4(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn); 313 void kernel_strsv_lt_inv_4_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z); 314 void kernel_strsv_lt_inv_3_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z); 315 void kernel_strsv_lt_inv_2_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z); 316 void kernel_strsv_lt_inv_1_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z); 317 void kernel_strmv_un_4_lib4(int k, float *A, float *x, float *z); 318 void kernel_strmv_ut_4_lib4(int k, float *A, int sda, float *x, float *z); 319 void kernel_strmv_ut_4_vs_lib4(int k, float *A, int sda, float *x, float *z, int km); 320 void kernel_sgemv_nt_6_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t); 321 void kernel_sgemv_nt_4_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t); 322 void kernel_sgemv_nt_4_vs_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km); 323 void kernel_ssymv_l_4_lib4(int kmax, float *alpha, float *A, int sda, float *x_n, float *z_n); 324 void kernel_ssymv_l_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x_n, float *z_n, int km); 325 326 327 328 // level 3 BLAS 329 // 12x4 330 void kernel_sgemm_nt_16x4_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); // 331 void kernel_sgemm_nt_16x4_vs_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int m1, int n1); // 332 void kernel_strsm_nt_rl_inv_16x4_lib4(int k, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E); 333 void kernel_strsm_nt_rl_inv_16x4_vs_lib4(int k, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int m1, int n1); 334 // 12x4 335 void kernel_sgemm_nt_12x4_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); // 336 void kernel_sgemm_nt_12x4_vs_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int m1, int n1); // 337 void kernel_strsm_nt_rl_inv_12x4_lib4(int k, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E); 338 void kernel_strsm_nt_rl_inv_12x4_vs_lib4(int k, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int m1, int n1); 339 // 8x8 340 void kernel_sgemm_nt_8x8_lib4(int k, float *alpha, float *A, int sda, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd); // 341 void kernel_sgemm_nt_8x8_vs_lib4(int k, float *alpha, float *A, int sda, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int m1, int n1); // 342 void kernel_sgemm_nn_8x8_lib4(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd); // 343 void kernel_sgemm_nn_8x8_vs_lib4(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int m1, int n1); // 344 // 8x4 345 void kernel_sgemm_nt_8x4_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); // 346 void kernel_sgemm_nt_8x4_vs_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int m1, int n1); // 347 void kernel_sgemm_nn_8x4_lib4(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd); // 348 void kernel_sgemm_nn_8x4_vs_lib4(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int m1, int n1); // 349 void kernel_ssyrk_nt_l_8x4_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); // 350 void kernel_ssyrk_nt_l_8x4_vs_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn); // 351 void kernel_strsm_nt_rl_inv_8x4_lib4(int k, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E); 352 void kernel_strsm_nt_rl_inv_8x4_vs_lib4(int k, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int m1, int n1); 353 // 4x4 354 void kernel_sgemm_nt_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); // 355 void kernel_sgemm_nt_4x4_vs_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); // 356 void kernel_sgemm_nt_4x4_gen_lib4(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int k0, int k1); 357 void kernel_sgemm_nn_4x4_lib4(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D); // 358 void kernel_sgemm_nn_4x4_vs_lib4(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn); // 359 void kernel_sgemm_nn_4x4_gen_lib4(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); // 360 void kernel_ssyrk_nt_l_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); // 361 void kernel_ssyrk_nt_l_4x4_vs_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); // 362 void kernel_ssyrk_nt_l_4x4_gen_lib4(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 363 void kernel_strmm_nt_ru_4x4_lib4(int k, float *alpha, float *A, float *B, float *D); // 364 void kernel_strmm_nt_ru_4x4_vs_lib4(int k, float *alpha, float *A, float *B, float *D, int km, int kn); // 365 void kernel_strmm_nn_rl_4x4_lib4(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D); 366 void kernel_strmm_nn_rl_4x4_gen_lib4(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1); 367 void kernel_strsm_nt_rl_inv_4x4_lib4(int k, float *A, float *B, float *beta, float *C, float *D, float *E, float *inv_diag_E); 368 void kernel_strsm_nt_rl_inv_4x4_vs_lib4(int k, float *A, float *B, float *beta, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 369 void kernel_strsm_nt_rl_one_4x4_lib4(int k, float *A, float *B, float *beta, float *C, float *D, float *E); 370 void kernel_strsm_nt_rl_one_4x4_vs_lib4(int k, float *A, float *B, float *beta, float *C, float *D, float *E, int km, int kn); 371 void kernel_strsm_nt_ru_inv_4x4_lib4(int k, float *A, float *B, float *beta, float *C, float *D, float *E, float *inv_diag_E); 372 void kernel_strsm_nt_ru_inv_4x4_vs_lib4(int k, float *A, float *B, float *beta, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 373 void kernel_strsm_nt_ru_one_4x4_lib4(int k, float *A, float *B, float *beta, float *C, float *D, float *E); 374 void kernel_strsm_nt_ru_one_4x4_vs_lib4(int k, float *A, float *B, float *beta, float *C, float *D, float *E, int km, int kn); 375 void kernel_strsm_nn_ru_inv_4x4_lib4(int k, float *A, float *B, int sdb, float *beta, float *C, float *D, float *E, float *inv_diag_E); 376 void kernel_strsm_nn_ru_inv_4x4_vs_lib4(int k, float *A, float *B, int sdb, float *beta, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 377 void kernel_strsm_nn_ll_one_4x4_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *E); 378 void kernel_strsm_nn_ll_one_4x4_vs_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *E, int km, int kn); 379 void kernel_strsm_nn_lu_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E); 380 void kernel_strsm_nn_lu_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 381 // diag 382 void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd); 383 void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); 384 void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); 385 void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); 386 void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); 387 void kernel_sgemm_diag_left_4_a0_lib4(int kmax, float *alpha, float *A, float *B, float *D); 388 void kernel_sgemm_diag_left_4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D); 389 void kernel_sgemm_diag_left_3_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D); 390 void kernel_sgemm_diag_left_2_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D); 391 void kernel_sgemm_diag_left_1_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D); 392 393 394 395 // LAPACK 396 // 16x4 397 void kernel_spotrf_nt_l_16x4_lib4(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D); 398 void kernel_spotrf_nt_l_16x4_vs_lib4(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int m1, int n1); 399 // 12x4 400 void kernel_spotrf_nt_l_12x4_lib4(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D); 401 void kernel_spotrf_nt_l_12x4_vs_lib4(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int m1, int n1); 402 // 8x4 403 void kernel_spotrf_nt_l_8x4_lib4(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D); 404 void kernel_spotrf_nt_l_8x4_vs_lib4(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int m1, int n1); 405 // 4x4 406 void kernel_spotrf_nt_l_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *inv_diag_D); 407 void kernel_spotrf_nt_l_4x4_vs_lib4(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn); 408 void kernel_sgetrf_nn_4x4_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D); 409 void kernel_sgetrf_nn_4x4_vs_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D, int km, int kn); 410 void kernel_sgetrf_pivot_4_lib4(int m, float *pA, int sda, float *inv_diag_A, int* ipiv); 411 void kernel_sgetrf_pivot_4_vs_lib4(int m, int n, float *pA, int sda, float *inv_diag_A, int* ipiv); 412 413 414 415 // merged routines 416 // 4x4 417 void kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E); 418 void kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 419 void kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn); 420 void kernel_ssyrk_spotrf_nt_l_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D); 421 422 423 424 // auxiliary routines 425 void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B); 426 void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B); 427 void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B); 428 void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B); 429 void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B); 430 void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B); 431 void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B); 432 void kernel_strcp_l_2_0_lib4(int kmax, float *A, float *B); 433 void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B); 434 void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B); 435 void kernel_sgead_4_0_lib4(int kmax, float *alpha, float *A, float *B); 436 void kernel_sgead_4_1_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 437 void kernel_sgead_4_2_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 438 void kernel_sgead_4_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 439 void kernel_sgead_3_0_lib4(int kmax, float *alpha, float *A, float *B); 440 void kernel_sgead_3_2_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 441 void kernel_sgead_3_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 442 void kernel_sgead_2_0_lib4(int kmax, float *alpha, float *A, float *B); 443 void kernel_sgead_2_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B); 444 void kernel_sgead_1_0_lib4(int kmax, float *alpha, float *A, float *B); 445 // TODO 446 void kernel_sgeset_4_lib4(int kmax, float alpha, float *A); 447 void kernel_strset_4_lib4(int kmax, float alpha, float *A); 448 void kernel_sgetr_4_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc); 449 void kernel_sgetr_3_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc); 450 void kernel_sgetr_2_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc); 451 void kernel_sgetr_1_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc); 452 453 454 455 // pack 456 // 24 lib 8 457 void kernel_spack_nn_24_lib8(int kmax, float *A, int lda, float *B, int sdb); 458 void kernel_spack_nn_24_vs_lib8(int kmax, float *A, int lda, float *B, int sdb, int m1); 459 // 16 lib 8 460 void kernel_spack_nn_16_lib8(int kmax, float *A, int lda, float *B, int sdb); 461 void kernel_spack_nn_16_vs_lib8(int kmax, float *A, int lda, float *B, int sdb, int m1); 462 // 8 lib 8 463 void kernel_spack_nn_8_lib8(int kmax, float *A, int lda, float *B); 464 void kernel_spack_nn_8_vs_lib8(int kmax, float *A, int lda, float *B, int m1); 465 void kernel_spack_tn_8_lib8(int kmax, float *A, int lda, float *B); 466 void kernel_spack_tn_8_vs_lib8(int kmax, float *A, int lda, float *B, int m1); 467 void kernel_spack_tt_8_lib8(int kmax, float *A, int lda, float *B, int sdb); 468 void kernel_spack_tt_8_vs_lib8(int kmax, float *A, int lda, float *B, int sdb, int m1); 469 // 8 lib 4 470 void kernel_spack_nn_8_lib4(int kmax, float *A, int lda, float *B, int sdb); 471 void kernel_spack_nn_8_vs_lib4(int kmax, float *A, int lda, float *B, int sdb, int m1); 472 //void kernel_spack_tt_8_lib4(int kmax, float *A, int lda, float *B, int sdb); 473 // 4 474 void kernel_spack_nn_4_lib4(int kmax, float *A, int lda, float *B); 475 void kernel_spack_nn_4_vs_lib4(int kmax, float *A, int lda, float *B, int m1); 476 void kernel_spack_tn_4_lib4(int kmax, float *A, int lda, float *B); 477 void kernel_spack_tn_4_vs_lib4(int kmax, float *A, int lda, float *B, int m1); 478 void kernel_spack_tt_4_lib4(int kmax, float *A, int lda, float *B, int sdb); 479 void kernel_spack_tt_4_vs_lib4(int kmax, float *A, int lda, float *B, int sdb, int m1); 480 // unpack 481 // 8 482 void kernel_sunpack_nn_8_lib4(int kmax, float *A, int sda, float *B, int ldb); 483 void kernel_sunpack_nn_8_vs_lib4(int kmax, float *A, int sda, float *B, int ldb, int m1); 484 //void kernel_sunpack_tt_8_lib4(int kmax, float *A, int sda, float *B, int ldb); 485 // 4 486 void kernel_sunpack_nn_4_lib4(int kmax, float *A, float *B, int ldb); 487 void kernel_sunpack_nn_4_vs_lib4(int kmax, float *A, float *B, int ldb, int m1); 488 void kernel_sunpack_nt_4_lib4(int kmax, float *A, float *B, int ldb); 489 void kernel_sunpack_nt_4_vs_lib4(int kmax, float *A, float *B, int ldb, int m1); 490 void kernel_sunpack_tt_4_lib4(int kmax, float *A, int sda, float *B, int ldb); 491 492 // panel copy 493 // 4 494 void kernel_spacp_nt_4_lib4(int kmax, float *A, int offsetB, float *B, int sdb); 495 void kernel_spacp_tn_4_lib4(int kmax, int offsetA, float *A, int sda, float *B); 496 void kernel_spacp_nn_4_lib4(int kmax, int offsetA, float *A, int sda, float *B); 497 void kernel_spacp_nn_4_vs_lib4(int kmax, int offsetA, float *A, int sda, float *B, int m1); 498 499 500 501 /************************************************ 502 * BLAS API kernels 503 ************************************************/ 504 505 #ifdef BLAS_API 506 507 // A, B panel-major bs=8; C, D column-major 508 // 24x4 509 void kernel_sgemm_nt_24x4_lib88cc(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int ldc, float *D, int ldd); 510 void kernel_sgemm_nt_24x4_vs_lib88cc(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 511 // 16x4 512 void kernel_sgemm_nt_16x4_lib88cc(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int ldc, float *D, int ldd); 513 void kernel_sgemm_nt_16x4_vs_lib88cc(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 514 // 8x8 515 void kernel_sgemm_nt_8x8_lib88cc(int kmax, float *alpha, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd); 516 void kernel_sgemm_nt_8x8_vs_lib88cc(int kmax, float *alpha, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 517 // 8x4 518 void kernel_sgemm_nt_8x4_lib88cc(int kmax, float *alpha, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd); 519 void kernel_sgemm_nt_8x4_vs_lib88cc(int kmax, float *alpha, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 520 521 // A, B panel-major bs=4; C, D column-major 522 // 8x4 523 void kernel_sgemm_nt_8x4_lib44cc(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int ldc, float *D, int ldd); 524 //void kernel_sgemm_nt_8x4_vs_lib44cc(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 525 // 4x4 526 void kernel_sgemm_nt_4x4_lib44cc(int kmax, float *alpha, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd); 527 void kernel_sgemm_nt_4x4_vs_lib44cc(int kmax, float *alpha, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 528 void kernel_strsm_nt_rl_inv_4x4_lib44cc4(int kmax, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd, float *E, float *dE); 529 void kernel_strsm_nt_rl_inv_4x4_vs_lib44cc4(int kmax, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd, float *E, float *dE, int m1, int n1); 530 void kernel_strsm_nt_rl_inv_4x4_lib44ccc(int kmax, float *A, float *B, float *C, int ldc, float *D, int ldd, float *E, int lde, float *dE); 531 void kernel_strsm_nt_rl_inv_4x4_vs_lib44ccc(int kmax, float *A, float *B, float *C, int ldc, float *D, int ldd, float *E, int lde, float *dE, int m1, int n1); 532 void kernel_strsm_nt_rl_one_4x4_lib44cc4(int kmax, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd, float *E); 533 void kernel_strsm_nt_rl_one_4x4_vs_lib44cc4(int kmax, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd, float *E, int m1, int n1); 534 void kernel_strsm_nt_ru_inv_4x4_lib44cc4(int kmax, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd, float *E, float *dE); 535 void kernel_strsm_nt_ru_inv_4x4_vs_lib44cc4(int kmax, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd, float *E, float *dE, int m1, int n1); 536 void kernel_strsm_nt_ru_one_4x4_lib44cc4(int kmax, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd, float *E); 537 void kernel_strsm_nt_ru_one_4x4_vs_lib44cc4(int kmax, float *A, float *B, float *beta, float *C, int ldc, float *D, int ldd, float *E, int m1, int n1); 538 539 // B panel-major bs=4; A, C, D column-major 540 // 4x4 541 void kernel_sgemm_nt_4x4_libc4cc(int kmax, float *alpha, float *A, int lda, float *B, float *beta, float *C, int ldc, float *D, int ldd); 542 void kernel_sgemm_nt_4x4_vs_libc4cc(int kmax, float *alpha, float *A, int lda, float *B, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 543 void kernel_sgemm_tt_4x4_libc4cc(int kmax, float *alpha, float *A, int lda, float *B, float *beta, float *C, int ldc, float *D, int ldd); 544 void kernel_sgemm_tt_4x4_vs_libc4cc(int kmax, float *alpha, float *A, int lda, float *B, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 545 546 // A panel-major bs=4; B, C, D column-major 547 // 4x4 548 void kernel_sgemm_nn_4x4_lib4ccc(int kmax, float *alpha, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd); 549 void kernel_sgemm_nn_4x4_vs_lib4ccc(int kmax, float *alpha, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 550 void kernel_sgemm_nt_4x4_lib4ccc(int kmax, float *alpha, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd); 551 void kernel_sgemm_nt_4x4_vs_lib4ccc(int kmax, float *alpha, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 552 void kernel_strsm_nn_rl_inv_4x4_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, float *dE); 553 void kernel_strsm_nn_rl_inv_4x4_vs_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, float *dE, int m1, int n1); 554 void kernel_strsm_nn_rl_one_4x4_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde); 555 void kernel_strsm_nn_rl_one_4x4_vs_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, int m1, int n1); 556 void kernel_strsm_nt_rl_inv_4x4_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, float *dE); 557 void kernel_strsm_nt_rl_inv_4x4_vs_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, float *dE, int m1, int n1); 558 void kernel_strsm_nt_rl_one_4x4_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde); 559 void kernel_strsm_nt_rl_one_4x4_vs_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, int m1, int n1); 560 void kernel_strsm_nn_ru_inv_4x4_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, float *dE); 561 void kernel_strsm_nn_ru_inv_4x4_vs_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, float *dE, int m1, int n1); 562 void kernel_strsm_nn_ru_one_4x4_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde); 563 void kernel_strsm_nn_ru_one_4x4_vs_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, int m1, int n1); 564 void kernel_strsm_nt_ru_inv_4x4_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, float *dE); 565 void kernel_strsm_nt_ru_inv_4x4_vs_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, float *dE, int m1, int n1); 566 void kernel_strsm_nt_ru_one_4x4_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde); 567 void kernel_strsm_nt_ru_one_4x4_vs_lib4cccc(int kmax, float *A, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, float *E, int lde, int m1, int n1); 568 569 // A, C, D panel-major; B, E column-major 570 // TODO merge with above 571 // 4x4 572 void kernel_strsm_nn_rl_inv_4x4_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, float *dE); 573 void kernel_strsm_nn_rl_inv_4x4_vs_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, float *dE, int m1, int n1); 574 void kernel_strsm_nn_rl_one_4x4_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde); 575 void kernel_strsm_nn_rl_one_4x4_vs_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, int m1, int n1); 576 void kernel_strsm_nn_ru_inv_4x4_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, float *dE); 577 void kernel_strsm_nn_ru_inv_4x4_vs_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, float *dE, int m1, int n1); 578 void kernel_strsm_nn_ru_one_4x4_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde); 579 void kernel_strsm_nn_ru_one_4x4_vs_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, int m1, int n1); 580 void kernel_strsm_nt_rl_inv_4x4_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, float *dE); 581 void kernel_strsm_nt_rl_inv_4x4_vs_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, float *dE, int m1, int n1); 582 void kernel_strsm_nt_rl_one_4x4_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde); 583 void kernel_strsm_nt_rl_one_4x4_vs_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, int m1, int n1); 584 void kernel_strsm_nt_ru_inv_4x4_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, float *dE); 585 void kernel_strsm_nt_ru_inv_4x4_vs_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, float *dE, int m1, int n1); 586 void kernel_strsm_nt_ru_one_4x4_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde); 587 void kernel_strsm_nt_ru_one_4x4_vs_lib4c44c(int kmax, float *A, float *B, int ldb, float *beta, float *C, float *D, float *E, int lde, int m1, int n1); 588 589 // A, B, C, D column-major 590 void kernel_sgemm_nn_4x4_libcccc(int kmax, float *alpha, float *A, int lda, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd); 591 void kernel_sgemm_nn_4x4_vs_libcccc(int kmax, float *alpha, float *A, int lda, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 592 void kernel_sgemm_nt_4x4_libcccc(int kmax, float *alpha, float *A, int lda, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd); 593 void kernel_sgemm_nt_4x4_vs_libcccc(int kmax, float *alpha, float *A, int lda, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 594 void kernel_sgemm_tt_4x4_libcccc(int kmax, float *alpha, float *A, int lda, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd); 595 void kernel_sgemm_tt_4x4_vs_libcccc(int kmax, float *alpha, float *A, int lda, float *B, int ldb, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1); 596 597 // vector 598 void kernel_sdot_11_lib(int n, float *x, float *y, float *res); 599 void kernel_saxpy_11_lib(int n, float *alpha, float *x, float *y); 600 601 602 #endif // BLAS_API 603 604 605 606 #ifdef __cplusplus 607 } 608 #endif 609 610 #endif // BLASFEO_S_KERNEL_H_ 611