/dports/math/openblas/OpenBLAS-0.3.18/kernel/x86_64/ |
H A D | dgemm_small_kernel_tt_skylakex.c | 42 _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); 44 _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); 53 __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ 55 _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); 57 …__m512d tmp##M##N = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc… 59 _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); 292 __m512i vindex_n = _mm512_loadu_si512(index_n); in CNAME() local
|
H A D | sgemm_small_kernel_tt_skylakex.c | 42 _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); 44 _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); 53 __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ 55 _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); 57 …__m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc… 59 _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); 314 __m512i vindex_n = _mm512_loadu_si512(index_n); in CNAME() local
|
H A D | dgemm_small_kernel_nt_skylakex.c | 48 _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); 50 _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8) 61 __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ 63 _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); 65 …__m512d tmp##M##N = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc… 67 _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); 388 __m512i vindex_n = _mm512_loadu_si512(index_n); in CNAME() local
|
H A D | sgemm_small_kernel_nt_skylakex.c | 48 _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); 50 _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4) 61 __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ 63 _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); 65 …__m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc… 67 _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); 388 __m512i vindex_n = _mm512_loadu_si512(index_n); in CNAME() local
|
H A D | sgemm_small_kernel_tn_skylakex.c | 59 #define STORE_N4(M, s0) _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); 67 s0 = _mm_fmadd_ps(_mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4), beta_128, s0); \ 68 _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); 100 __m128i vindex_n = _mm_set_epi32(ldc*3, ldc*2, ldc, 0); in CNAME() local
|
H A D | dgemm_small_kernel_tn_skylakex.c | 58 #define STORE_N4(M, s0) _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); 66 s0 = _mm256_fmadd_pd(_mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8), beta_256, s0); \ 67 _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); 98 __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc, 0); in CNAME() local
|
H A D | dgemm_small_kernel_nn_skylakex.c | 78 _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ 89 s1 = _mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8); \ 91 _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ 370 __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc*1, 0); in CNAME() local
|
H A D | sgemm_small_kernel_nn_skylakex.c | 79 _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ 90 s1 = _mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4); \ 92 _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ 398 __m128i vindex_n = _mm_set_epi32(ldc*3, ldc*2, ldc, 0); in CNAME() local
|