1/* 2 * Copyright (C) 2013,2014 Pascal Giorgi 3 * 4 * Written by Pascal Giorgi <pascal.giorgi@lirmm.fr> 5 * the code is inspired and adapted from the Eigen library 6 * modified by Brice Boyer (briceboyer) <boyer.brice@gmail.com> 7 * 8 * ========LICENCE======== 9 * This file is part of the library FFLAS-FFPACK. 10 * 11 * FFLAS-FFPACK is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public 13 * License as published by the Free Software Foundation; either 14 * version 2.1 of the License, or (at your option) any later version. 15 * 16 * This library is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * Lesser General Public License for more details. 20 * 21 * You should have received a copy of the GNU Lesser General Public 22 * License along with this library; if not, write to the Free Software 23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 24 * ========LICENCE======== 25 *. 26 */ 27 28#ifndef __FFLASFFPACK_fflas_igemm_igemm_tools_INL 29#define __FFLASFFPACK_fflas_igemm_igemm_tools_INL 30 31#include "fflas-ffpack/fflas/fflas_simd.h" 32 33namespace FFLAS { namespace details { 34 35 36 // store each rows x k submatrices of Rhs in row major mode 37 // if k does not divide cols, the remaining column are not packed 38 template<size_t k, bool transpose> 39 void pack_rhs(int64_t* XX, const int64_t* X, size_t ldx, size_t rows, size_t cols) 40 { 41 size_t cols_by_k=(cols/k)*k; 42 size_t p=0; 43 // pack by k columns 44 for(size_t j=0;j<cols_by_k;j+=k){ 45 for(size_t i=0;i<rows;i++) 46 //! @bug this is fassign 47 for (size_t l=0;l<k;l++,p++) { 48 XX[p]=X[i+(j+l)*ldx]; 49 } 50 } 51 if (transpose){ 52 if (cols-cols_by_k>=StepA){ 53 for(size_t i=0;i<rows;i++) { 54 for (size_t l=0;l<StepA;l++,p++) 55 XX[p]=X[i+(cols_by_k+l)*ldx]; 56 } 57 cols_by_k+=StepA; 58 } 59 } 60 // the remaining columns are not packed 61 for(size_t j=cols_by_k;j<cols;j++) 62 //! @bug this is fassign 63 for(size_t i=0;i<rows;i++,p++) { 64 XX[p]=X[i+j*ldx]; 65 } 66 } 67 68 69 // store each k x cols submatrices of Lhs in column major mode 70 // if k does not divide rows, the remaining rows are not packed 71 template<size_t k, bool transpose> 72 void pack_lhs(int64_t* XX, const int64_t* X, size_t ldx, size_t rows, size_t cols) 73 { 74 //using simd = Simd<int64_t> ; 75 size_t p=0; 76 size_t rows_by_k=(rows/k)*k; 77 // pack rows by group of k 78 for(size_t i=0;i<rows_by_k;i+=k) 79 for(size_t j=0;j<cols;j++) { 80 for (size_t l=0;l<k;l++,p++) XX[p]=X[i+l+j*ldx]; 81 //FFLASFFPACK_check(k%simd::vect_size == 0); 82 //! @bug this is fassign 83 // for (size_t l=0;l<k;l+= simd::vect_size, p+=simd::vect_size){ 84 // simd::store(&XX[p],simd::loadu(&X[i+l+j*ldx])); 85 // } 86 } 87 // the remaining rows are packed by group of StepA (if possible) 88 if (!transpose) { 89 if (rows-rows_by_k>=StepA){ 90 for(size_t j=0;j<cols;j++) { 91 for (size_t l=0;l<StepA;l++,p++) XX[p]=X[rows_by_k+l+j*ldx]; 92 // FFLASFFPACK_check(StepA%simd::vect_size == 0); 93 // for (size_t l=0;l<StepA;l+=simd::vect_size,p+=simd::vect_size){ 94 // simd::store(&XX[p],simd::loadu(&X[rows_by_k+l+j*ldx])); 95 // } 96 } 97 rows_by_k+=StepA; 98 } 99 } 100 for(size_t i=rows_by_k;i<rows;i++) { 101 //! @bug this is fassign 102 for(size_t j=0;j<cols;j++,p++){ 103 XX[p]=X[i+j*ldx]; 104 } 105 } 106 107 } 108 109 inline void BlockingFactor(size_t& m, size_t& n, size_t& k) 110 { 111 int l1, l2, l3, tlb; 112 queryCacheSizes(l1,l2,l3); 113 getTLBSize(tlb); 114 /* 115 cout<<"Cache size: "; 116 cout<<"L1 ("<<l1<<") "; 117 cout<<"L2 ("<<l2<<") "; 118 cout<<"L3 ("<<l3<<") "; 119 cout<<"TLB ("<<tlb<<") "; 120 cout<<endl; 121 */ 122 l2=std::max(l2,l3); 123 if (tlb) 124 l2=std::min(l2,tlb); 125 size_t kc,mc; 126 // kc * 2*(_mr+_nr) must fit in L1 cache 127 // kc * (n+mc) must fit in L2 cache and in TLB 128 size_t kdiv= 2*(_nr+_mr)*sizeof(int64_t); 129 kc = std::min(k, l1/kdiv); 130 mc = std::min(m, l2/(sizeof(int64_t) * kc)); 131 k=kc; 132 m=mc; 133 //cout<<"mc="<<m<<endl; 134 //cout<<"kc="<<k<<endl; 135 } 136 137 138} // details 139} // FFLAS 140 141#endif // __FFLASFFPACK_fflas_igemm_igemm_tools_INL 142 143/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 144// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s 145