1/*
2 * Copyright (C) 2013,2014  Pascal Giorgi
3 *
4 * Written by Pascal Giorgi <pascal.giorgi@lirmm.fr>
5 * the code is inspired and adapted from the Eigen library
6 * modified by Brice Boyer (briceboyer) <boyer.brice@gmail.com>
7 *
8 * ========LICENCE========
9 * This file is part of the library FFLAS-FFPACK.
10 *
11 * FFLAS-FFPACK is free software: you can redistribute it and/or modify
12 * it under the terms of the  GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
24 * ========LICENCE========
25 *.
26 */
27
28#ifndef __FFLASFFPACK_fflas_igemm_igemm_tools_INL
29#define __FFLASFFPACK_fflas_igemm_igemm_tools_INL
30
31#include "fflas-ffpack/fflas/fflas_simd.h"
32
33namespace FFLAS { namespace details {
34
35
36    // store each rows x k submatrices of Rhs in row major mode
37    // if k does not divide cols, the remaining column are not packed
38    template<size_t k, bool transpose>
39    void pack_rhs(int64_t* XX, const int64_t* X, size_t ldx, size_t rows, size_t cols)
40    {
41        size_t cols_by_k=(cols/k)*k;
42        size_t p=0;
43        // pack by k columns
44        for(size_t j=0;j<cols_by_k;j+=k){
45            for(size_t i=0;i<rows;i++)
46                //! @bug this is fassign
47                for (size_t l=0;l<k;l++,p++) {
48                    XX[p]=X[i+(j+l)*ldx];
49                }
50        }
51        if (transpose){
52            if (cols-cols_by_k>=StepA){
53                for(size_t i=0;i<rows;i++) {
54                    for (size_t l=0;l<StepA;l++,p++)
55                        XX[p]=X[i+(cols_by_k+l)*ldx];
56                }
57                cols_by_k+=StepA;
58            }
59        }
60        // the remaining columns are not packed
61        for(size_t j=cols_by_k;j<cols;j++)
62            //! @bug this is fassign
63            for(size_t i=0;i<rows;i++,p++) {
64                XX[p]=X[i+j*ldx];
65            }
66    }
67
68
69    // store each k x cols submatrices of Lhs in column major mode
70    // if k does not divide rows, the remaining rows are not packed
71    template<size_t k, bool transpose>
72    void pack_lhs(int64_t* XX, const int64_t* X, size_t ldx, size_t rows, size_t cols)
73    {
74        //using simd = Simd<int64_t> ;
75        size_t p=0;
76        size_t rows_by_k=(rows/k)*k;
77        // pack rows by group of k
78        for(size_t i=0;i<rows_by_k;i+=k)
79            for(size_t j=0;j<cols;j++) {
80                for (size_t l=0;l<k;l++,p++) XX[p]=X[i+l+j*ldx];
81                //FFLASFFPACK_check(k%simd::vect_size == 0);
82                //! @bug this is fassign
83                // for (size_t l=0;l<k;l+= simd::vect_size, p+=simd::vect_size){
84                // 	simd::store(&XX[p],simd::loadu(&X[i+l+j*ldx]));
85                // }
86            }
87        // the remaining rows are packed by group of StepA (if possible)
88        if (!transpose) {
89            if (rows-rows_by_k>=StepA){
90                for(size_t j=0;j<cols;j++) {
91                    for (size_t l=0;l<StepA;l++,p++) XX[p]=X[rows_by_k+l+j*ldx];
92                    // FFLASFFPACK_check(StepA%simd::vect_size == 0);
93                    // for (size_t l=0;l<StepA;l+=simd::vect_size,p+=simd::vect_size){
94                    // 	simd::store(&XX[p],simd::loadu(&X[rows_by_k+l+j*ldx]));
95                    // }
96                }
97                rows_by_k+=StepA;
98            }
99        }
100        for(size_t i=rows_by_k;i<rows;i++) {
101            //! @bug this is fassign
102            for(size_t j=0;j<cols;j++,p++){
103                XX[p]=X[i+j*ldx];
104            }
105        }
106
107    }
108
109    inline void BlockingFactor(size_t& m, size_t& n, size_t& k)
110    {
111        int l1, l2, l3, tlb;
112        queryCacheSizes(l1,l2,l3);
113        getTLBSize(tlb);
114        /*
115           cout<<"Cache size: ";
116           cout<<"L1 ("<<l1<<") ";
117           cout<<"L2 ("<<l2<<") ";
118           cout<<"L3 ("<<l3<<") ";
119           cout<<"TLB ("<<tlb<<") ";
120           cout<<endl;
121           */
122        l2=std::max(l2,l3);
123        if (tlb)
124            l2=std::min(l2,tlb);
125        size_t kc,mc;
126        // kc * 2*(_mr+_nr) must fit in L1 cache
127        // kc * (n+mc) must fit in L2 cache and in TLB
128        size_t kdiv= 2*(_nr+_mr)*sizeof(int64_t);
129        kc = std::min(k, l1/kdiv);
130        mc = std::min(m, l2/(sizeof(int64_t) * kc));
131        k=kc;
132        m=mc;
133        //cout<<"mc="<<m<<endl;
134        //cout<<"kc="<<k<<endl;
135    }
136
137
138} // details
139} // FFLAS
140
141#endif // __FFLASFFPACK_fflas_igemm_igemm_tools_INL
142
143/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
144// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
145