1 /*
2     Genome-wide Efficient Mixed Model Association (GEMMA)
3     Copyright © 2011-2017, Xiang Zhou
4     Copyright © 2017, Peter Carbonetto
5     Copyright © 2017-2020, Pjotr Prins
6 
7     This program is free software: you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation, either version 3 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License
18     along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include <algorithm>    // std::min
22 #include <cmath>
23 #include <iomanip>
24 #include <vector>
25 #include "debug.h"
26 #include "mathfunc.h"
27 #include <string.h>
28 #include "fastblas.h"
29 
30 const char *FastblasTrans = "T";
31 const char *FastblasNoTrans = "N";
32 
33 using namespace std;
34 
35 /*
36    Reasonably fast function to copy data from standard C array into
37    gsl_matrix. Avoid it for performance critical sections.
38 */
fast_copy(gsl_matrix * m,const double * mem)39 gsl_matrix *fast_copy(gsl_matrix *m, const double *mem) {
40   auto rows = m->size1;
41   auto cols = m->size2;
42   if (is_strict_mode()) { // slower correct version
43     for (size_t r=0; r<rows; r++) {
44       for (size_t c=0; c<cols; c++) {
45         gsl_matrix_set(m,r,c,mem[r*cols+c]);
46       }
47     }
48   } else { // faster goes by row
49     auto v = gsl_vector_calloc(cols);
50     enforce(v); // just to be sure
51     for (size_t r=0; r<rows; r++) {
52       assert(v->size == cols);
53       assert(v->block->size == cols);
54       assert(v->stride == 1);
55       memcpy(v->block->data,&mem[r*cols],cols*sizeof(double));
56       gsl_matrix_set_row(m,r,v);
57     }
58     gsl_vector_free(v);
59   }
60   return m;
61 }
62 
63 /*
64     Helper function fast_cblas_dgemm runs the local dgemm
65 */
fast_cblas_dgemm(const enum CBLAS_ORDER Order,const enum CBLAS_TRANSPOSE TransA,const enum CBLAS_TRANSPOSE TransB,const size_t M,const size_t N,const size_t K,const double alpha,const double * A,const size_t lda,const double * B,const size_t ldb,const double beta,double * C,const size_t ldc)66 void fast_cblas_dgemm(const enum CBLAS_ORDER Order,
67                       const enum CBLAS_TRANSPOSE TransA,
68                       const enum CBLAS_TRANSPOSE TransB,
69                       const size_t M,
70                       const size_t N,
71                       const size_t K,
72                       const double alpha,
73                       const double *A,
74                       const size_t lda,
75                       const double *B,
76                       const size_t ldb,
77                       const double beta,
78                       double *C,
79                       const size_t ldc) {
80 #ifndef NDEBUG
81   if (is_debug_mode()) {
82     #ifdef DISABLED
83     size_t i,j;
84     printf (" Top left corner of matrix A: \n");
85     for (i=0; i<min(M,6); i++) {
86       for (j=0; j<min(K,6); j++) {
87         printf ("%12.0f", A[j+i*K]);
88       }
89       printf ("\n");
90     }
91 
92     printf ("\n Top left corner of matrix B: \n");
93     for (i=0; i<min(K,6); i++) {
94       for (j=0; j<min(N,6); j++) {
95         printf ("%12.0f", B[j+i*N]);
96       }
97       printf ("\n");
98     }
99 
100     printf ("\n Top left corner of matrix C: \n");
101     for (i=0; i<min(M,6); i++) {
102       for (j=0; j<min(N,6); j++) {
103         printf ("%12.5G", C[j+i*N]);
104       }
105       printf ("\n");
106     }
107     #endif
108 
109     cout << scientific << setprecision(3) << "* RowMajor " << Order << "\t" ;
110     cout << "transA " << TransA << "\t" ;
111     cout << "transB " << TransB << "\t" ;
112     cout << "m " << M << "\t" ;
113     cout << "n " << N << "\t" ;
114     cout << "k " << K << "\n" ;
115     cout << "* lda " << lda << "\t" ;
116     cout << "ldb " << ldb << "\t" ;
117     cout << "ldc " << ldc << "\t" ;
118     cout << "alpha " << alpha << "\t" ;
119     cout << "beta " << beta << "\n" ;
120     cout << "* A03 " << A[3] << "\t" ;
121     cout << "B03 " << B[3] << "\t" ;
122     cout << "C03 " << C[3] << "\t" ;
123     cout << "Asum " << sum(A,M,K) << "\t" ;
124     cout << "Bsum " << sum(B,K,N) << "\n" ;
125     cout << "Csum " << sum(C,M,N) << "\n" ;
126   }
127 #endif // NDEBUG
128 
129   // Check for (integer) overflows
130   enforce(M>0);
131   enforce(N>0);
132   enforce(K>0);
133 
134   // check_int_mult_overflow(560000,8000); // fails on default int (32-bits)
135   check_int_mult_overflow(M,K);
136   check_int_mult_overflow(N,K);
137   check_int_mult_overflow(M,N);
138 
139   cblas_dgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
140 
141 #ifndef NDEBUG
142   #ifdef DISABLED
143   if (is_debug_mode()) {
144     printf (" Top left corner of matrix A (cols=k %i, rows=m %i): \n",K,M);
145     for (i=0; i<min(M,6); i++) {
146       for (j=0; j<min(K,6); j++) {
147         printf ("%12.0f", A[j+i*K]);
148       }
149       printf ("\n");
150     }
151 
152     printf ("\n Top left corner of matrix B: \n");
153     for (i=0; i<min(K,6); i++) {
154       for (j=0; j<min(N,6); j++) {
155         printf ("%12.0f", B[j+i*N]);
156       }
157       printf ("\n");
158     }
159 
160     printf ("\n Top left corner of matrix C: \n");
161     for (i=0; i<min(M,6); i++) {
162       for (j=0; j<min(N,6); j++) {
163       printf ("%12.5G", C[j+i*N]);
164       }
165       printf ("\n");
166     }
167   }
168   #endif
169 #endif // NDEBUG
170 }
171 
172 /*
173     Helper function fast_cblas_dgemm converts a GEMMA layout to cblas_dgemm.
174 */
fast_cblas_dgemm(const char * TransA,const char * TransB,const double alpha,const gsl_matrix * A,const gsl_matrix * B,const double beta,gsl_matrix * C)175 static void fast_cblas_dgemm(const char *TransA, const char *TransB, const double alpha,
176                              const gsl_matrix *A, const gsl_matrix *B, const double beta,
177                              gsl_matrix *C) {
178   // C++ is row-major
179   auto transA = (*TransA == 'N' || *TransA == 'n' ? CblasNoTrans : CblasTrans);
180   auto transB = (*TransB == 'N' || *TransB == 'n' ? CblasNoTrans : CblasTrans);
181   const size_t M   = C->size1;
182   const size_t N   = C->size2;
183   const size_t MA  = (transA == CblasNoTrans) ? A->size1 : A->size2;
184   const size_t NA  = (transA == CblasNoTrans) ? A->size2 : A->size1;
185   const size_t MBx = (transB == CblasNoTrans) ? B->size1 : B->size2;
186   const size_t NB  = (transB == CblasNoTrans) ? B->size2 : B->size1;
187 
188   if (M == MA && N == NB && NA == MBx) {  /* [MxN] = [MAxNA][MBxNB] */
189 
190     auto K = NA;
191 
192     // Check for (integer) overflows
193     enforce(M>0);
194     enforce(N>0);
195     enforce(K>0);
196 
197     // check_int_mult_overflow(560000,8000);
198     check_int_mult_overflow(M,K);
199     check_int_mult_overflow(N,K);
200     check_int_mult_overflow(M,N);
201 
202     cblas_dgemm (CblasRowMajor, transA, transB, M, N, NA,
203                  alpha, A->data, A->tda, B->data, B->tda, beta,
204                  C->data, C->tda);
205 
206   } else {
207     fail_msg("Range error in dgemm");
208   }
209 }
210 
211 
212 /*
213    Use the fast/supported way to call BLAS dgemm
214 */
215 
fast_dgemm(const char * TransA,const char * TransB,const double alpha,const gsl_matrix * A,const gsl_matrix * B,const double beta,gsl_matrix * C)216 void fast_dgemm(const char *TransA, const char *TransB, const double alpha,
217                 const gsl_matrix *A, const gsl_matrix *B, const double beta,
218                 gsl_matrix *C) {
219   fast_cblas_dgemm(TransA,TransB,alpha,A,B,beta,C);
220 
221 #ifdef DISABLE
222   if (is_check_mode()) {
223     // ---- validate with original implementation
224     gsl_matrix *C1 = gsl_matrix_alloc(C->size1,C->size2);
225     eigenlib_dgemm(TransA,TransB,alpha,A,B,beta,C1);
226     enforce_msg(gsl_matrix_equal(C,C1),"dgemm outcomes are not equal for fast & eigenlib");
227     gsl_matrix_free(C1);
228   }
229 #endif
230 }
231 
fast_eigen_dgemm(const char * TransA,const char * TransB,const double alpha,const gsl_matrix * A,const gsl_matrix * B,const double beta,gsl_matrix * C)232 void fast_eigen_dgemm(const char *TransA, const char *TransB, const double alpha,
233                       const gsl_matrix *A, const gsl_matrix *B, const double beta,
234                       gsl_matrix *C) {
235     fast_cblas_dgemm(TransA,TransB,alpha,A,B,beta,C);
236 }
237 
238 /*
239  *  Inverse in place
240  */
241 
242 #include <gsl/gsl_permutation.h>
243 // #include <gsl/gsl_linalg.h>
244 
245 extern "C" {
246   int gsl_linalg_LU_invert(const gsl_matrix * LU, const gsl_permutation * p, gsl_matrix * inverse);
247   int gsl_linalg_LU_decomp(gsl_matrix * A, gsl_permutation * p, int * signum);
248 }
249 
gsl_matrix_inv(gsl_matrix * m)250 void gsl_matrix_inv(gsl_matrix *m)
251 {
252     size_t n=m->size1;
253 
254     gsl_matrix *temp1=gsl_matrix_calloc(n,n);
255     gsl_matrix_memcpy(temp1,m);
256 
257     gsl_permutation *p=gsl_permutation_calloc(n);
258     int sign=0;
259     gsl_linalg_LU_decomp(temp1,p,&sign);
260     gsl_matrix *inverse=gsl_matrix_calloc(n,n);
261 
262     gsl_linalg_LU_invert(temp1,p,inverse);
263     gsl_matrix_memcpy(m,inverse);
264 
265     gsl_permutation_free(p);
266     gsl_matrix_free(temp1);
267     gsl_matrix_free(inverse);
268 
269 }
270 
fast_inverse(gsl_matrix * m)271 void fast_inverse(gsl_matrix *m) {
272     gsl_matrix_inv(m);
273 }
274