1 /*
2   HMat-OSS (HMatrix library, open source software)
3 
4   Copyright (C) 2014-2015 Airbus Group SAS
5 
6   This program is free software; you can redistribute it and/or
7   modify it under the terms of the GNU General Public License
8   as published by the Free Software Foundation; either version 2
9   of the License, or (at your option) any later version.
10 
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15 
16   You should have received a copy of the GNU General Public License
17   along with this program; if not, write to the Free Software
18   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
19 
20   http://github.com/jeromerobert/hmat-oss
21 */
22 
23 /*! \file
24   \ingroup HMatrix
25   \brief Dense Matrix implementation.
26 */
27 #include "config.h"
28 
29 #ifdef __INTEL_COMPILER
30 #include <mathimf.h>
31 #else
32 #include <cmath>
33 #endif
34 
35 #include "scalar_array.hpp"
36 
37 #include "data_types.hpp"
38 #include "lapack_overloads.hpp"
39 #include "blas_overloads.hpp"
40 #include "lapack_exception.hpp"
41 #include "common/memory_instrumentation.hpp"
42 #include "system_types.h"
43 #include "common/my_assert.h"
44 #include "common/context.hpp"
45 #include "common/timeline.hpp"
46 #include "lapack_operations.hpp"
47 
48 #include <cstring> // memset
49 #include <algorithm> // swap
50 #include <iostream>
51 #include <fstream>
52 #include <cmath>
53 #include <fcntl.h>
54 
55 #ifndef _WIN32
56 #include <sys/mman.h> // mmap
57 #endif
58 
59 #include <sys/stat.h>
60 
61 #ifdef HAVE_UNISTD_H
62 #include <unistd.h>
63 #endif
64 
65 #include <stdlib.h>
66 
67 #ifdef HAVE_JEMALLOC
68 #define JEMALLOC_NO_DEMANGLE
69 #include <jemalloc/jemalloc.h>
70 #endif
71 
72 #ifdef _MSC_VER
73 // Intel compiler defines isnan in global namespace
74 // MSVC defines _isnan
75 # ifndef __INTEL_COMPILER
76 #  define isnan _isnan
77 # endif
78 #elif __GLIBC__ == 2 && __GLIBC_MINOR__ < 23
79 // https://sourceware.org/bugzilla/show_bug.cgi?id=19439
80 #elif __cplusplus >= 201103L || !defined(__GLIBC__)
81 using std::isnan;
82 #endif
83 
84 namespace hmat {
85 
convert_int_to_factorization(int t)86 Factorization convert_int_to_factorization(int t) {
87     switch (t) {
88     case hmat_factorization_none:
89         return Factorization::NONE;
90     case hmat_factorization_lu:
91         return Factorization::LU;
92     case hmat_factorization_ldlt:
93         return Factorization::LDLT;
94     case hmat_factorization_llt:
95         return Factorization::LLT;
96     default:
97         HMAT_ASSERT(false);
98     }
99 }
100 
convert_factorization_to_int(Factorization f)101 int convert_factorization_to_int(Factorization f) {
102     switch (f) {
103     case Factorization::NONE:
104         return hmat_factorization_none;
105     case Factorization::LU:
106         return hmat_factorization_lu;
107     case Factorization::LDLT:
108         return hmat_factorization_ldlt;
109     case Factorization::LLT:
110         return hmat_factorization_llt;
111     default:
112         HMAT_ASSERT(false);
113     }
114 }
115 
116 /** ScalarArray */
/** Wrap an existing, caller-owned buffer (no copy, no ownership of the data).
 *
 * @param _m    column-major data buffer, not freed by this object
 * @param _rows number of rows
 * @param _cols number of columns
 * @param _lda  leading dimension; -1 means "contiguous" (lda = rows)
 */
template<typename T>
ScalarArray<T>::ScalarArray(T* _m, int _rows, int _cols, int _lda)
  : ownsMemory(false), m(_m), rows(_rows), cols(_cols), lda(_lda) {
  if (lda == -1) {
    lda = rows;
  }
  // The orthogonality flag is still owned (and freed) by this instance,
  // even though the data buffer is not.
  ownsFlag = true ;
  is_ortho = (int*)calloc(1, sizeof(int));
  assert(lda >= rows);
}
127 
128 template<typename T>
ScalarArray(int _rows,int _cols,bool initzero)129 ScalarArray<T>::ScalarArray(int _rows, int _cols, bool initzero)
130   : ownsMemory(true), ownsFlag(true), rows(_rows), cols(_cols), lda(_rows) {
131   size_t size = sizeof(T) * rows * cols;
132   void * p;
133 #ifdef HAVE_JEMALLOC
134   p = initzero ? je_calloc(size, 1) : je_malloc(size);
135 #else
136   p = initzero ? calloc(size, 1) : malloc(size);
137 #endif
138   m = static_cast<T*>(p);
139   is_ortho = (int*)calloc(1, sizeof(int));
140   setOrtho(initzero ? 1 : 0); // buffer filled with 0 is orthogonal
141   HMAT_ASSERT_MSG(m, "Trying to allocate %ldb of memory failed (rows=%d cols=%d sizeof(T)=%d)", size, rows, cols, sizeof(T));
142   MemoryInstrumenter::instance().alloc(size, MemoryInstrumenter::FULL_MATRIX);
143 }
144 
/** Destructor: release the data buffer only if owned, and the orthogonality
 *  flag only if this instance owns it (views share the flag of their parent). */
template<typename T> ScalarArray<T>::~ScalarArray() {
  if (ownsMemory) {
    size_t size = ((size_t) rows) * cols * sizeof(T);
    MemoryInstrumenter::instance().free(size, MemoryInstrumenter::FULL_MATRIX);
    // Must match the allocator used in the allocating constructor.
#ifdef HAVE_JEMALLOC
    je_free(m);
#else
    free(m);
#endif
    m = NULL;
  }
  if (ownsFlag) {
    free(is_ortho); // allocated with calloc in the constructors
    is_ortho = NULL;
  }
}
161 
resize(int col_num)162 template<typename T> void ScalarArray<T>::resize(int col_num) {
163   assert(ownsFlag);
164   if(col_num > cols)
165     setOrtho(0);
166   int diffcol = col_num - cols;
167   if(diffcol > 0)
168     MemoryInstrumenter::instance().alloc(sizeof(T) * rows * diffcol,
169                                          MemoryInstrumenter::FULL_MATRIX);
170   else
171     MemoryInstrumenter::instance().free(sizeof(T) * rows * -diffcol,
172                                         MemoryInstrumenter::FULL_MATRIX);
173   cols = col_num;
174 #ifdef HAVE_JEMALLOC
175   void * p = je_realloc(m, sizeof(T) * rows * cols);
176 #else
177   void * p = realloc(m, sizeof(T) * rows * cols);
178 #endif
179   m = static_cast<T*>(p);
180 }
181 
clear()182 template<typename T> void ScalarArray<T>::clear() {
183   assert(lda == rows);
184   std::fill(m, m + ((size_t) rows) * cols, Constants<T>::zero);
185   setOrtho(1); // we dont use ptr(): buffer filled with 0 is orthogonal
186 }
187 
storedZeros() const188 template<typename T> size_t ScalarArray<T>::storedZeros() const {
189   size_t result = 0;
190   for (int col = 0; col < cols; col++) {
191     for (int row = 0; row < rows; row++) {
192       if (std::abs(get(row, col)) < 1e-16) {
193         result++;
194       }
195     }
196   }
197   return result;
198 }
199 
/** In-place scaling: this *= alpha.
 *
 * Scaling by zero delegates to clear() (and marks the array orthogonal).
 * Contiguous storage is scaled with BLAS scal, chunked so each call stays
 * within a 32-bit element count; otherwise columns are scaled one by one.
 */
template<typename T> void ScalarArray<T>::scale(T alpha) {
  increment_flops(Multipliers<T>::mul * ((size_t) rows) * cols);
  if (lda == rows) {
    if (alpha == Constants<T>::zero) {
      this->clear();
    } else {
      // Warning: check for overflow
      size_t nm = ((size_t) rows) * cols;
      const size_t block_size_blas = 1 << 30;
      // Process 2^30-element chunks from the tail of the buffer, then one
      // final scal call for the remainder at the head.
      while (nm > block_size_blas) {
        proxy_cblas::scal(block_size_blas, alpha, ptr() + nm - block_size_blas, 1);
        nm -= block_size_blas;
      }
      proxy_cblas::scal(nm, alpha, ptr(), 1);
    }
  } else {
    // Non-contiguous storage: handle each column separately.
    T* x = ptr();
    if (alpha == Constants<T>::zero) {
      for (int col = 0; col < cols; col++) {
        std::fill(x, x + rows, Constants<T>::zero);
        x += lda;
      }
    } else {
      for (int col = 0; col < cols; col++) {
        proxy_cblas::scal(rows, alpha, x, 1);
        x += lda;
      }
    }
  }
  if (alpha == Constants<T>::zero) setOrtho(1); // buffer filled with 0 is orthogonal
}
231 
/** In-place transposition (requires contiguous storage, lda == rows).
 *
 * Uses MKL's imatcopy when available. Otherwise: an in-place element swap
 * for square matrices, or a full copy + rewrite for rectangular ones.
 */
template<typename T> void ScalarArray<T>::transpose() {
  assert(lda == rows);
#ifdef HAVE_MKL_IMATCOPY
  proxy_mkl::imatcopy(rows, cols, ptr());
  std::swap(rows, cols);
  lda = rows;
#else
  if (rows == cols) {
    // "Fast" path: swap across the diagonal, no extra storage needed.
    for (int col = 0; col < cols; col++) {
      for (int row = 0; row < col; row++) {
        T tmp = get(row, col);
        get(row, col) = get(col, row);
        get(col, row) = tmp;
      }
    }
  } else {
    // Rectangular case: transpose through a temporary copy of the data.
    ScalarArray<T> *tmp=copy();
    std::swap(rows, cols);
    lda = rows;
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        get(i, j) = tmp->get(j, i);
      }
    }
    delete(tmp);
  }
#endif
}
261 
/** In-place complex conjugation of every coefficient (no-op effect for real T
 *  is delegated to lacgv's semantics).
 *
 * Contiguous storage is processed in 2^30-element chunks to respect LAPACK's
 * 32-bit sizes; otherwise one lacgv call per column.
 */
template<typename T> void ScalarArray<T>::conjugate() {
  if (lda == rows) {
    // Warning: check for overflow
    size_t nm = ((size_t) rows) * cols;
    const size_t block_size_blas = 1 << 30;
    while (nm > block_size_blas) {
      // We don't use c->ptr() on purpose, because is_ortho is preserved here
      proxy_lapack::lacgv(block_size_blas, m + nm - block_size_blas, 1);
      nm -= block_size_blas;
    }
    proxy_lapack::lacgv(nm, m, 1);
  } else {
    T* x = m;
    for (int col = 0; col < cols; col++) {
      proxy_lapack::lacgv(rows, x, 1);
      x += lda;
    }
  }
}
281 
/** Deep copy of the data into 'result' (allocated here if NULL).
 *
 * One bulk memcpy when both arrays are contiguous, otherwise one memcpy
 * per column. The orthogonality flag is propagated to the copy.
 *
 * @param result destination array, or NULL to allocate a new one
 * @return the destination array
 */
template<typename T> ScalarArray<T>* ScalarArray<T>::copy(ScalarArray<T>* result) const {
  if(result == NULL)
    result = new ScalarArray<T>(rows, cols, false);

  if (lda == rows && result->lda == result->rows) {
    size_t size = ((size_t) rows) * cols * sizeof(T);
    memcpy(result->ptr(), const_ptr(), size);
  } else {
    for (int col = 0; col < cols; col++) {
      size_t resultOffset = ((size_t) result->lda) * col;
      size_t offset = ((size_t) lda) * col;
      memcpy(result->ptr() + resultOffset, const_ptr() + offset, rows * sizeof(T));
    }
  }
  result->setOrtho(getOrtho());
  return result;
}
299 
/** Copy the transpose of this array into 'result' (allocated here if NULL).
 *
 * Uses MKL's omatcopy when available and both arrays are contiguous,
 * otherwise a plain element-by-element rewrite.
 *
 * @param result destination (cols x rows) array, or NULL to allocate one
 * @return the destination array
 */
template<typename T> ScalarArray<T>* ScalarArray<T>::copyAndTranspose(ScalarArray<T>* result) const {
  if(result == NULL)
    result = new ScalarArray<T>(cols, rows);
#ifdef HAVE_MKL_IMATCOPY
  if (lda == rows && result->lda == result->rows) {
    proxy_mkl::omatcopy(rows, cols, const_ptr(), result->ptr());
  } else {
#endif
  // Fallback path; note the braces pair up with the #ifdef block above.
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      result->get(j, i) = get(i, j);
    }
  }
#ifdef HAVE_MKL_IMATCOPY
  }
#endif
  return result;
}
318 
/** Return a non-owning view on a contiguous range of rows of this array.
 *
 * @param rowsOffset index of the first row of the view
 * @param rowsSize   number of rows in the view
 * @return a view sharing this array's storage (all columns kept)
 */
template<typename T>
ScalarArray<T> ScalarArray<T>::rowsSubset(const int rowsOffset, const int rowsSize) const { //TODO: remove it, we have a constructor to do this
  assert(rowsOffset + rowsSize <= rows);
  return ScalarArray<T>(*this, rowsOffset, rowsSize, 0, cols);
}
324 
/** this = alpha * op(a) * op(b) + beta * this, with op() driven by
 *  transA/transB ('N', 'T' or 'C').
 *
 * Thin wrapper over BLAS gemm; falls back to gemv when b reduces to a
 * single untransposed column. Dimensions are checked in debug builds.
 */
template<typename T>
void ScalarArray<T>::gemm(char transA, char transB, T alpha,
                         const ScalarArray<T>* a, const ScalarArray<T>* b,
                         T beta) {
  const int aRows  = (transA == 'N' ? a->rows : a->cols);
  const int n  = (transB == 'N' ? b->cols : b->rows);
  const int k  = (transA == 'N' ? a->cols : a->rows);
  // Encode the transpose flags as small ints for the timeline record.
  const int tA = (transA == 'N' ? 0 : ( transA == 'T' ? 1 : 2 ));
  const int tB = (transB == 'N' ? 0 : ( transB == 'T' ? 1 : 2 ));
  Timeline::Task t(Timeline::BLASGEMM, &rows, &cols, &k, &tA, &tB);
  (void)t;
  assert(rows == aRows);
  assert(cols == n);
  assert(k == (transB == 'N' ? b->rows : b->cols));
  {
    const size_t _m = aRows, _n = n, _k = k;
    const size_t adds = _m * _n * _k;
    const size_t muls = _m * _n * _k;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  assert(a->lda >= a->rows);
  assert(b->lda >= b->rows);
  assert(a->lda > 0);
  assert(b->lda > 0);

  // Single-column untransposed b: a matrix-vector product is cheaper.
  if (n > 1 || transB != 'N')
    proxy_cblas::gemm(transA, transB, aRows, n, k, alpha, a->const_ptr(), a->lda, b->const_ptr(), b->lda,
                      beta, this->ptr(), this->lda);
  else
    proxy_cblas::gemv(transA, a->rows, a->cols, alpha, a->const_ptr(), a->lda, b->const_ptr(), 1, beta, this->ptr(), 1);
}
356 
357 template<typename T>
copyMatrixAtOffset(const ScalarArray<T> * a,int rowOffset,int colOffset)358 void ScalarArray<T>::copyMatrixAtOffset(const ScalarArray<T>* a,
359                                        int rowOffset, int colOffset) {
360   assert(rowOffset + a->rows <= rows);
361   assert(colOffset + a->cols <= cols);
362 
363   if (rowOffset == 0 && a->rows == rows &&
364       a->lda == a->rows && lda == rows) {
365     memcpy(ptr() + colOffset * lda, a->const_ptr(), sizeof(T) * rows * a->cols);
366     // If I copy the whole matrix, I copy this flag
367     if(a->cols == cols)
368       setOrtho(a->getOrtho());
369   } else {
370     for (int col = 0; col < a->cols; col++) {
371       memcpy(ptr() + rowOffset + (colOffset + col) * lda,
372              a->const_ptr() + col * a->lda,
373              sizeof(T) * a->rows);
374     }
375   }
376 }
377 
378 template<typename T, typename std::enable_if<hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
addRandSFINAE(ScalarArray<T> & a,double epsilon)379 void addRandSFINAE(ScalarArray<T>& a, double epsilon) {
380   if (a.lda == a.rows) {
381     for (size_t i = 0; i < ((size_t) a.rows) * a.cols; ++i) {
382       a.get(i) *= 1.0 + epsilon*(1.0-2.0*rand()/(double)RAND_MAX);
383     }
384   } else {
385     for (int col = 0; col < a.cols; ++col) {
386       for (int row = 0; row < a.rows; ++row) {
387         a.get(row, col) *= 1.0 + epsilon*(1.0-2.0*rand()/(double)RAND_MAX);
388       }
389     }
390   }
391 }
392 
393 template<typename T, typename std::enable_if<!hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
addRandSFINAE(ScalarArray<T> & a,double epsilon)394 void addRandSFINAE(ScalarArray<T>& a, double epsilon) {
395   if (a.lda == a.rows) {
396     for (size_t i = 0; i < ((size_t) a.rows) * a.cols; ++i) {
397       typename T::value_type c1 = 1.0 + epsilon*(1.0-2.0*rand()/(double)RAND_MAX);
398       typename T::value_type c2 = 1.0 + epsilon*(1.0-2.0*rand()/(double)RAND_MAX);
399       a.get(i) *= T(c1, c2);
400     }
401   } else {
402     for (int col = 0; col < a.cols; ++col) {
403       for (int row = 0; row < a.rows; ++row) {
404         typename T::value_type c1 = 1.0 + epsilon*(1.0-2.0*rand()/(double)RAND_MAX);
405         typename T::value_type c2 = 1.0 + epsilon*(1.0-2.0*rand()/(double)RAND_MAX);
406         a.get(row, col) *= T(c1, c2);
407       }
408     }
409   }
410 }
411 
/** Apply a multiplicative random perturbation of relative size epsilon to
 *  every coefficient (dispatches on real vs complex T via SFINAE). */
template<typename T>
void ScalarArray<T>::addRand(double epsilon) {
  DECLARE_CONTEXT;
  addRandSFINAE<T>(*this, epsilon);
}
417 
/** this += alpha * a (element-wise), dimensions must match.
 *
 * A single BLAS axpy call when both arrays are contiguous and small enough
 * for a 32-bit element count; otherwise one axpy per column.
 */
template<typename T>
void ScalarArray<T>::axpy(T alpha, const ScalarArray<T>* a) {
  assert(rows == a->rows);
  assert(cols == a->cols);
  size_t size = ((size_t) rows) * cols;

  increment_flops(Multipliers<T>::add * size
		  + (alpha == Constants<T>::pone ? 0 : Multipliers<T>::mul * size));
  // Fast path
  if ((lda == rows) && (a->lda == a->rows) && (size < 1000000000)) {
    proxy_cblas::axpy(size, alpha, a->const_ptr(), 1, ptr(), 1);
    return;
  }

  // Column-by-column path; offsets computed in size_t to avoid int overflow.
  for (int col = 0; col < cols; col++) {
    proxy_cblas::axpy(rows, alpha, a->const_ptr() + ((size_t) col) * a->lda, 1, ptr() + ((size_t) col) * lda, 1);
  }
}
436 
437 template<typename T>
normSqr() const438 double ScalarArray<T>::normSqr() const {
439   size_t size = ((size_t) rows) * cols;
440   T result = Constants<T>::zero;
441 
442   // Fast path
443   if ((size < 1000000000) && (lda == rows)) {
444     result += proxy_cblas_convenience::dot_c(size, const_ptr(), 1, const_ptr(), 1);
445     return real(result);
446   }
447   for (int col = 0; col < cols; col++) {
448     result += proxy_cblas_convenience::dot_c(rows, const_ptr() + col * lda, 1, const_ptr() + col * lda, 1);
449   }
450   return real(result);
451 }
452 
norm() const453 template<typename T> double ScalarArray<T>::norm() const {
454   return sqrt(normSqr());
455 }
456 
// Compute squared Frobenius norm of a.b^t (a=this)
// Expands ||a.b^t||_F^2 into column dot products: the sum over all (i,j)
// pairs of (a_i . a_j)(b_i . b_j), split into the strictly-lower part
// (doubled by symmetry) plus the diagonal terms, so the product a.b^t is
// never formed explicitly.
template<typename T> double ScalarArray<T>::norm_abt_Sqr(const ScalarArray<T> &b) const {
  double result = 0;
  const int k = cols;
  // Off-diagonal pairs (i > j), counted once then doubled below.
  for (int i = 1; i < k; ++i) {
    for (int j = 0; j < i; ++j) {
      result += real(proxy_cblas_convenience::dot_c(rows, const_ptr(0, i), 1, const_ptr(0, j), 1) *
                     proxy_cblas_convenience::dot_c(b.rows, b.const_ptr(0, i), 1, b.const_ptr(0, j), 1));
    }
  }
  result *= 2.0;
  // Diagonal terms (i == j).
  for (int i = 0; i < k; ++i) {
    result += real(proxy_cblas_convenience::dot_c(rows, const_ptr(0, i), 1, const_ptr(0, i), 1) *
                   proxy_cblas_convenience::dot_c(b.rows, b.const_ptr(0, i), 1, b.const_ptr(0, i), 1));
  }
  return result;
}
474 
// Compute dot product between this[i,*] and b[j,*]
// Rows are strided by lda in column-major storage, hence the lda increments.
template<typename T> T ScalarArray<T>::dot_aibj(int i, const ScalarArray<T> &b, int j) const {
  return proxy_cblas::dot(cols, &get(i,0), lda, &b.get(j,0), b.lda);
}
479 
fromFile(const char * filename)480 template<typename T> void ScalarArray<T>::fromFile(const char * filename) {
481   FILE * f = fopen(filename, "rb");
482   /* Read the header before data : [stype, rows, cols, sieof(T), 0] */
483   int code;
484   int r = fread(&code, sizeof(int), 1, f);
485   HMAT_ASSERT(r == 1);
486   HMAT_ASSERT(code == Constants<T>::code);
487   r = fread(&rows, sizeof(int), 1, f);
488   lda = rows;
489   HMAT_ASSERT(r == 1);
490   r = fread(&cols, sizeof(int), 1, f);
491   HMAT_ASSERT(r == 1);
492   r = fseek(f, 2 * sizeof(int), SEEK_CUR);
493   HMAT_ASSERT(r == 0);
494   if(m)
495       free(m);
496   size_t size = ((size_t) rows) * cols * sizeof(T);
497   m = (T*) calloc(size, 1);
498   r = fread(ptr(), size, 1, f);
499   fclose(f);
500   HMAT_ASSERT(r == 1);
501 }
502 
toFile(const char * filename) const503 template<typename T> void ScalarArray<T>::toFile(const char *filename) const {
504   int ierr;
505   int fd;
506   size_t size = ((size_t) rows) * cols * sizeof(T) + 5 * sizeof(int);
507 
508   HMAT_ASSERT(lda == rows);
509 
510   fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600);
511   HMAT_ASSERT(fd != -1);
512   ierr = lseek(fd, size - 1, SEEK_SET);
513   HMAT_ASSERT(ierr != -1);
514   ierr = write(fd, "", 1);
515   HMAT_ASSERT(ierr == 1);
516 #ifndef _WIN32
517   void* mmapedFile = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
518   ierr = (mmapedFile == MAP_FAILED) ? 1 : 0;
519   HMAT_ASSERT(!ierr);
520   /* Write the header before data : [stype, rows, cols, sieof(T), 0] */
521   int *asIntArray = (int*) mmapedFile;
522   asIntArray[0] = Constants<T>::code;
523   asIntArray[1] = rows;
524   asIntArray[2] = cols;
525   asIntArray[3] = sizeof(T);
526   asIntArray[4] = 0;
527   asIntArray += 5;
528   T* mat = (T*) asIntArray;
529   memcpy(mat, const_ptr(), size - 5 * sizeof(int));
530   close(fd);
531   munmap(mmapedFile, size);
532 #else
533   HMAT_ASSERT_MSG(false, "mmap not available on this platform");
534 #endif
535 
536 }
537 
memorySize() const538 template<typename T> size_t ScalarArray<T>::memorySize() const {
539    return ((size_t) rows) * cols * sizeof(T);
540 }
541 
/** NaN check for real scalar types: abort on the first NaN coefficient. */
template<typename T, typename std::enable_if<hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
void checkNanSFINAE(const ScalarArray<T>* m) {
  for (int col = 0; col < m->cols; col++) {
    for (int row = 0; row < m->rows; row++) {
      HMAT_ASSERT(!isnan(m->get(row, col)));
    }
  }
}
550 
/** NaN check for complex scalar types: both the real and imaginary parts
 *  of every coefficient must be finite numbers. */
template<typename T, typename std::enable_if<!hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
void checkNanSFINAE(const ScalarArray<T>* m) {
  for (int col = 0; col < m->cols; col++) {
    for (int row = 0; row < m->rows; row++) {
      HMAT_ASSERT(!isnan(m->get(row, col).real()));
      HMAT_ASSERT(!isnan(m->get(row, col).imag()));
    }
  }
}
560 
/** Debug helper: abort if any coefficient is NaN (real/complex dispatch). */
template<typename T>
void ScalarArray<T>::checkNan() const {
  checkNanSFINAE<T>(this);
}
565 
isZero() const566 template<typename T> bool ScalarArray<T>::isZero() const {
567   for(int i = 0; i < rows; i++)
568     for(int j = 0; j < cols; j++)
569       if (get(i, j) != Constants<T>::zero)
570         return false;
571   return true;
572 }
573 
// this := alpha*x*b^T + this
// x and b are column vectors; delegates to BLAS ger.
template<typename T> void ScalarArray<T>::rankOneUpdate(const T alpha, const ScalarArray<T> &x, const ScalarArray<T> &b){
  assert(x.rows==rows);
  assert(x.cols==1);
  assert(b.rows==cols);
  assert(b.cols==1);
  proxy_cblas::ger(rows, cols, alpha, x.const_ptr(), 1, b.const_ptr(), 1, ptr(), lda);
}
582 
// this := alpha*x*b + this
// Variant taking tb as an already-transposed row vector (1 x cols);
// its elements are strided by tb.lda in the ger call.
template<typename T> void ScalarArray<T>::rankOneUpdateT(const T alpha, const ScalarArray<T> &x, const ScalarArray<T> &tb){
  assert(x.rows==rows);
  assert(x.cols==1);
  assert(tb.rows==1);
  assert(tb.cols==cols);
  proxy_cblas::ger(rows, cols, alpha, x.const_ptr(), 1, tb.const_ptr(), tb.lda, ptr(), lda);
}
591 
/** Stream the raw coefficient buffer out through a user callback.
 *  Requires contiguous storage (lda == rows). */
template<typename T> void ScalarArray<T>::writeArray(hmat_iostream writeFunc, void * userData) const{
  assert(lda == rows);
  size_t s = (size_t)rows * cols;
  // We use m instead of const_ptr() because writeFunc() expects a void*, not a const void*
  writeFunc(m, sizeof(T) * s, userData);
}
598 
/** Fill the coefficient buffer from a user callback (inverse of writeArray).
 *  Requires contiguous storage (lda == rows); rows/cols must be set already. */
template<typename T> void ScalarArray<T>::readArray(hmat_iostream readFunc, void * userData) {
  assert(lda == rows);
  size_t s = (size_t)rows * cols;
  readFunc(ptr(), sizeof(T) * s, userData);
}
604 
/** In-place LU factorization with partial pivoting (LAPACK getrf).
 *
 * @param pivots output pivot indices, at least min(rows, cols) entries
 * @throws LapackException when getrf reports a failure (e.g. singular pivot)
 */
template<typename T> void ScalarArray<T>::luDecomposition(int *pivots) {
  int info;
  {
    // Flop counts from the LAPACK operation-count tables for getrf.
    const size_t _m = rows, _n = cols;
    const size_t muls = _m * _n *_n / 2 - _n *_n*_n / 6 + _m * _n / 2 - _n*_n / 2 + 2 * _n / 3;
    const size_t adds = _m * _n *_n / 2 - _n *_n*_n / 6 + _m * _n / 2 + _n / 6;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  info = proxy_lapack::getrf(rows, cols, ptr(), lda, pivots);
  if (info)
    throw LapackException("getrf", info);
}
617 
/** Exception thrown when a factorization meets an unusable diagonal pivot
 *  (zero, or non-positive for Cholesky-type factorizations).
 *  Carries a human-readable message built from the pivot value and index. */
template<typename T>
class InvalidDiagonalException: public LapackException {
    std::string invalidDiagonalMessage_;
public:
    /** @param value the offending diagonal value
     *  @param j     its index on the diagonal
     *  @param where name of the factorization that detected it */
    InvalidDiagonalException(const T value, const int j, const char * where)
      : LapackException(where, -1)
    {
        std::stringstream sstm;
        sstm << "In " << where << ", diagonal index "<< j << " has an invalid value " << value;
        invalidDiagonalMessage_ = sstm.str();
    }

    virtual const char* what() const noexcept {
        return invalidDiagonalMessage_.c_str();
    }
};
634 
635 template<typename T, typename std::enable_if<hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
assertPositive(const T v,const int j,const char * const where)636 void assertPositive(const T v, const int j, const char * const where) {
637     if(!(v > 0))
638       throw InvalidDiagonalException<S_t>(v, j, where);
639 }
640 
/** Complex-type pivot check: positivity is meaningless for complex values,
 *  so only an exactly-zero pivot is rejected. */
template<typename T, typename std::enable_if<!hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
void assertPositive(const T v, const int j, const char * const where) {
    if(v == Constants<T>::zero)
      throw InvalidDiagonalException<T>(v, j, where);
}
646 
647 template<typename T>
ldltDecomposition(Vector<T> & diagonal)648 void ScalarArray<T>::ldltDecomposition(Vector<T>& diagonal) {
649   assert(rows == cols); // We expect a square matrix
650   //TODO : add flops counter
651 
652   // Standard LDLt factorization algorithm is:
653   //  diag[j] = A(j,j) - sum_{k < j} L(j,k)^2 diag[k]
654   //  L(i,j) = (A(i,j) - sum_{k < j} (L(i,k)L(j,k)diag[k])) / diag[j]
655   // See for instance http://en.wikipedia.org/wiki/Cholesky_decomposition
656   // An auxiliary array is introduced in order to perform less multiplications,
657   // see  algorithm 1 in http://icl.cs.utk.edu/projectsfiles/plasma/pubs/ICL-UT-11-03.pdf
658   int n = rows;
659   T* v = new T[n];
660   HMAT_ASSERT(v);
661   for (int j = 0; j < n; j++) {
662     for (int i = 0; i < j; i++)
663       v[i] = get(j,i) * get(i,i);
664 
665     v[j] = get(j,j);
666     for (int i = 0; i < j; i++)
667       // Do not use the -= operator because it's buggy in the intel compiler
668       v[j] = v[j] - get(j,i) * v[i];
669 
670     get(j,j) = v[j];
671     for (int i = 0; i < j; i++)
672       for (int k = j+1; k < n; k++)
673         get(k,j) -= get(k,i) * v[i];
674 
675     for (int k = j+1; k < n; k++) {
676       if (v[j] == Constants<T>::zero)
677         throw InvalidDiagonalException<T>(v[j], j, "ldltDecomposition");
678       get(k,j) /= v[j];
679     }
680   }
681 
682   for(int i = 0; i < n; i++) {
683     diagonal[i] = get(i,i);
684     get(i,i) = Constants<T>::pone;
685     for (int j = i + 1; j < n; j++)
686       get(i,j) = Constants<T>::zero;
687   }
688 
689   delete[] v;
690 }
691 
/** In-place LLt (Cholesky-type) factorization of a square matrix.
 *
 * On exit the lower part holds L and the strictly-upper part is zeroed.
 *
 * @throws InvalidDiagonalException via assertPositive() on a non-positive
 *         (real) or zero (complex) pivot
 */
template<typename T>
void ScalarArray<T>::lltDecomposition() {
  assert(rows == cols); // We expect a square matrix

  // Standard LLt factorization algorithm is:
  //  L(j,j) = sqrt( A(j,j) - sum_{k < j} L(j,k)^2)
  //  L(i,j) = ( A(i,j) - sum_{k < j} L(i,k)L(j,k) ) / L(j,j)

    // from http://www.netlib.org/lapack/lawnspdf/lawn41.pdf page 120
  int n = rows;
  const size_t n2 = (size_t) n*n;
  const size_t n3 = n2 * n;
  const size_t muls = n3 / 6 + n2 / 2 + n / 3;
  const size_t adds = n3 / 6 - n / 6;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);

  for (int j = 0; j < n; j++) {
    // Pivot: A(j,j) - sum_{k<j} L(j,k)^2, checked before taking the root.
    for (int k = 0; k < j; k++)
      get(j,j) -= get(j,k) * get(j,k);
    assertPositive(get(j, j), j, "lltDecomposition");

    get(j,j) = std::sqrt(get(j,j));

    // Update the sub-column, then divide it by the pivot.
    for (int k = 0; k < j; k++)
      for (int i = j+1; i < n; i++)
        get(i,j) -= get(i,k) * get(j,k);

    for (int i = j+1; i < n; i++) {
      get(i,j) /= get(j,j);
    }
  }

  // For real matrices, we could use the lapack version, could be faster
  // (There is no L.Lt factorisation for complex matrices in lapack)
  //    int info = proxy_lapack::potrf('L', this->rows(), this->data.m, this->data.lda);
  //    if(info != 0)
  //        // throw a pointer to be compliant with the Status class
  //        throw hmat::LapackException("potrf", info);

  // Zero the strictly-upper part so only L remains.
  for (int j = 0; j < n; j++) {
        for(int i = 0; i < j; i++) {
            get(i,j) = Constants<T>::zero;
        }
    }
}
737 
738 // Helper functions for solveTriangular
to_blas(const Side side)739 inline char to_blas(const Side side) {
740     return side == Side::LEFT ? 'L' : 'R';
741 }
742 
to_blas(const Uplo uplo)743 inline char to_blas(const Uplo uplo) {
744     return uplo == Uplo::LOWER ? 'L' : 'U';
745 }
746 
to_blas(const Diag diag)747 inline char to_blas(const Diag diag) {
748     return diag == Diag::UNIT ? 'U' : 'N';
749 }
750 
751 
// The following code is very close to that of ZGETRS in LAPACK.
// However, the resolution here is divided in the two parties.

// Warning! The matrix has been obtained with ZGETRF therefore it is
// permuted! We have the factorization A = P L U  with P the
// permutation matrix. So to solve L X = B, we must
// solve LX = (P ^ -1 B), which is done by ZLASWP with
// the permutation. we used it just like in ZGETRS.
/** Solve op(L) X = B in place, where L is the triangular factor stored in
 *  this array and B is overwritten by the solution in x.
 *  For an LU factorization with uplo == LOWER, the rows of x are first
 *  permuted with the stored pivots (laswp), as in getrs. */
template<typename T>
void ScalarArray<T>::solveLowerTriangularLeft(ScalarArray<T>* x, const FactorizationData<T>& context, Diag diag, Uplo uplo) const {
  {
    const size_t _m = rows, _n = x->cols;
    const size_t adds = _n * _m * (_m - 1) / 2;
    const size_t muls = _n * _m * (_m + 1) / 2;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  if (context.algo == Factorization::LU && uplo == Uplo::LOWER)
    proxy_lapack::laswp(x->cols, x->ptr(), x->lda, 1, rows, context.data.pivots, 1);
  // uplo == UPPER means "use the transpose of the stored factor" here.
  proxy_cblas::trsm('L', to_blas(uplo), uplo == Uplo::LOWER ? 'N' : 'T', to_blas(diag), rows, x->cols, Constants<T>::pone, const_ptr(), lda, x->ptr(), x->lda);
}
772 
// The resolution of the upper triangular system does not need to
//  change the order of columns.
//  The pivots are not necessary here, but this helps to check
//  the matrix was factorized before.

/** Solve X op(U) = B in place (right-sided triangular solve), B overwritten
 *  by the solution in x. uplo == LOWER means "use the transpose". */
template<typename T>
void ScalarArray<T>::solveUpperTriangularRight(ScalarArray<T>* x, const FactorizationData<T>& context, Diag diag, Uplo uplo) const {
  // Void matrix
  if (x->rows == 0 || x->cols == 0) return;

  {
    const size_t _m = rows, _n = x->cols;
    const size_t adds = _n * _m * (_m - 1) / 2;
    const size_t muls = _n * _m * (_m + 1) / 2;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  proxy_cblas::trsm('R', to_blas(uplo), uplo == Uplo::LOWER ? 'T' : 'N', to_blas(diag),
    x->rows, x->cols, Constants<T>::pone, const_ptr(), lda, x->ptr(), x->lda);
}
792 
/** Solve op(U) X = B in place (left-sided triangular solve), B overwritten
 *  by the solution in x. uplo == LOWER means "use the transpose". The
 *  factorization context is unused: no pivoting is needed on this side. */
template<typename T>
void ScalarArray<T>::solveUpperTriangularLeft(ScalarArray<T>* x, const FactorizationData<T>&, Diag diag, Uplo uplo) const {
  // Void matrix
  if (x->rows == 0 || x->cols == 0) return;

  {
    const size_t _m = rows, _n = x->cols;
    const size_t adds = _n * _m * (_n - 1) / 2;
    const size_t muls = _n * _m * (_n + 1) / 2;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  proxy_cblas::trsm('L', to_blas(uplo), uplo == Uplo::LOWER ? 'T' : 'N', to_blas(diag),
    x->rows, x->cols, Constants<T>::pone, const_ptr(), lda, x->ptr(), x->lda);
}
807 
/** Full solve A X = B from an LU factorization (LAPACK getrs), B overwritten
 *  by the solution in x. Only Factorization::LU contexts are accepted.
 *
 * @throws LapackException when getrs reports an error
 */
template<typename T>
void ScalarArray<T>::solve(ScalarArray<T>* x, const FactorizationData<T>& context) const {
  // Void matrix
  if (x->rows == 0 || x->cols == 0) return;

  int ierr = 0;
  {
    const size_t nrhs = x->cols;
    const size_t n = rows;
    const size_t adds = n * n * nrhs;
    const size_t muls = (n * n - n) * nrhs;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  HMAT_ASSERT(context.algo == Factorization::LU);
  // NOTE(review): x->rows is passed as ldb instead of x->lda — looks like
  // this assumes x is contiguous; verify callers never pass a view here.
  ierr = proxy_lapack::getrs('N', rows, x->cols, const_ptr(), lda, context.data.pivots, x->ptr(), x->rows);
  if (ierr)
    throw LapackException("getrs", ierr);
}
826 
827 
828 template<typename T>
inverse()829 void ScalarArray<T>::inverse() {
830 
831   // The inversion is done in two steps with dgetrf for LU decomposition and
832   // dgetri for inversion of triangular matrices
833 
834   assert(rows == cols);
835 
836   int *ipiv = new int[rows];
837   int info;
838   {
839     size_t vn = cols, vm = cols;
840     // getrf
841     size_t additions = (vm*vn*vn)/2 - (vn*vn*vn)/6 - (vm*vn)/2 + vn/6;
842     size_t multiplications = (vm*vn*vn)/2 - (vn*vn*vn)/6 + (vm*vn)/2
843       - (vn*vn)/2 + 2*vn/3;
844     increment_flops(Multipliers<T>::add * additions + Multipliers<T>::mul * multiplications);
845     // getri
846     additions = (2*vn*vn*vn)/3 - (3*vn*vn)/2 + (5*vn)/6;
847     multiplications = (2*vn*vn*vn)/3 + (vn*vn)/2 + (5*vn)/6;
848     increment_flops(Multipliers<T>::add * additions + Multipliers<T>::mul * multiplications);
849   }
850   info = proxy_lapack::getrf(rows, cols, ptr(), lda, ipiv);
851   HMAT_ASSERT(!info);
852   // We call it twice: the first time to know the optimal size of
853   // temporary arrays, and the second time for real calculation.
854   int workSize;
855   T workSize_req;
856   info = proxy_lapack::getri(rows, ptr(), lda, ipiv, &workSize_req, -1);
857   workSize = (int) real(workSize_req) + 1;
858   T* work = new T[workSize];
859   HMAT_ASSERT(work);
860   info = proxy_lapack::getri(rows, ptr(), lda, ipiv, work, workSize);
861   delete[] work;
862   if (info)
863     throw LapackException("getri", info);
864   delete[] ipiv;
865 }
866 
/*! \brief Returns the number of singular values to keep.

     The stop criterion is (assuming that the singular values
     are in descending order):
         sigma[k] / sigma[0] < epsilon
     except if the env. var. HMAT_SUM_CRITERION is set, in which case the criterion is:
         sigma[k] / SUM(sigma) < epsilon

     \param sigma table of singular values with at least maxK elements.
     \param epsilon tolerance.
     \return int the number of singular values to keep.
 */
findK(Vector<double> & sigma,double epsilon)879 static int findK(Vector<double> &sigma, double epsilon) {
880   assert(epsilon >= 0.);
881   static char *useSumCriterion = getenv("HMAT_SUM_CRITERION");
882   double threshold_eigenvalue = 0.0;
883   if (useSumCriterion == NULL) {
884     threshold_eigenvalue = sigma[0];
885   } else {
886     for (int i = 0; i < sigma.rows; i++) {
887       threshold_eigenvalue += sigma[i];
888     }
889   }
890   threshold_eigenvalue *= epsilon;
891   int i = 0;
892   for (i = 0; i < sigma.rows; i++) {
893     if (sigma[i] <= threshold_eigenvalue){
894       break;
895     }
896   }
897   return i;
898 }
899 
truncatedSvdDecomposition(ScalarArray<T> ** u,ScalarArray<T> ** v,double epsilon,bool workAroundFailures) const900 template<typename T> int ScalarArray<T>::truncatedSvdDecomposition(ScalarArray<T>** u, ScalarArray<T>** v, double epsilon, bool workAroundFailures) const {
901   Vector<double>* sigma = NULL;
902 
903   svdDecomposition(u, &sigma, v, workAroundFailures);
904 
905   // Control of the approximation
906   int newK = findK(*sigma, epsilon);
907 
908   if(newK == 0) {
909     delete *u;
910     delete *v;
911     delete sigma;
912     *u = NULL;
913     *v = NULL;
914     return 0;
915   }
916 
917   (*u)->resize(newK);
918   sigma->rows = newK;
919   (*v)->resize(newK);
920 
921   // We put the square root of singular values in sigma
922   for (int i = 0; i < newK; i++)
923     (*sigma)[i] = sqrt((*sigma)[i]);
924 
925   // Apply sigma 'symmetrically' on u and v
926   (*u)->multiplyWithDiag(sigma);
927   (*v)->multiplyWithDiag(sigma);
928   delete sigma;
929 
930   return newK;
931 }
932 
template<typename T> int ScalarArray<T>::svdDecomposition(ScalarArray<T>** u, Vector<double>** sigma, ScalarArray<T>** v, bool workAroundFailures) const {
  DECLARE_CONTEXT;
  // HMAT_GESDD selects the divide-and-conquer driver (gesdd) instead of gesvd
  static char * useGESDD = getenv("HMAT_GESDD");
  Timeline::Task t(Timeline::SVD, &rows, &cols);
  (void)t;
  // Allocate free space for U, S, V
  int p = std::min(rows, cols);

  *u = new ScalarArray<T>(rows, p, false);
  *sigma = new Vector<double>(p);
  *v = new ScalarArray<T>(p, cols, false); // We create v in transposed shape (as expected by lapack zgesvd)

  // To be prepared for working around a failure in SVD, I must do a copy of 'this'
  ScalarArray<T> *a = workAroundFailures ? copy() : NULL;

  assert(lda >= rows);

  char jobz = 'S';
  int info=0;

  {
    const size_t _m = rows, _n = cols;
    // Warning: These quantities are a rough approximation.
    // What's wrong with these estimates:
    //  - Golub only gives 14 * M*N*N + 8 N*N*N
    //  - This is for real numbers
    //  - We assume the same number of * and +
    size_t adds = 7 * _m * _n * _n + 4 * _n * _n * _n;
    size_t muls = 7 * _m * _n * _n + 4 * _n * _n * _n;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }

  try {
    if(useGESDD)
      info = sddCall(jobz, rows, cols, ptr(), lda, (*sigma)->ptr(), (*u)->ptr(),
                     (*u)->lda, (*v)->ptr(), (*v)->lda);
    else
      info = svdCall(jobz, jobz, rows, cols, ptr(), lda, (*sigma)->ptr(), (*u)->ptr(),
                     (*u)->lda, (*v)->ptr(), (*v)->lda);
    // lapack produced v in transposed shape (see above): restore it
    (*v)->transpose();
    // Both factors come out of the SVD orthogonal
    (*u)->setOrtho(1);
    (*v)->setOrtho(1);
    // The backup copy is not needed on the success path
    delete a;
  } catch(LapackException & e) {
    // if SVD fails, and if workAroundFailures is set to true, I return a "fake" result that allows
    // computation to proceed. Otherwise, I stop here.
    if (!workAroundFailures) throw;

    printf("%s overriden...\n", e.what());
    // If rows<cols, then p==rows, 'u' is square, 'v' has the dimensions of 'this'.
    // fake 'u' is identity, fake 'v' is 'this^T'
    if (rows<cols) {
      for (int i=0 ; i<rows ; i++)
        for (int j=0 ; j<p ; j++)
          (*u)->get(i,j) = i==j ? Constants<T>::pone : Constants<T>::zero ;
      (*u)->setOrtho(1);
      // the saved copy 'a' becomes 'v' (ownership is transferred)
      delete *v;
      *v = a ;
      (*v)->transpose();
    } else {
      // Otherwise rows>=cols, then p==cols, 'u' has the dimensions of 'this', 'v' is square.
      // fake 'u' is 'this', fake 'v' is identity
      delete *u;
      *u = a ;
      for (int i=0 ; i<p ; i++)
        for (int j=0 ; j<cols ; j++)
          (*v)->get(i,j) = i==j ? Constants<T>::pone : Constants<T>::zero ;
      (*v)->setOrtho(1);
    }
    // Fake 'sigma' is all 1
    for (int i=0 ; i<p ; i++)
      (**sigma)[i] = Constants<double>::pone ;
  }

  return info;
}
1009 
template<typename T> void ScalarArray<T>::orthoColumns(ScalarArray<T> *resultR, int initialPivot) {
  DECLARE_CONTEXT;

  // We use the fact that the initial 'initialPivot' columns are orthogonal:
  // We just normalize them and update the columns >= initialPivot using MGS-like approach
  ScalarArray<T> bK(*this, 0, rows, initialPivot, cols-initialPivot); // All the columns of 'this' after column 'initialPivot'
  for(int j = 0; j < initialPivot; ++j) {
    // Normalisation of column j; its norm goes on the diagonal of R
    Vector<T> aj(*this, j);
    resultR->get(j,j) = aj.norm();
    T coef = Constants<T>::pone / resultR->get(j,j);
    aj.scale(coef);
  }
  // Remove the qj-component from vectors bk (k=initialPivot,...,n-1)
  if (initialPivot<cols) {
    // HMAT_MGS_BLAS3 switches to a blas-3 (gemm-based) update of the trailing
    // columns; the default is a column-by-column rank-1 update.
    static char *useBlas3 = getenv("HMAT_MGS_BLAS3");
    if (useBlas3) {
      ScalarArray<T> aJ(*this, 0, rows, 0, initialPivot); // All the columns of 'this' from 0 to 'initialPivot-1'
      ScalarArray<T> aJ_bK(*resultR, 0, initialPivot, initialPivot, cols-initialPivot); // In 'r': row '0' to 'initialPivot-1', all the columns after column 'initialPivot-1'
      // Compute in 1 operation all the scalar products between a_0,...,a_init-1 and a_init, ..., a_n-1
      aJ_bK.gemm('C', 'N', Constants<T>::pone, &aJ, &bK, Constants<T>::zero);
      // Update a_init, ..., a_n-1
      bK.gemm('N', 'N', Constants<T>::mone, &aJ, &aJ_bK, Constants<T>::pone);
    } else {
      for(int j = 0; j < initialPivot; ++j) {
        Vector<T> aj(*this, j);
        ScalarArray<T> aj_bK(*resultR, j, 1, initialPivot, cols-initialPivot); // In 'r': row 'j', all the columns after column 'initialPivot'
        // Compute in 1 operation all the scalar products between aj and a_firstcol, ..., a_n
        aj_bK.gemm('C', 'N', Constants<T>::pone, &aj, &bK, Constants<T>::zero);
        // Update a_firstcol, ..., a_n
        bK.rankOneUpdateT(Constants<T>::mone, aj, aj_bK);
      }
    }
  } // if (initialPivot<cols)
}
1045 
/*! In-place QR decomposition of 'this' via lapack geqrf.
    On output the R factor is copied into the upper part of 'resultR' and the
    'tau' scalars of the Householder reflectors are stored in the last column
    of 'this' (retrieved later by productQ()). When initialPivot > 0 (and
    HMAT_TRUNC_INITPIV is set), the first 'initialPivot' columns are assumed
    already orthogonal and only the trailing columns go through geqrf. */
template<typename T> void ScalarArray<T>::qrDecomposition(ScalarArray<T> *resultR, int initialPivot) {
  DECLARE_CONTEXT;
  Timeline::Task t(Timeline::QR, &rows, &cols, &initialPivot);
  (void)t;

  // The initial-pivot optimisation is opt-in, through an environment variable
  static char *useInitPivot = getenv("HMAT_TRUNC_INITPIV");
  if (!useInitPivot) initialPivot=0;
  assert(initialPivot>=0 && initialPivot<=cols);

  ScalarArray<T> *bK = NULL, *restR = NULL, *a = this; // we need this because if initialPivot>0, we will run the end of the computation on a subset of 'this'

  // Initial pivot
  if (initialPivot) {
    // modify the columns [initialPivot, cols[ to make them orthogonals to columns [0, initialPivot[
    orthoColumns(resultR, initialPivot);
    // then we do qrDecomposition on the remaining columns (without initial pivot)
    bK = new ScalarArray<T> (*this, 0, rows, initialPivot, cols-initialPivot); // All the columns of 'this' after column 'initialPivot'
    restR = new ScalarArray<T> (*resultR, initialPivot, cols-initialPivot, initialPivot, cols-initialPivot); // In 'r': all the rows and columns after row/column 'initialPivot'
    // From here on, 'a' and 'resultR' designate the trailing sub-blocks
    a = bK;
    resultR = restR;
  }

  //  SUBROUTINE DGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
  T* tau = (T*) calloc(std::min(a->rows, a->cols), sizeof(T));
  {
    // Rough flop count for geqrf
    size_t mm = std::max(a->rows, a->cols);
    size_t n = std::min(a->rows, a->cols);
    size_t multiplications = mm * n * n - (n * n * n) / 3 + mm * n + (n * n) / 2 + (29 * n) / 6;
    size_t additions = mm * n * n + (n * n * n) / 3 + 2 * mm * n - (n * n) / 2 + (5 * n) / 6;
    increment_flops(Multipliers<T>::mul * multiplications + Multipliers<T>::add * additions);
  }
  int info;
  int workSize;
  T workSize_S;
  // First call with lwork == -1: workspace-size query only
  // int info = LAPACKE_sgeqrf(LAPACK_COL_MAJOR, rows, cols, m, rows, *tau);
  info = proxy_lapack::geqrf(a->rows, a->cols, a->ptr(), a->rows, tau, &workSize_S, -1);
  HMAT_ASSERT(!info);
  workSize = (int) hmat::real(workSize_S) + 1;
  T* work = new T[workSize];// TODO Mettre dans la pile ??
  HMAT_ASSERT(work) ;
  info = proxy_lapack::geqrf(a->rows, a->cols, a->ptr(), a->rows, tau, work, workSize);
  delete[] work;

  HMAT_ASSERT(!info);

  // Copy the 'r' factor in the upper part of resultR
  for (int col = 0; col < a->cols; col++) {
    for (int row = 0; row <= col; row++) {
      resultR->get(row, col) = a->get(row, col);
    }
  }

  // Copy tau in the last column of 'this'
  memcpy(a->ptr(0, a->cols-1), tau, sizeof(T)*std::min(a->rows, a->cols));
  free(tau);

  // temporary data created if initialPivot>0
  if (bK) delete(bK);
  if (restR) delete(restR);

  return;
}
1108 
// aFull <- aFull.bTri^t with aFull=this and bTri upper triangular matrix
template<typename T>
void ScalarArray<T>::myTrmm(const ScalarArray<T>* bTri) {
  DECLARE_CONTEXT;
  int mm = rows;
  // NOTE(review): 'n' is set to 'rows', but for a side='R' trmm the second
  // dimension is the number of columns of the updated matrix (and must match
  // the order of bTri). This is only consistent when callers guarantee
  // rows == cols here -- TODO confirm against the call sites.
  int n = rows;
  T alpha = Constants<T>::pone;
  const T *bData = bTri->const_ptr();
  // Both arrays are expected in compact storage (no leading-dimension padding)
  HMAT_ASSERT(bTri->lda == bTri->rows);
  HMAT_ASSERT(lda == rows);
  {
    // Flop count for a triangular matrix product
    size_t m_ = mm;
    size_t nn = n;
    size_t multiplications = m_ * nn  * (nn + 1) / 2;
    size_t additions = m_ * nn  * (nn - 1) / 2;
    increment_flops(Multipliers<T>::mul * multiplications + Multipliers<T>::add * additions);
  }
  // 'R'ight side, 'U'pper triangular, 'T'ransposed, 'N'on-unit diagonal
  proxy_cblas::trmm('R', 'U', 'T', 'N', mm, n, alpha, bData, bTri->lda, ptr(), lda);
}
1128 
1129 template<typename T>
productQ(char side,char trans,ScalarArray<T> * c) const1130 int ScalarArray<T>::productQ(char side, char trans, ScalarArray<T>* c) const {
1131   DECLARE_CONTEXT;
1132   Timeline::Task t(Timeline::PRODUCTQ, &cols, &c->rows, &c->cols);
1133   (void)t;
1134   assert((side == 'L') ? rows == c->rows : rows == c->cols);
1135   int info;
1136   int workSize;
1137   T workSize_req;
1138   {
1139     size_t _m = c->rows, _n = c->cols, _k = cols;
1140     size_t muls = 2 * _m * _n * _k - _n * _k * _k + 2 * _n * _k;
1141     size_t adds = 2 * _m * _n * _k - _n * _k * _k + _n * _k;
1142     increment_flops(Multipliers<T>::mul * muls + Multipliers<T>::add * adds);
1143   }
1144 
1145   // In qrDecomposition(), tau is stored in the last column of 'this'
1146   // it is not valid to work with 'tau' inside the array 'a' because zunmqr modifies 'a'
1147   // during computation. So we work on a copy of tau.
1148   T tau[std::min(rows, cols)];
1149   memcpy(tau, const_ptr(0, cols-1), sizeof(T)*std::min(rows, cols));
1150 
1151   // We don't use c->ptr() on purpose, because c->is_ortho is preserved here (Q is orthogonal)
1152   info = proxy_lapack_convenience::or_un_mqr(side, trans, c->rows, c->cols, cols, const_ptr(), lda, tau, c->m, c->lda, &workSize_req, -1);
1153   HMAT_ASSERT(!info);
1154   workSize = (int) hmat::real(workSize_req) + 1;
1155 
1156   // If the previous call to 'or_un_mqr' does not give us a large enough work
1157   // space size, we set the latter to the count of rows or columns of the matrix
1158   // depending on the value of 'side'.
1159   if(side == 'L') {
1160     workSize = workSize < c->rows ? c->rows : workSize;
1161   } else if(side == 'R') {
1162     workSize = workSize < c->cols ? c->cols : workSize;
1163   }
1164 
1165   T* work = new T[workSize];
1166   HMAT_ASSERT(work);
1167   info = proxy_lapack_convenience::or_un_mqr(side, trans, c->rows, c->cols, cols, const_ptr(), lda, tau, c->m, c->lda, work, workSize);
1168   HMAT_ASSERT(!info);
1169   delete[] work;
1170   return 0;
1171 }
1172 
/*! Modified Gram-Schmidt orthonormalisation of the columns of 'this', with
    column pivoting and a relative stopping criterion 'prec'.
    On output, 'this' holds the orthonormal basis (its 'cols' is reduced to
    the numerical rank), and 'result' receives the R factor with the column
    permutation applied. Returns the numerical rank. */
template<typename T> int ScalarArray<T>::modifiedGramSchmidt(ScalarArray<T> *result, double prec, int initialPivot ) {
  DECLARE_CONTEXT;
  Timeline::Task t(Timeline::MGS, &rows, &cols, &initialPivot);
  (void)t;

  // The initial-pivot optimisation is opt-in, through an environment variable
  static char *useInitPivot = getenv("HMAT_MGS_INITPIV");
  if (!useInitPivot) initialPivot=0;
  assert(initialPivot>=0 && initialPivot<=cols);


  {
    // Rough flop count for the MGS process
    size_t mm = rows;
    size_t n = cols;
    size_t multiplications = mm*n*n;
    size_t additions = mm*n*n;
    increment_flops(Multipliers<T>::mul * multiplications + Multipliers<T>::add * additions);
  }
  int rank=0;
  double relative_epsilon;
  static const double LOWEST_EPSILON = 1.0e-6;

  const int original_rank(result->rows);
  assert(original_rank == result->cols);
  assert(original_rank >= cols);
  // perm[j] = original index of the column currently stored at position j
  int* perm = new int[original_rank];
  for(int k = 0; k < original_rank; ++k) {
    perm[k] = k;
  }
  // Temporary arrays
  ScalarArray<T> r(original_rank, original_rank);
  Vector<T> buffer(std::max(original_rank, rows));

  // Lower threshold for relative precision
  if(prec < LOWEST_EPSILON) {
    prec = LOWEST_EPSILON;
  }

  // Apply Modified Gram-Schmidt process with column pivoting to the 'initialPivot' first columns,
  // where the j-th pivot is column j
  // No stopping criterion here. Since these columns come from another recompression, we assume
  // they are all kept.
  // With this approach, we can use blas3 (more computationally efficient) but we do a classical
  // (not a "modified") Gram Schmidt algorithm, which is known to be numerically unstable
  // So expect to lose on final accuracy of hmat

  // Initial pivot
  if (initialPivot) {
    // modify the columns [initialPivot, cols[ to make them orthogonals to columns [0, initialPivot[
    orthoColumns(&r, initialPivot);
    rank = initialPivot;
  }

  // Init.
  Vector<double> norm2(cols); // Norm^2 of the columns of 'this' during computation
  Vector<double> norm2_orig(cols); // Norm^2 of the original columns of 'this'
  Vector<double> norm2_update(cols); // Norm^2 of the columns of 'this' normalized (will be 1. at the beginning, then decrease)
  // The actual norm^2 of column j during computation (before it is finally normalized) will be norm2[j] = norm2_orig[j] * norm2_update[j]
  // The choice of the pivot will be based on norm2_update[] or norm2[]
  relative_epsilon = 0.0;
  for(int j=rank; j < cols; ++j) {
    const Vector<T> aj(*this, j);
    norm2[j] = aj.normSqr();
    relative_epsilon = std::max(relative_epsilon, norm2[j]);
    norm2_orig[j] = norm2[j];
    norm2_update[j] = 1.0 ;
    if(norm2_orig[j]==0) { // Neutralize the null columns
      norm2_orig[j] = 1.;
      norm2_update[j] = 0.0 ;
    }
  }
  relative_epsilon *= prec * prec;

  // Modified Gram-Schmidt process with column pivoting
  for(int j = rank; j < cols; ++j) {
    // Find the largest pivot
    int pivot = norm2.absoluteMaxIndex(j);
    double pivmax = norm2[pivot];

    // HMAT_MGS_ALTPIV selects the pivot on the normalized (updated) norms
    // instead of the raw ones, with an absolute threshold
    static char *newPivot = getenv("HMAT_MGS_ALTPIV");
    if (newPivot) {
      pivot = norm2_update.absoluteMaxIndex(j);
      pivmax = norm2_update[pivot];
      relative_epsilon = prec * prec;
    }
    // Within the initial-pivot range, the j-th pivot is forced to column j
    if (j<initialPivot) {
      pivot = j;
      pivmax = 1.;
    }

    // Stopping criterion
    if (pivmax > relative_epsilon) {
      ++rank;

      // Pivoting
      if (j != pivot) {
        std::swap(perm[j], perm[pivot]);
        std::swap(norm2[j], norm2[pivot]);
        std::swap(norm2_orig[j], norm2_orig[pivot]);
        std::swap(norm2_update[j], norm2_update[pivot]);

        // Exchange the column 'j' and 'pivot' in this[] using buffer as temp space
        memcpy(buffer.ptr(),  const_ptr(0, j),     rows*sizeof(T));
        memcpy(ptr(0, j),     const_ptr(0, pivot), rows*sizeof(T));
        memcpy(ptr(0, pivot), buffer.const_ptr(),  rows*sizeof(T));

        // Idem for r[]
        memcpy(buffer.ptr(),    r.const_ptr(0, j),     cols*sizeof(T));
        memcpy(r.ptr(0, j),     r.const_ptr(0, pivot), cols*sizeof(T));
        memcpy(r.ptr(0, pivot), buffer.const_ptr(),    cols*sizeof(T));
      }

      // Normalisation of qj
      r.get(j,j) = sqrt(norm2[j]);
      Vector<T> aj(*this, j);
      T coef = Constants<T>::pone / r.get(j,j);
      aj.scale(coef);

      // Remove the qj-component from vectors bk (k=j+1,...,n-1)
      if (j<cols-1) {
        // Columns before 'initialPivot' were already handled by orthoColumns()
        int firstcol=std::max(j+1, initialPivot) ;
        ScalarArray<T> bK(*this, 0, rows, firstcol, cols-firstcol); // All the columns of 'this' after column 'firstcol'
        ScalarArray<T> aj_bK(r, j, 1, firstcol, cols-firstcol); // In 'r': row 'j', all the columns after column 'firstcol'
        // Compute in 1 operation all the scalar products between aj and a_firstcol, ..., a_n
        aj_bK.gemm('C', 'N', Constants<T>::pone, &aj, &bK, Constants<T>::zero);
        // Update a_firstcol, ..., a_n
        bK.rankOneUpdateT(Constants<T>::mone, aj, aj_bK);
        // Update the norms
        for(int k = firstcol; k < cols; ++k) {
          double rjk = std::abs(r.get(j,k));
          norm2[k] -= rjk * rjk;
          norm2_update[k] -= rjk * rjk / norm2_orig[k];
        }
      }
    } else
      break;
  } // for(int j = rank;

  // Update matrix dimensions
  cols = rank;
  result->rows = rank;

  // Apply perm to result
  for(int j = 0; j < result->cols; ++j) {
    // Copy the column j of r into the column perm[j] of result (only 'rank' rows)
    memcpy(result->ptr(0, perm[j]), r.const_ptr(0, j), result->rows*sizeof(T));
  }
  setOrtho(1);
  // Clean up
  delete[] perm;
  /* end of modified Gram-Schmidt */
  return rank;
}
1325 
1326 template<typename T>
multiplyWithDiagOrDiagInv(const ScalarArray<T> * d,bool inverse,Side side)1327 void ScalarArray<T>::multiplyWithDiagOrDiagInv(const ScalarArray<T>* d, bool inverse, Side side) {
1328   assert(d);
1329   assert(side == Side::LEFT  || (cols == d->rows));
1330   assert(side == Side::RIGHT || (rows == d->rows));
1331   assert(d->cols==1);
1332 
1333   {
1334     const size_t _rows = rows, _cols = cols;
1335     increment_flops(Multipliers<T>::mul * _rows * _cols);
1336   }
1337   if (side == Side::LEFT) { // line i is multiplied by d[i] or 1/d[i]
1338     // TODO: Test with scale to see if it is better.
1339     if (inverse) {
1340       ScalarArray<T> *d2 = new ScalarArray<T>(rows,1);
1341       for (int i = 0; i < rows; i++)
1342         d2->get(i) = Constants<T>::pone / d->get(i);
1343       d = d2;
1344     }
1345     for (int j = 0; j < cols; j++) {
1346       for (int i = 0; i < rows; i++) {
1347         get(i, j) *= d->get(i);
1348       }
1349     }
1350     if (inverse) delete(d);
1351   } else { // column j is multiplied by d[j] or 1/d[j]
1352     for (int j = 0; j < cols; j++) {
1353       T diag_val = inverse ? Constants<T>::pone / d->get(j,0) : d->get(j);
1354       proxy_cblas::scal(rows, diag_val, ptr(0,j), 1);
1355     }
1356   }
1357 }
1358 
1359 template<typename T>
multiplyWithDiag(const ScalarArray<double> * d)1360 void ScalarArray<T>::multiplyWithDiag(const ScalarArray<double>* d) {
1361   assert(d);
1362   assert(cols <= d->rows); // d can be larger than needed
1363   assert(d->cols==1);
1364 
1365   {
1366     const size_t _rows = rows, _cols = cols;
1367     increment_flops(Multipliers<T>::mul * _rows * _cols);
1368   }
1369   for (int j = 0; j < cols; j++) {
1370     T diag_val = T(d->get(j));
1371     proxy_cblas::scal(rows, diag_val, m+j*lda, 1); // We don't use ptr() on purpose, because is_ortho is preserved here
1372   }
1373 }
1374 
1375 template<typename T>
testOrtho() const1376 bool ScalarArray<T>::testOrtho() const {
1377   static char *test = getenv("HMAT_TEST_ORTHO");
1378   // code % 2 == 0 means we are in simple precision (real or complex)
1379   static double machine_accuracy = Constants<T>::code % 2 == 0 ? 1.19e-7 : 1.11e-16 ;
1380   static double test_accuracy = Constants<T>::code % 2 == 0 ? 1.e-3 : 1.e-7 ;
1381   static double ratioMax=0.;
1382   double ref = norm();
1383   if (ref==0.) return true;
1384   ScalarArray<T> *sp = new ScalarArray<T>(cols, cols);
1385   // Compute the scalar product sp = X^H.X
1386   sp->gemm('C', 'N', Constants<T>::pone, this, this, Constants<T>::zero);
1387   // Nullify the diagonal elements
1388   for (int i=0 ; i<cols ; i++)
1389     sp->get(i,i) = Constants<T>::zero;
1390   // The norm of the rest should be below 'epsilon x norm of this' to have orthogonality and return true
1391   double res = sp->norm();
1392   delete sp;
1393   if (test) {
1394     double ratio = res/ref/machine_accuracy/sqrt((double)rows);
1395     if (ratio > ratioMax) {
1396       ratioMax = ratio;
1397       printf("testOrtho[%dx%d] test=%d get=%d        res=%g ref=%g res/ref=%g ratio=%g ratioMax=%g\n",
1398              rows, cols, (res < ref * test_accuracy), getOrtho(), res, ref, res/ref, ratio, ratioMax);
1399     }
1400   }
1401   return (res < ref * test_accuracy);
1402 }
1403 
1404 template<typename T>
dot(const Vector<T> * x,const Vector<T> * y)1405 T Vector<T>::dot(const Vector<T>* x, const Vector<T>* y) {
1406   assert(x->cols == 1);
1407   assert(y->cols == 1);
1408   assert(x->rows == y->rows);
1409   // TODO: Beware of large vectors (>2 billion elements) !
1410   return proxy_cblas_convenience::dot_c(x->rows, x->const_ptr(), 1, y->const_ptr(), 1);
1411 }
1412 
1413 template<typename T>
absoluteMaxIndex(int startIndex) const1414 int Vector<T>::absoluteMaxIndex(int startIndex) const {
1415   assert(this->cols == 1);
1416   return startIndex + proxy_cblas::i_amax(this->rows - startIndex, this->const_ptr() + startIndex, 1);
1417 }
1418 
// explicit template instantiations
1420 template class ScalarArray<S_t>;
1421 template class ScalarArray<D_t>;
1422 template class ScalarArray<C_t>;
1423 template class ScalarArray<Z_t>;
1424 
// explicit template instantiations
1426 template class Vector<S_t>;
1427 template class Vector<D_t>;
1428 template class Vector<C_t>;
1429 template class Vector<Z_t>;
1430 }  // end namespace hmat
1431