/*
  HMat-OSS (HMatrix library, open source software)

  Copyright (C) 2014-2015 Airbus Group SAS

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License
  as published by the Free Software Foundation; either version 2
  of the License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

  http://github.com/jeromerobert/hmat-oss
*/

/*! \file
  \ingroup HMatrix
  \brief Dense Matrix implementation.
*/
#include "config.h"

#ifdef __INTEL_COMPILER
#include <mathimf.h>
#else
#include <cmath>
#endif

#include "scalar_array.hpp"

#include "data_types.hpp"
#include "lapack_overloads.hpp"
#include "blas_overloads.hpp"
#include "lapack_exception.hpp"
#include "common/memory_instrumentation.hpp"
#include "system_types.h"
#include "common/my_assert.h"
#include "common/context.hpp"
#include "common/timeline.hpp"
#include "lapack_operations.hpp"

#include <cstring> // memcpy
#include <algorithm> // swap
#include <iostream>
#include <fstream>
#include <sstream> // stringstream, used by InvalidDiagonalException
#include <vector>
#include <cmath>
#include <fcntl.h>

#ifndef _WIN32
#include <sys/mman.h> // mmap
#endif

#include <sys/stat.h>

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#include <stdlib.h>

#ifdef HAVE_JEMALLOC
#define JEMALLOC_NO_DEMANGLE
#include <jemalloc/jemalloc.h>
#endif

#ifdef _MSC_VER
// Intel compiler defines isnan in global namespace
// MSVC defines _isnan
# ifndef __INTEL_COMPILER
#  define isnan _isnan
# endif
#elif __GLIBC__ == 2 && __GLIBC_MINOR__ < 23
// https://sourceware.org/bugzilla/show_bug.cgi?id=19439
#elif __cplusplus >= 201103L || !defined(__GLIBC__)
using std::isnan;
#endif

namespace hmat {

Factorization convert_int_to_factorization(int t) {
  switch (t) {
  case hmat_factorization_none:
    return Factorization::NONE;
  case hmat_factorization_lu:
    return Factorization::LU;
  case hmat_factorization_ldlt:
    return Factorization::LDLT;
  case hmat_factorization_llt:
    return Factorization::LLT;
  default:
    HMAT_ASSERT(false);
  }
}

int convert_factorization_to_int(Factorization f) {
  switch (f) {
  case Factorization::NONE:
    return hmat_factorization_none;
  case Factorization::LU:
    return hmat_factorization_lu;
  case Factorization::LDLT:
    return hmat_factorization_ldlt;
  case Factorization::LLT:
    return hmat_factorization_llt;
  default:
    HMAT_ASSERT(false);
  }
}

/** ScalarArray */
template<typename T>
ScalarArray<T>::ScalarArray(T* _m, int _rows, int _cols, int _lda)
  : ownsMemory(false), m(_m), rows(_rows), cols(_cols), lda(_lda) {
  if (lda == -1) {
    lda = rows;
  }
  ownsFlag = true;
  is_ortho = (int*)calloc(1, sizeof(int));
  assert(lda >= rows);
}

template<typename T>
ScalarArray<T>::ScalarArray(int _rows, int _cols, bool initzero)
  : ownsMemory(true), ownsFlag(true), rows(_rows), cols(_cols), lda(_rows) {
  size_t size = sizeof(T) * ((size_t) rows) * cols; // cast to size_t to avoid int overflow
  void * p;
#ifdef HAVE_JEMALLOC
  p = initzero ? je_calloc(size, 1) : je_malloc(size);
#else
  p = initzero ? calloc(size, 1) : malloc(size);
#endif
  m = static_cast<T*>(p);
  is_ortho = (int*)calloc(1, sizeof(int));
  setOrtho(initzero ? 1 : 0); // buffer filled with 0 is orthogonal
  HMAT_ASSERT_MSG(m, "Trying to allocate %ldb of memory failed (rows=%d cols=%d sizeof(T)=%d)", size, rows, cols, sizeof(T));
  MemoryInstrumenter::instance().alloc(size, MemoryInstrumenter::FULL_MATRIX);
}

template<typename T> ScalarArray<T>::~ScalarArray() {
  if (ownsMemory) {
    size_t size = ((size_t) rows) * cols * sizeof(T);
    MemoryInstrumenter::instance().free(size, MemoryInstrumenter::FULL_MATRIX);
#ifdef HAVE_JEMALLOC
    je_free(m);
#else
    free(m);
#endif
    m = NULL;
  }
  if (ownsFlag) {
    free(is_ortho);
    is_ortho = NULL;
  }
}

template<typename T> void ScalarArray<T>::resize(int col_num) {
  assert(ownsFlag);
  if(col_num > cols)
    setOrtho(0);
  int diffcol = col_num - cols;
  if(diffcol > 0)
    MemoryInstrumenter::instance().alloc(sizeof(T) * ((size_t) rows) * diffcol,
                                         MemoryInstrumenter::FULL_MATRIX);
  else
    MemoryInstrumenter::instance().free(sizeof(T) * ((size_t) rows) * -diffcol,
                                        MemoryInstrumenter::FULL_MATRIX);
  cols = col_num;
#ifdef HAVE_JEMALLOC
  void * p = je_realloc(m, sizeof(T) * ((size_t) rows) * cols);
#else
  void * p = realloc(m, sizeof(T) * ((size_t) rows) * cols);
#endif
  m = static_cast<T*>(p);
}

template<typename T> void ScalarArray<T>::clear() {
  assert(lda == rows);
  std::fill(m, m + ((size_t) rows) * cols, Constants<T>::zero);
  setOrtho(1); // we don't use ptr(): buffer filled with 0 is orthogonal
}

template<typename T> size_t ScalarArray<T>::storedZeros() const {
  size_t result = 0;
  for (int col = 0; col < cols; col++) {
    for (int row = 0; row < rows; row++) {
      if (std::abs(get(row, col)) < 1e-16) {
        result++;
      }
    }
  }
  return result;
}

template<typename T> void ScalarArray<T>::scale(T alpha) {
  increment_flops(Multipliers<T>::mul * ((size_t) rows) * cols);
  if (lda == rows) {
    if (alpha == Constants<T>::zero) {
      this->clear();
    } else {
      // BLAS takes a 32-bit integer size: process huge buffers in blocks to avoid overflow
      size_t nm = ((size_t) rows) * cols;
      const size_t block_size_blas = 1 << 30;
      while (nm > block_size_blas) {
        proxy_cblas::scal(block_size_blas, alpha, ptr() + nm - block_size_blas, 1);
        nm -= block_size_blas;
      }
      proxy_cblas::scal(nm, alpha, ptr(), 1);
    }
  } else {
    T* x = ptr();
    if (alpha == Constants<T>::zero) {
      for (int col = 0; col < cols; col++) {
        std::fill(x, x + rows, Constants<T>::zero);
        x += lda;
      }
    } else {
      for (int col = 0; col < cols; col++) {
        proxy_cblas::scal(rows, alpha, x, 1);
        x += lda;
      }
    }
  }
  if (alpha == Constants<T>::zero) setOrtho(1); // buffer filled with 0 is orthogonal
}

template<typename T> void ScalarArray<T>::transpose() {
  assert(lda == rows);
#ifdef HAVE_MKL_IMATCOPY
  proxy_mkl::imatcopy(rows, cols, ptr());
  std::swap(rows, cols);
  lda = rows;
#else
  if (rows == cols) {
    // "Fast" path: swap in place
    for (int col = 0; col < cols; col++) {
      for (int row = 0; row < col; row++) {
        T tmp = get(row, col);
        get(row, col) = get(col, row);
        get(col, row) = tmp;
      }
    }
  } else {
    ScalarArray<T> *tmp = copy();
    std::swap(rows, cols);
    lda = rows;
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        get(i, j) = tmp->get(j, i);
      }
    }
    delete tmp;
  }
#endif
}

template<typename T> void ScalarArray<T>::conjugate() {
  if (lda == rows) {
    // LAPACK takes a 32-bit integer size: process huge buffers in blocks to avoid overflow
    size_t nm = ((size_t) rows) * cols;
    const size_t block_size_blas = 1 << 30;
    while (nm > block_size_blas) {
      // We don't use ptr() on purpose, because is_ortho is preserved here
      proxy_lapack::lacgv(block_size_blas, m + nm - block_size_blas, 1);
      nm -= block_size_blas;
    }
    proxy_lapack::lacgv(nm, m, 1);
  } else {
    T* x = m;
    for (int col = 0; col < cols; col++) {
      proxy_lapack::lacgv(rows, x, 1);
      x += lda;
    }
  }
}

template<typename T> ScalarArray<T>* ScalarArray<T>::copy(ScalarArray<T>* result) const {
  if(result == NULL)
    result = new ScalarArray<T>(rows, cols, false);

  if (lda == rows && result->lda == result->rows) {
    size_t size = ((size_t) rows) * cols * sizeof(T);
    memcpy(result->ptr(), const_ptr(), size);
  } else {
    for (int col = 0; col < cols; col++) {
      size_t resultOffset = ((size_t) result->lda) * col;
      size_t offset = ((size_t) lda) * col;
      memcpy(result->ptr() + resultOffset, const_ptr() + offset, rows * sizeof(T));
    }
  }
  result->setOrtho(getOrtho());
  return result;
}

template<typename T> ScalarArray<T>* ScalarArray<T>::copyAndTranspose(ScalarArray<T>* result) const {
  if(result == NULL)
    result = new ScalarArray<T>(cols, rows);
#ifdef HAVE_MKL_IMATCOPY
  if (lda == rows && result->lda == result->rows) {
    proxy_mkl::omatcopy(rows, cols, const_ptr(), result->ptr());
  } else {
#endif
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      result->get(j, i) = get(i, j);
    }
  }
#ifdef HAVE_MKL_IMATCOPY
  }
#endif
  return result;
}

template<typename T>
ScalarArray<T> ScalarArray<T>::rowsSubset(const int rowsOffset, const int rowsSize) const { // TODO: remove it, we have a constructor to do this
  assert(rowsOffset + rowsSize <= rows);
  return ScalarArray<T>(*this, rowsOffset, rowsSize, 0, cols);
}

template<typename T>
void ScalarArray<T>::gemm(char transA, char transB, T alpha,
                          const ScalarArray<T>* a, const ScalarArray<T>* b,
                          T beta) {
  const int aRows = (transA == 'N' ? a->rows : a->cols);
  const int n = (transB == 'N' ? b->cols : b->rows);
  const int k = (transA == 'N' ? a->cols : a->rows);
  const int tA = (transA == 'N' ? 0 : (transA == 'T' ? 1 : 2));
  const int tB = (transB == 'N' ? 0 : (transB == 'T' ? 1 : 2));
  Timeline::Task t(Timeline::BLASGEMM, &rows, &cols, &k, &tA, &tB);
  (void)t;
  assert(rows == aRows);
  assert(cols == n);
  assert(k == (transB == 'N' ? b->rows : b->cols));
  {
    const size_t _m = aRows, _n = n, _k = k;
    const size_t adds = _m * _n * _k;
    const size_t muls = _m * _n * _k;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  assert(a->lda >= a->rows);
  assert(b->lda >= b->rows);
  assert(a->lda > 0);
  assert(b->lda > 0);

  if (n > 1 || transB != 'N')
    proxy_cblas::gemm(transA, transB, aRows, n, k, alpha, a->const_ptr(), a->lda, b->const_ptr(), b->lda,
                      beta, this->ptr(), this->lda);
  else
    // Single-column, non-transposed b: a matrix-vector product is cheaper
    proxy_cblas::gemv(transA, a->rows, a->cols, alpha, a->const_ptr(), a->lda, b->const_ptr(), 1, beta, this->ptr(), 1);
}
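
// A minimal usage sketch (not from the library itself; the shapes are
// hypothetical): computing c := a.b for a 3x4 'a' and a 4x2 'b'.
//
//   ScalarArray<D_t> a(3, 4), b(4, 2), c(3, 2);
//   // ... fill a and b ...
//   c.gemm('N', 'N', Constants<D_t>::pone, &a, &b, Constants<D_t>::zero);
//
// With transA/transB set to 'T' or 'C', the operand shapes are transposed,
// exactly as in the underlying BLAS *GEMM.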

template<typename T>
void ScalarArray<T>::copyMatrixAtOffset(const ScalarArray<T>* a,
                                        int rowOffset, int colOffset) {
  assert(rowOffset + a->rows <= rows);
  assert(colOffset + a->cols <= cols);

  if (rowOffset == 0 && a->rows == rows &&
      a->lda == a->rows && lda == rows) {
    memcpy(ptr() + ((size_t) colOffset) * lda, a->const_ptr(), sizeof(T) * rows * a->cols);
    // If I copy the whole matrix, I copy this flag
    if(a->cols == cols)
      setOrtho(a->getOrtho());
  } else {
    for (int col = 0; col < a->cols; col++) {
      memcpy(ptr() + rowOffset + ((size_t) (colOffset + col)) * lda,
             a->const_ptr() + ((size_t) col) * a->lda,
             sizeof(T) * a->rows);
    }
  }
}

template<typename T, typename std::enable_if<hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
void addRandSFINAE(ScalarArray<T>& a, double epsilon) {
  if (a.lda == a.rows) {
    for (size_t i = 0; i < ((size_t) a.rows) * a.cols; ++i) {
      a.get(i) *= 1.0 + epsilon * (1.0 - 2.0 * rand() / (double)RAND_MAX);
    }
  } else {
    for (int col = 0; col < a.cols; ++col) {
      for (int row = 0; row < a.rows; ++row) {
        a.get(row, col) *= 1.0 + epsilon * (1.0 - 2.0 * rand() / (double)RAND_MAX);
      }
    }
  }
}

template<typename T, typename std::enable_if<!hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
void addRandSFINAE(ScalarArray<T>& a, double epsilon) {
  if (a.lda == a.rows) {
    for (size_t i = 0; i < ((size_t) a.rows) * a.cols; ++i) {
      typename T::value_type c1 = 1.0 + epsilon * (1.0 - 2.0 * rand() / (double)RAND_MAX);
      typename T::value_type c2 = 1.0 + epsilon * (1.0 - 2.0 * rand() / (double)RAND_MAX);
      a.get(i) *= T(c1, c2);
    }
  } else {
    for (int col = 0; col < a.cols; ++col) {
      for (int row = 0; row < a.rows; ++row) {
        typename T::value_type c1 = 1.0 + epsilon * (1.0 - 2.0 * rand() / (double)RAND_MAX);
        typename T::value_type c2 = 1.0 + epsilon * (1.0 - 2.0 * rand() / (double)RAND_MAX);
        a.get(row, col) *= T(c1, c2);
      }
    }
  }
}

template<typename T>
void ScalarArray<T>::addRand(double epsilon) {
  DECLARE_CONTEXT;
  addRandSFINAE<T>(*this, epsilon);
}

template<typename T>
void ScalarArray<T>::axpy(T alpha, const ScalarArray<T>* a) {
  assert(rows == a->rows);
  assert(cols == a->cols);
  size_t size = ((size_t) rows) * cols;

  increment_flops(Multipliers<T>::add * size
                  + (alpha == Constants<T>::pone ? 0 : Multipliers<T>::mul * size));
  // Fast path
  if ((lda == rows) && (a->lda == a->rows) && (size < 1000000000)) {
    proxy_cblas::axpy(size, alpha, a->const_ptr(), 1, ptr(), 1);
    return;
  }

  for (int col = 0; col < cols; col++) {
    proxy_cblas::axpy(rows, alpha, a->const_ptr() + ((size_t) col) * a->lda, 1, ptr() + ((size_t) col) * lda, 1);
  }
}

template<typename T>
double ScalarArray<T>::normSqr() const {
  size_t size = ((size_t) rows) * cols;
  T result = Constants<T>::zero;

  // Fast path
  if ((size < 1000000000) && (lda == rows)) {
    result += proxy_cblas_convenience::dot_c(size, const_ptr(), 1, const_ptr(), 1);
    return real(result);
  }
  for (int col = 0; col < cols; col++) {
    result += proxy_cblas_convenience::dot_c(rows, const_ptr() + ((size_t) col) * lda, 1, const_ptr() + ((size_t) col) * lda, 1);
  }
  return real(result);
}

template<typename T> double ScalarArray<T>::norm() const {
  return sqrt(normSqr());
}

// Compute the squared Frobenius norm of a.b^T (with a = this)
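// The identity used below: writing a_i and b_i for the columns of 'a' and 'b',
//   ||a.b^T||_F^2 = sum_{i,j} (a_i^H.a_j) (b_i^H.b_j)
// The (i,j) and (j,i) cross terms are complex conjugates of each other, so the
// strictly triangular part is computed once and its real part doubled, which
// avoids forming the rows(a) x rows(b) product explicitly.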
template<typename T> double ScalarArray<T>::norm_abt_Sqr(const ScalarArray<T> &b) const {
  double result = 0;
  const int k = cols;
  for (int i = 1; i < k; ++i) {
    for (int j = 0; j < i; ++j) {
      result += real(proxy_cblas_convenience::dot_c(rows, const_ptr(0, i), 1, const_ptr(0, j), 1) *
                     proxy_cblas_convenience::dot_c(b.rows, b.const_ptr(0, i), 1, b.const_ptr(0, j), 1));
    }
  }
  result *= 2.0;
  for (int i = 0; i < k; ++i) {
    result += real(proxy_cblas_convenience::dot_c(rows, const_ptr(0, i), 1, const_ptr(0, i), 1) *
                   proxy_cblas_convenience::dot_c(b.rows, b.const_ptr(0, i), 1, b.const_ptr(0, i), 1));
  }
  return result;
}

// Compute the dot product between this[i,*] and b[j,*]
template<typename T> T ScalarArray<T>::dot_aibj(int i, const ScalarArray<T> &b, int j) const {
  return proxy_cblas::dot(cols, &get(i, 0), lda, &b.get(j, 0), b.lda);
}

template<typename T> void ScalarArray<T>::fromFile(const char * filename) {
  FILE * f = fopen(filename, "rb");
  HMAT_ASSERT(f);
  /* Read the header before the data: [stype, rows, cols, sizeof(T), 0] */
  int code;
  int r = fread(&code, sizeof(int), 1, f);
  HMAT_ASSERT(r == 1);
  HMAT_ASSERT(code == Constants<T>::code);
  r = fread(&rows, sizeof(int), 1, f);
  lda = rows;
  HMAT_ASSERT(r == 1);
  r = fread(&cols, sizeof(int), 1, f);
  HMAT_ASSERT(r == 1);
  r = fseek(f, 2 * sizeof(int), SEEK_CUR);
  HMAT_ASSERT(r == 0);
  if(m)
    free(m);
  size_t size = ((size_t) rows) * cols * sizeof(T);
  m = (T*) calloc(size, 1);
  r = fread(ptr(), size, 1, f);
  fclose(f);
  HMAT_ASSERT(r == 1);
}

template<typename T> void ScalarArray<T>::toFile(const char *filename) const {
  int ierr;
  int fd;
  size_t size = ((size_t) rows) * cols * sizeof(T) + 5 * sizeof(int);

  HMAT_ASSERT(lda == rows);

  fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600);
  HMAT_ASSERT(fd != -1);
  ierr = lseek(fd, size - 1, SEEK_SET);
  HMAT_ASSERT(ierr != -1);
  ierr = write(fd, "", 1);
  HMAT_ASSERT(ierr == 1);
#ifndef _WIN32
  void* mmapedFile = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  ierr = (mmapedFile == MAP_FAILED) ? 1 : 0;
  HMAT_ASSERT(!ierr);
  /* Write the header before the data: [stype, rows, cols, sizeof(T), 0] */
  int *asIntArray = (int*) mmapedFile;
  asIntArray[0] = Constants<T>::code;
  asIntArray[1] = rows;
  asIntArray[2] = cols;
  asIntArray[3] = sizeof(T);
  asIntArray[4] = 0;
  asIntArray += 5;
  T* mat = (T*) asIntArray;
  memcpy(mat, const_ptr(), size - 5 * sizeof(int));
  close(fd);
  munmap(mmapedFile, size);
#else
  HMAT_ASSERT_MSG(false, "mmap not available on this platform");
#endif
}

template<typename T> size_t ScalarArray<T>::memorySize() const {
  return ((size_t) rows) * cols * sizeof(T);
}

template<typename T, typename std::enable_if<hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
void checkNanSFINAE(const ScalarArray<T>* m) {
  for (int col = 0; col < m->cols; col++) {
    for (int row = 0; row < m->rows; row++) {
      HMAT_ASSERT(!isnan(m->get(row, col)));
    }
  }
}

template<typename T, typename std::enable_if<!hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
void checkNanSFINAE(const ScalarArray<T>* m) {
  for (int col = 0; col < m->cols; col++) {
    for (int row = 0; row < m->rows; row++) {
      HMAT_ASSERT(!isnan(m->get(row, col).real()));
      HMAT_ASSERT(!isnan(m->get(row, col).imag()));
    }
  }
}

template<typename T>
void ScalarArray<T>::checkNan() const {
  checkNanSFINAE<T>(this);
}

template<typename T> bool ScalarArray<T>::isZero() const {
  for(int i = 0; i < rows; i++)
    for(int j = 0; j < cols; j++)
      if (get(i, j) != Constants<T>::zero)
        return false;
  return true;
}

// this := alpha*x*b^T + this, with x and b column vectors
template<typename T> void ScalarArray<T>::rankOneUpdate(const T alpha, const ScalarArray<T> &x, const ScalarArray<T> &b){
  assert(x.rows == rows);
  assert(x.cols == 1);
  assert(b.rows == cols);
  assert(b.cols == 1);
  proxy_cblas::ger(rows, cols, alpha, x.const_ptr(), 1, b.const_ptr(), 1, ptr(), lda);
}

// this := alpha*x*tb + this, with x a column vector and tb a row vector
template<typename T> void ScalarArray<T>::rankOneUpdateT(const T alpha, const ScalarArray<T> &x, const ScalarArray<T> &tb){
  assert(x.rows == rows);
  assert(x.cols == 1);
  assert(tb.rows == 1);
  assert(tb.cols == cols);
  proxy_cblas::ger(rows, cols, alpha, x.const_ptr(), 1, tb.const_ptr(), tb.lda, ptr(), lda);
}
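
// Both methods above map to the same BLAS *GER rank-one update
// this := alpha.x.y^T + this; in rankOneUpdateT the vector 'tb' is a
// 1 x cols row of an array, so consecutive elements are 'tb.lda' apart
// in memory, hence the stride passed to ger().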

template<typename T> void ScalarArray<T>::writeArray(hmat_iostream writeFunc, void * userData) const {
  assert(lda == rows);
  size_t s = (size_t)rows * cols;
  // We use m instead of const_ptr() because writeFunc() expects a void*, not a const void*
  writeFunc(m, sizeof(T) * s, userData);
}

template<typename T> void ScalarArray<T>::readArray(hmat_iostream readFunc, void * userData) {
  assert(lda == rows);
  size_t s = (size_t)rows * cols;
  readFunc(ptr(), sizeof(T) * s, userData);
}

template<typename T> void ScalarArray<T>::luDecomposition(int *pivots) {
  int info;
  {
    const size_t _m = rows, _n = cols;
    const size_t muls = _m * _n * _n / 2 - _n * _n * _n / 6 + _m * _n / 2 - _n * _n / 2 + 2 * _n / 3;
    const size_t adds = _m * _n * _n / 2 - _n * _n * _n / 6 + _m * _n / 2 + _n / 6;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  info = proxy_lapack::getrf(rows, cols, ptr(), lda, pivots);
  if (info)
    throw LapackException("getrf", info);
}

template<typename T>
class InvalidDiagonalException: public LapackException {
  std::string invalidDiagonalMessage_;
public:
  InvalidDiagonalException(const T value, const int j, const char * where)
    : LapackException(where, -1)
  {
    std::stringstream sstm;
    sstm << "In " << where << ", diagonal index " << j << " has an invalid value " << value;
    invalidDiagonalMessage_ = sstm.str();
  }

  virtual const char* what() const noexcept {
    return invalidDiagonalMessage_.c_str();
  }
};

template<typename T, typename std::enable_if<hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
void assertPositive(const T v, const int j, const char * const where) {
  if(!(v > 0))
    throw InvalidDiagonalException<T>(v, j, where);
}

template<typename T, typename std::enable_if<!hmat::Types<T>::IS_REAL::value, T*>::type = nullptr>
void assertPositive(const T v, const int j, const char * const where) {
  if(v == Constants<T>::zero)
    throw InvalidDiagonalException<T>(v, j, where);
}

template<typename T>
void ScalarArray<T>::ldltDecomposition(Vector<T>& diagonal) {
  assert(rows == cols); // We expect a square matrix
  // TODO: add flops counter

  // The standard LDLt factorization algorithm is:
  //   diag[j] = A(j,j) - sum_{k < j} L(j,k)^2 diag[k]
  //   L(i,j) = (A(i,j) - sum_{k < j} L(i,k) L(j,k) diag[k]) / diag[j]
  // See for instance http://en.wikipedia.org/wiki/Cholesky_decomposition
  // An auxiliary array is introduced in order to perform fewer multiplications,
  // see algorithm 1 in http://icl.cs.utk.edu/projectsfiles/plasma/pubs/ICL-UT-11-03.pdf
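  // A worked 2x2 example of the recurrence above:
  //   A = [4 2; 2 3]  =>  L = [1 0; 0.5 1], diag = [4, 2],
  // and indeed L.diag.L^T restores A. On exit the unit lower factor is
  // stored in 'this' and the diagonal is returned in 'diagonal'.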
  int n = rows;
  T* v = new T[n];
  HMAT_ASSERT(v);
  for (int j = 0; j < n; j++) {
    for (int i = 0; i < j; i++)
      v[i] = get(j,i) * get(i,i);

    v[j] = get(j,j);
    for (int i = 0; i < j; i++)
      // Do not use the -= operator because it's buggy in the Intel compiler
      v[j] = v[j] - get(j,i) * v[i];

    get(j,j) = v[j];
    for (int i = 0; i < j; i++)
      for (int k = j+1; k < n; k++)
        get(k,j) -= get(k,i) * v[i];

    for (int k = j+1; k < n; k++) {
      if (v[j] == Constants<T>::zero)
        throw InvalidDiagonalException<T>(v[j], j, "ldltDecomposition");
      get(k,j) /= v[j];
    }
  }

  for(int i = 0; i < n; i++) {
    diagonal[i] = get(i,i);
    get(i,i) = Constants<T>::pone;
    for (int j = i + 1; j < n; j++)
      get(i,j) = Constants<T>::zero;
  }

  delete[] v;
}

template<typename T>
void ScalarArray<T>::lltDecomposition() {
  assert(rows == cols); // We expect a square matrix

  // The standard LLt factorization algorithm is:
  //   L(j,j) = sqrt(A(j,j) - sum_{k < j} L(j,k)^2)
  //   L(i,j) = (A(i,j) - sum_{k < j} L(i,k) L(j,k)) / L(j,j)
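  //
  // A worked 2x2 example of the recurrence above:
  //   A = [4 2; 2 3]  =>  L = [2 0; 1 sqrt(2)], and L.L^T restores A.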

  // Flop counts from http://www.netlib.org/lapack/lawnspdf/lawn41.pdf page 120
  int n = rows;
  const size_t n2 = (size_t) n * n;
  const size_t n3 = n2 * n;
  const size_t muls = n3 / 6 + n2 / 2 + n / 3;
  const size_t adds = n3 / 6 - n / 6;
  increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);

  for (int j = 0; j < n; j++) {
    for (int k = 0; k < j; k++)
      get(j,j) -= get(j,k) * get(j,k);
    assertPositive(get(j, j), j, "lltDecomposition");

    get(j,j) = std::sqrt(get(j,j));

    for (int k = 0; k < j; k++)
      for (int i = j+1; i < n; i++)
        get(i,j) -= get(i,k) * get(j,k);

    for (int i = j+1; i < n; i++) {
      get(i,j) /= get(j,j);
    }
  }

  // For real matrices we could use the LAPACK version, which may be faster
  // (there is no L.L^T factorization for complex matrices in LAPACK):
  // int info = proxy_lapack::potrf('L', this->rows(), this->data.m, this->data.lda);
  // if(info != 0)
  //   // throw a pointer to be compliant with the Status class
  //   throw hmat::LapackException("potrf", info);

  // Zero the strictly upper triangular part
  for (int j = 0; j < n; j++) {
    for(int i = 0; i < j; i++) {
      get(i,j) = Constants<T>::zero;
    }
  }
}

// Helper functions for solveTriangular
inline char to_blas(const Side side) {
  return side == Side::LEFT ? 'L' : 'R';
}

inline char to_blas(const Uplo uplo) {
  return uplo == Uplo::LOWER ? 'L' : 'U';
}

inline char to_blas(const Diag diag) {
  return diag == Diag::UNIT ? 'U' : 'N';
}


// The following code is very close to that of ZGETRS in LAPACK.
// However, the resolution is split into two parts here.

// Warning! The matrix has been obtained with ZGETRF, therefore it is
// permuted! We have the factorization A = P L U with P the
// permutation matrix. So to solve L X = B, we must actually
// solve L X = P^{-1} B, which is done by ZLASWP applied with
// the permutation, just as in ZGETRS.
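// For instance, if ZGETRF swapped rows 1 and 2 of a 2x2 matrix, it returns
// pivots = {2, 2} (1-based), and ZLASWP replays the same row interchanges
// on B before the triangular solve.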
template<typename T>
void ScalarArray<T>::solveLowerTriangularLeft(ScalarArray<T>* x, const FactorizationData<T>& context, Diag diag, Uplo uplo) const {
  {
    const size_t _m = rows, _n = x->cols;
    const size_t adds = _n * _m * (_m - 1) / 2;
    const size_t muls = _n * _m * (_m + 1) / 2;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  if (context.algo == Factorization::LU && uplo == Uplo::LOWER)
    proxy_lapack::laswp(x->cols, x->ptr(), x->lda, 1, rows, context.data.pivots, 1);
  proxy_cblas::trsm('L', to_blas(uplo), uplo == Uplo::LOWER ? 'N' : 'T', to_blas(diag), rows, x->cols, Constants<T>::pone, const_ptr(), lda, x->ptr(), x->lda);
}

// The resolution of the upper triangular system does not need to
// change the order of the columns.
// The pivots are not needed here, but they help to check that
// the matrix was factorized beforehand.

template<typename T>
void ScalarArray<T>::solveUpperTriangularRight(ScalarArray<T>* x, const FactorizationData<T>& context, Diag diag, Uplo uplo) const {
  // Empty matrix
  if (x->rows == 0 || x->cols == 0) return;

  {
    const size_t _m = rows, _n = x->cols;
    const size_t adds = _n * _m * (_m - 1) / 2;
    const size_t muls = _n * _m * (_m + 1) / 2;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  proxy_cblas::trsm('R', to_blas(uplo), uplo == Uplo::LOWER ? 'T' : 'N', to_blas(diag),
                    x->rows, x->cols, Constants<T>::pone, const_ptr(), lda, x->ptr(), x->lda);
}

template<typename T>
void ScalarArray<T>::solveUpperTriangularLeft(ScalarArray<T>* x, const FactorizationData<T>&, Diag diag, Uplo uplo) const {
  // Empty matrix
  if (x->rows == 0 || x->cols == 0) return;

  {
    const size_t _m = rows, _n = x->cols;
    const size_t adds = _n * _m * (_n - 1) / 2;
    const size_t muls = _n * _m * (_n + 1) / 2;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  proxy_cblas::trsm('L', to_blas(uplo), uplo == Uplo::LOWER ? 'T' : 'N', to_blas(diag),
                    x->rows, x->cols, Constants<T>::pone, const_ptr(), lda, x->ptr(), x->lda);
}

template<typename T>
void ScalarArray<T>::solve(ScalarArray<T>* x, const FactorizationData<T>& context) const {
  // Empty matrix
  if (x->rows == 0 || x->cols == 0) return;

  int ierr = 0;
  {
    const size_t nrhs = x->cols;
    const size_t n = rows;
    const size_t adds = n * n * nrhs;
    const size_t muls = (n * n - n) * nrhs;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }
  HMAT_ASSERT(context.algo == Factorization::LU);
  ierr = proxy_lapack::getrs('N', rows, x->cols, const_ptr(), lda, context.data.pivots, x->ptr(), x->rows);
  if (ierr)
    throw LapackException("getrs", ierr);
}


template<typename T>
void ScalarArray<T>::inverse() {
  // The inversion is done in two steps: dgetrf for the LU decomposition,
  // then dgetri for the inversion of the triangular factors

  assert(rows == cols);

  int *ipiv = new int[rows];
  int info;
  {
    size_t vn = cols, vm = cols;
    // getrf
    size_t additions = (vm*vn*vn)/2 - (vn*vn*vn)/6 - (vm*vn)/2 + vn/6;
    size_t multiplications = (vm*vn*vn)/2 - (vn*vn*vn)/6 + (vm*vn)/2
      - (vn*vn)/2 + 2*vn/3;
    increment_flops(Multipliers<T>::add * additions + Multipliers<T>::mul * multiplications);
    // getri
    additions = (2*vn*vn*vn)/3 - (3*vn*vn)/2 + (5*vn)/6;
    multiplications = (2*vn*vn*vn)/3 + (vn*vn)/2 + (5*vn)/6;
    increment_flops(Multipliers<T>::add * additions + Multipliers<T>::mul * multiplications);
  }
  info = proxy_lapack::getrf(rows, cols, ptr(), lda, ipiv);
  HMAT_ASSERT(!info);
  // We call getri twice: first to query the optimal size of the
  // temporary workspace, then to do the actual computation.
  int workSize;
  T workSize_req;
  info = proxy_lapack::getri(rows, ptr(), lda, ipiv, &workSize_req, -1);
  HMAT_ASSERT(!info);
  workSize = (int) real(workSize_req) + 1;
  T* work = new T[workSize];
  HMAT_ASSERT(work);
  info = proxy_lapack::getri(rows, ptr(), lda, ipiv, work, workSize);
  delete[] work;
  if (info)
    throw LapackException("getri", info);
  delete[] ipiv;
}

/*! \brief Returns the number of singular values to keep.

  The stopping criterion is (assuming the singular values
  are sorted in descending order):
    sigma[k] / sigma[0] < epsilon
  unless the env. var. HMAT_SUM_CRITERION is set, in which case the criterion is:
    sigma[k] / sum(sigma) < epsilon

  \param sigma table of singular values, with at least maxK elements.
  \param epsilon tolerance.
  \return int the number of singular values to keep.
 */
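// For instance, with sigma = {10, 1, 1e-5} and epsilon = 1e-4, the default
// criterion gives the threshold 10 * 1e-4 = 1e-3, so findK() returns 2.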
static int findK(Vector<double> &sigma, double epsilon) {
  assert(epsilon >= 0.);
  static char *useSumCriterion = getenv("HMAT_SUM_CRITERION");
  double threshold_eigenvalue = 0.0;
  if (useSumCriterion == NULL) {
    threshold_eigenvalue = sigma[0];
  } else {
    for (int i = 0; i < sigma.rows; i++) {
      threshold_eigenvalue += sigma[i];
    }
  }
  threshold_eigenvalue *= epsilon;
  int i = 0;
  for (i = 0; i < sigma.rows; i++) {
    if (sigma[i] <= threshold_eigenvalue) {
      break;
    }
  }
  return i;
}

template<typename T> int ScalarArray<T>::truncatedSvdDecomposition(ScalarArray<T>** u, ScalarArray<T>** v, double epsilon, bool workAroundFailures) const {
  Vector<double>* sigma = NULL;

  svdDecomposition(u, &sigma, v, workAroundFailures);

  // Control of the approximation
  int newK = findK(*sigma, epsilon);

  if(newK == 0) {
    delete *u;
    delete *v;
    delete sigma;
    *u = NULL;
    *v = NULL;
    return 0;
  }

  (*u)->resize(newK);
  sigma->rows = newK;
  (*v)->resize(newK);

  // We put the square roots of the singular values in sigma
  for (int i = 0; i < newK; i++)
    (*sigma)[i] = sqrt((*sigma)[i]);

  // Apply sigma 'symmetrically' to u and v
  (*u)->multiplyWithDiag(sigma);
  (*v)->multiplyWithDiag(sigma);
  delete sigma;

  return newK;
}
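
// The truncation above keeps M ~ U_k.S_k.V_k^T and folds sqrt(S_k) into both
// factors, i.e. M ~ (U_k.sqrt(S_k)).(V_k.sqrt(S_k))^T, so that the singular
// values are shared evenly and neither factor dominates in norm.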

template<typename T> int ScalarArray<T>::svdDecomposition(ScalarArray<T>** u, Vector<double>** sigma, ScalarArray<T>** v, bool workAroundFailures) const {
  DECLARE_CONTEXT;
  static char * useGESDD = getenv("HMAT_GESDD");
  Timeline::Task t(Timeline::SVD, &rows, &cols);
  (void)t;
  // Allocate free space for U, S, V
  int p = std::min(rows, cols);

  *u = new ScalarArray<T>(rows, p, false);
  *sigma = new Vector<double>(p);
  *v = new ScalarArray<T>(p, cols, false); // We create v in transposed shape (as expected by lapack zgesvd)

  // To be able to work around a failure in the SVD, we must keep a copy of 'this'
  ScalarArray<T> *a = workAroundFailures ? copy() : NULL;

  assert(lda >= rows);

  char jobz = 'S';
  int info = 0;

  {
    const size_t _m = rows, _n = cols;
    // Warning: these quantities are a rough approximation.
    // What's wrong with these estimates:
    //  - Golub only gives 14 M.N^2 + 8 N^3
    //  - it is for real numbers
    //  - we assume the same number of * and +
    size_t adds = 7 * _m * _n * _n + 4 * _n * _n * _n;
    size_t muls = 7 * _m * _n * _n + 4 * _n * _n * _n;
    increment_flops(Multipliers<T>::add * adds + Multipliers<T>::mul * muls);
  }

  try {
    if(useGESDD)
      info = sddCall(jobz, rows, cols, ptr(), lda, (*sigma)->ptr(), (*u)->ptr(),
                     (*u)->lda, (*v)->ptr(), (*v)->lda);
    else
      info = svdCall(jobz, jobz, rows, cols, ptr(), lda, (*sigma)->ptr(), (*u)->ptr(),
                     (*u)->lda, (*v)->ptr(), (*v)->lda);
    (*v)->transpose();
    (*u)->setOrtho(1);
    (*v)->setOrtho(1);
    delete a;
  } catch(LapackException & e) {
    // If the SVD fails and workAroundFailures is true, return a "fake" result that
    // allows the computation to proceed. Otherwise, stop here.
    if (!workAroundFailures) throw;

    printf("%s overridden...\n", e.what());
    // If rows < cols, then p == rows, 'u' is square and 'v' has the dimensions of 'this':
    // the fake 'u' is the identity, the fake 'v' is 'this^T'
    if (rows < cols) {
      for (int i = 0; i < rows; i++)
        for (int j = 0; j < p; j++)
          (*u)->get(i,j) = i == j ? Constants<T>::pone : Constants<T>::zero;
      (*u)->setOrtho(1);
      delete *v;
      *v = a;
      (*v)->transpose();
    } else {
      // Otherwise rows >= cols, then p == cols, 'u' has the dimensions of 'this' and 'v' is square:
      // the fake 'u' is 'this', the fake 'v' is the identity
      delete *u;
      *u = a;
      for (int i = 0; i < p; i++)
        for (int j = 0; j < cols; j++)
          (*v)->get(i,j) = i == j ? Constants<T>::pone : Constants<T>::zero;
      (*v)->setOrtho(1);
    }
    // The fake 'sigma' is all ones
    for (int i = 0; i < p; i++)
      (**sigma)[i] = Constants<double>::pone;
  }

  return info;
}

template<typename T> void ScalarArray<T>::orthoColumns(ScalarArray<T> *resultR, int initialPivot) {
  DECLARE_CONTEXT;

  // We use the fact that the first 'initialPivot' columns are already orthogonal:
  // we just normalize them and update the columns >= initialPivot with an MGS-like approach
  ScalarArray<T> bK(*this, 0, rows, initialPivot, cols - initialPivot); // All the columns of 'this' after column 'initialPivot'
  for(int j = 0; j < initialPivot; ++j) {
    // Normalization of column j
    Vector<T> aj(*this, j);
    resultR->get(j,j) = aj.norm();
    T coef = Constants<T>::pone / resultR->get(j,j);
    aj.scale(coef);
  }
  // Remove the qj-component from the vectors bk (k = initialPivot, ..., n-1)
  if (initialPivot < cols) {
    static char *useBlas3 = getenv("HMAT_MGS_BLAS3");
    if (useBlas3) {
      ScalarArray<T> aJ(*this, 0, rows, 0, initialPivot); // All the columns of 'this' from 0 to 'initialPivot-1'
      ScalarArray<T> aJ_bK(*resultR, 0, initialPivot, initialPivot, cols - initialPivot); // In 'r': rows 0 to 'initialPivot-1', all the columns after column 'initialPivot-1'
      // Compute in one operation all the scalar products between a_0, ..., a_init-1 and a_init, ..., a_n-1
      aJ_bK.gemm('C', 'N', Constants<T>::pone, &aJ, &bK, Constants<T>::zero);
      // Update a_init, ..., a_n-1
      bK.gemm('N', 'N', Constants<T>::mone, &aJ, &aJ_bK, Constants<T>::pone);
    } else {
      for(int j = 0; j < initialPivot; ++j) {
        Vector<T> aj(*this, j);
        ScalarArray<T> aj_bK(*resultR, j, 1, initialPivot, cols - initialPivot); // In 'r': row 'j', all the columns after column 'initialPivot'
        // Compute in one operation all the scalar products between aj and a_initialPivot, ..., a_n-1
        aj_bK.gemm('C', 'N', Constants<T>::pone, &aj, &bK, Constants<T>::zero);
        // Update a_initialPivot, ..., a_n-1
        bK.rankOneUpdateT(Constants<T>::mone, aj, aj_bK);
      }
    }
  } // if (initialPivot < cols)
}

template<typename T> void ScalarArray<T>::qrDecomposition(ScalarArray<T> *resultR, int initialPivot) {
  DECLARE_CONTEXT;
  Timeline::Task t(Timeline::QR, &rows, &cols, &initialPivot);
  (void)t;

  static char *useInitPivot = getenv("HMAT_TRUNC_INITPIV");
  if (!useInitPivot) initialPivot = 0;
  assert(initialPivot >= 0 && initialPivot <= cols);

  ScalarArray<T> *bK = NULL, *restR = NULL, *a = this; // needed because if initialPivot > 0, the end of the computation runs on a subset of 'this'

  // Initial pivot
  if (initialPivot) {
    // Modify the columns [initialPivot, cols[ to make them orthogonal to the columns [0, initialPivot[
    orthoColumns(resultR, initialPivot);
    // Then run the QR decomposition on the remaining columns (without the initial pivot)
    bK = new ScalarArray<T>(*this, 0, rows, initialPivot, cols - initialPivot); // All the columns of 'this' after column 'initialPivot'
    restR = new ScalarArray<T>(*resultR, initialPivot, cols - initialPivot, initialPivot, cols - initialPivot); // In 'r': all the rows and columns after row/column 'initialPivot'
    a = bK;
    resultR = restR;
  }

  // SUBROUTINE DGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
  T* tau = (T*) calloc(std::min(a->rows, a->cols), sizeof(T));
  {
    size_t mm = std::max(a->rows, a->cols);
    size_t n = std::min(a->rows, a->cols);
    size_t multiplications = mm * n * n - (n * n * n) / 3 + mm * n + (n * n) / 2 + (29 * n) / 6;
    size_t additions = mm * n * n + (n * n * n) / 3 + 2 * mm * n - (n * n) / 2 + (5 * n) / 6;
    increment_flops(Multipliers<T>::mul * multiplications + Multipliers<T>::add * additions);
  }
  int info;
  int workSize;
  T workSize_S;
  // int info = LAPACKE_sgeqrf(LAPACK_COL_MAJOR, rows, cols, m, rows, *tau);
  info = proxy_lapack::geqrf(a->rows, a->cols, a->ptr(), a->rows, tau, &workSize_S, -1);
  HMAT_ASSERT(!info);
  workSize = (int) hmat::real(workSize_S) + 1;
  T* work = new T[workSize]; // TODO: allocate on the stack instead?
  HMAT_ASSERT(work);
  info = proxy_lapack::geqrf(a->rows, a->cols, a->ptr(), a->rows, tau, work, workSize);
  delete[] work;

  HMAT_ASSERT(!info);

  // Copy the 'R' factor into the upper part of resultR
  for (int col = 0; col < a->cols; col++) {
    for (int row = 0; row <= col; row++) {
      resultR->get(row, col) = a->get(row, col);
    }
  }

  // Copy tau into the last column of 'this'
  memcpy(a->ptr(0, a->cols - 1), tau, sizeof(T) * std::min(a->rows, a->cols));
  free(tau);

  // Temporary data created if initialPivot > 0
  delete bK;
  delete restR;

  return;
}
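
// A minimal usage sketch (the shapes are hypothetical): factoring an
// m x n array 'a', with m >= n and no initial pivot.
//
//   ScalarArray<D_t> r(n, n);
//   a.qrDecomposition(&r, 0);  // 'a' now holds the Householder reflectors,
//                              // with the tau scalars packed in its last column
//   a.productQ('L', 'N', &c);  // c := Q.c, for any 'c' with c.rows == m
//
// 'a' no longer stores Q explicitly; productQ() below applies it.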

// aFull <- aFull.bTri^T with aFull = this and bTri an upper triangular matrix
template<typename T>
void ScalarArray<T>::myTrmm(const ScalarArray<T>* bTri) {
  DECLARE_CONTEXT;
  int mm = rows;
  int n = rows;
  T alpha = Constants<T>::pone;
  const T *bData = bTri->const_ptr();
  HMAT_ASSERT(bTri->lda == bTri->rows);
  HMAT_ASSERT(lda == rows);
  {
    size_t m_ = mm;
    size_t nn = n;
    size_t multiplications = m_ * nn * (nn + 1) / 2;
    size_t additions = m_ * nn * (nn - 1) / 2;
    increment_flops(Multipliers<T>::mul * multiplications + Multipliers<T>::add * additions);
  }
  proxy_cblas::trmm('R', 'U', 'T', 'N', mm, n, alpha, bData, bTri->lda, ptr(), lda);
}

template<typename T>
int ScalarArray<T>::productQ(char side, char trans, ScalarArray<T>* c) const {
  DECLARE_CONTEXT;
  Timeline::Task t(Timeline::PRODUCTQ, &cols, &c->rows, &c->cols);
  (void)t;
  assert((side == 'L') ? rows == c->rows : rows == c->cols);
  int info;
  int workSize;
  T workSize_req;
  {
    size_t _m = c->rows, _n = c->cols, _k = cols;
    size_t muls = 2 * _m * _n * _k - _n * _k * _k + 2 * _n * _k;
    size_t adds = 2 * _m * _n * _k - _n * _k * _k + _n * _k;
    increment_flops(Multipliers<T>::mul * muls + Multipliers<T>::add * adds);
  }

  // In qrDecomposition(), tau is stored in the last column of 'this'.
  // It is not valid to work with 'tau' inside the array 'a' because zunmqr modifies 'a'
  // during the computation, so we work on a copy of tau.
  std::vector<T> tau(std::min(rows, cols));
  memcpy(tau.data(), const_ptr(0, cols - 1), sizeof(T) * tau.size());

  // We don't use c->ptr() on purpose, because c->is_ortho is preserved here (Q is orthogonal)
  info = proxy_lapack_convenience::or_un_mqr(side, trans, c->rows, c->cols, cols, const_ptr(), lda, tau.data(), c->m, c->lda, &workSize_req, -1);
  HMAT_ASSERT(!info);
  workSize = (int) hmat::real(workSize_req) + 1;

  // If the workspace query of 'or_un_mqr' does not give us a large enough
  // size, raise it to the number of rows or columns of the matrix,
  // depending on the value of 'side'.
  if(side == 'L') {
    workSize = workSize < c->rows ? c->rows : workSize;
  } else if(side == 'R') {
    workSize = workSize < c->cols ? c->cols : workSize;
  }

  T* work = new T[workSize];
  HMAT_ASSERT(work);
  info = proxy_lapack_convenience::or_un_mqr(side, trans, c->rows, c->cols, cols, const_ptr(), lda, tau.data(), c->m, c->lda, work, workSize);
  HMAT_ASSERT(!info);
  delete[] work;
  return 0;
}

template<typename T> int ScalarArray<T>::modifiedGramSchmidt(ScalarArray<T> *result, double prec, int initialPivot) {
  DECLARE_CONTEXT;
  Timeline::Task t(Timeline::MGS, &rows, &cols, &initialPivot);
  (void)t;

  static char *useInitPivot = getenv("HMAT_MGS_INITPIV");
  if (!useInitPivot) initialPivot = 0;
  assert(initialPivot >= 0 && initialPivot <= cols);

  {
    size_t mm = rows;
    size_t n = cols;
    size_t multiplications = mm * n * n;
    size_t additions = mm * n * n;
    increment_flops(Multipliers<T>::mul * multiplications + Multipliers<T>::add * additions);
  }
  int rank = 0;
  double relative_epsilon;
  static const double LOWEST_EPSILON = 1.0e-6;

  const int original_rank(result->rows);
  assert(original_rank == result->cols);
  assert(original_rank >= cols);
  int* perm = new int[original_rank];
  for(int k = 0; k < original_rank; ++k) {
    perm[k] = k;
  }
  // Temporary arrays
  ScalarArray<T> r(original_rank, original_rank);
  Vector<T> buffer(std::max(original_rank, rows));

  // Lower threshold for the relative precision
  if(prec < LOWEST_EPSILON) {
    prec = LOWEST_EPSILON;
  }

  // Apply the modified Gram-Schmidt process with column pivoting to the first 'initialPivot'
  // columns, where the j-th pivot is column j.
  // No stopping criterion here: since these columns come from another recompression, we assume
  // they are all kept.
  // With this approach we can use BLAS 3 (more computationally efficient), but we do a classical
  // (not a "modified") Gram-Schmidt algorithm, which is known to be numerically unstable,
  // so expect to lose some final accuracy in hmat.

  // Initial pivot
  if (initialPivot) {
    // Modify the columns [initialPivot, cols[ to make them orthogonal to the columns [0, initialPivot[
    orthoColumns(&r, initialPivot);
    rank = initialPivot;
  }

  // Init.
  Vector<double> norm2(cols);        // Norm^2 of the columns of 'this' during the computation
  Vector<double> norm2_orig(cols);   // Norm^2 of the original columns of 'this'
  Vector<double> norm2_update(cols); // Norm^2 of the normalized columns of 'this' (1. at the beginning, then decreasing)
  // The actual norm^2 of column j during the computation (before it is finally normalized) is norm2[j] = norm2_orig[j] * norm2_update[j]
  // The choice of the pivot is based on norm2_update[] or norm2[]
  relative_epsilon = 0.0;
  for(int j = rank; j < cols; ++j) {
    const Vector<T> aj(*this, j);
    norm2[j] = aj.normSqr();
    relative_epsilon = std::max(relative_epsilon, norm2[j]);
    norm2_orig[j] = norm2[j];
    norm2_update[j] = 1.0;
    if(norm2_orig[j] == 0) { // Neutralize the null columns
      norm2_orig[j] = 1.;
      norm2_update[j] = 0.0;
    }
  }
  relative_epsilon *= prec * prec;

  // Modified Gram-Schmidt process with column pivoting
  for(int j = rank; j < cols; ++j) {
    // Find the largest pivot
    int pivot = norm2.absoluteMaxIndex(j);
    double pivmax = norm2[pivot];

    static char *newPivot = getenv("HMAT_MGS_ALTPIV");
    if (newPivot) {
      pivot = norm2_update.absoluteMaxIndex(j);
      pivmax = norm2_update[pivot];
      relative_epsilon = prec * prec;
    }
    if (j < initialPivot) {
      pivot = j;
      pivmax = 1.;
    }

    // Stopping criterion
    if (pivmax > relative_epsilon) {
      ++rank;

      // Pivoting
      if (j != pivot) {
        std::swap(perm[j], perm[pivot]);
        std::swap(norm2[j], norm2[pivot]);
        std::swap(norm2_orig[j], norm2_orig[pivot]);
        std::swap(norm2_update[j], norm2_update[pivot]);

        // Exchange the columns 'j' and 'pivot' of this[], using buffer as temporary space
        memcpy(buffer.ptr(), const_ptr(0, j), rows * sizeof(T));
        memcpy(ptr(0, j), const_ptr(0, pivot), rows * sizeof(T));
        memcpy(ptr(0, pivot), buffer.const_ptr(), rows * sizeof(T));

        // Idem for r[]
        memcpy(buffer.ptr(), r.const_ptr(0, j), cols * sizeof(T));
        memcpy(r.ptr(0, j), r.const_ptr(0, pivot), cols * sizeof(T));
        memcpy(r.ptr(0, pivot), buffer.const_ptr(), cols * sizeof(T));
      }

      // Normalization of qj
      r.get(j,j) = sqrt(norm2[j]);
      Vector<T> aj(*this, j);
      T coef = Constants<T>::pone / r.get(j,j);
      aj.scale(coef);

      // Remove the qj-component from the vectors bk (k = j+1, ..., n-1)
      if (j < cols - 1) {
        int firstcol = std::max(j + 1, initialPivot);
        ScalarArray<T> bK(*this, 0, rows, firstcol, cols - firstcol); // All the columns of 'this' after column 'firstcol'
        ScalarArray<T> aj_bK(r, j, 1, firstcol, cols - firstcol); // In 'r': row 'j', all the columns after column 'firstcol'
        // Compute in one operation all the scalar products between aj and a_firstcol, ..., a_n-1
        aj_bK.gemm('C', 'N', Constants<T>::pone, &aj, &bK, Constants<T>::zero);
        // Update a_firstcol, ..., a_n-1
        bK.rankOneUpdateT(Constants<T>::mone, aj, aj_bK);
        // Update the norms
        for(int k = firstcol; k < cols; ++k) {
          double rjk = std::abs(r.get(j,k));
          norm2[k] -= rjk * rjk;
          norm2_update[k] -= rjk * rjk / norm2_orig[k];
        }
      }
    } else
      break;
  } // for(int j = rank; ...)

  // Update the matrix dimensions
  cols = rank;
  result->rows = rank;

  // Apply perm to result
  for(int j = 0; j < result->cols; ++j) {
    // Copy the column j of r into the column perm[j] of result (only 'rank' rows)
    memcpy(result->ptr(0, perm[j]), r.const_ptr(0, j), result->rows * sizeof(T));
  }
  setOrtho(1);
  // Clean up
  delete[] perm;
  /* End of the modified Gram-Schmidt process */
  return rank;
}
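
// A minimal usage sketch (the shapes are hypothetical): orthonormalizing
// the columns of an m x n array 'a' at relative precision 1e-4.
//
//   ScalarArray<D_t> r(n, n);  // must be at least n x n
//   int rank = a.modifiedGramSchmidt(&r, 1e-4, 0);
//   // 'a' is now m x rank with orthonormal columns, 'r' is rank x n, and
//   // the original 'a' is recovered (up to the requested precision) as the
//   // product a.r, the internal column pivoting being folded into 'r'.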

template<typename T>
void ScalarArray<T>::multiplyWithDiagOrDiagInv(const ScalarArray<T>* d, bool inverse, Side side) {
  assert(d);
  assert(side == Side::LEFT || (cols == d->rows));
  assert(side == Side::RIGHT || (rows == d->rows));
  assert(d->cols == 1);

  {
    const size_t _rows = rows, _cols = cols;
    increment_flops(Multipliers<T>::mul * _rows * _cols);
  }
  if (side == Side::LEFT) { // line i is multiplied by d[i] or 1/d[i]
    // TODO: test with scale() to see if it is better.
    if (inverse) {
      ScalarArray<T> *d2 = new ScalarArray<T>(rows, 1);
      for (int i = 0; i < rows; i++)
        d2->get(i) = Constants<T>::pone / d->get(i);
      d = d2;
    }
    for (int j = 0; j < cols; j++) {
      for (int i = 0; i < rows; i++) {
        get(i, j) *= d->get(i);
      }
    }
    if (inverse) delete d;
  } else { // column j is multiplied by d[j] or 1/d[j]
    for (int j = 0; j < cols; j++) {
      T diag_val = inverse ? Constants<T>::pone / d->get(j, 0) : d->get(j);
      proxy_cblas::scal(rows, diag_val, ptr(0, j), 1);
    }
  }
}

template<typename T>
void ScalarArray<T>::multiplyWithDiag(const ScalarArray<double>* d) {
  assert(d);
  assert(cols <= d->rows); // d can be larger than needed
  assert(d->cols == 1);

  {
    const size_t _rows = rows, _cols = cols;
    increment_flops(Multipliers<T>::mul * _rows * _cols);
  }
  for (int j = 0; j < cols; j++) {
    T diag_val = T(d->get(j));
    proxy_cblas::scal(rows, diag_val, m + ((size_t) j) * lda, 1); // We don't use ptr() on purpose, because is_ortho is preserved here
  }
}

template<typename T>
bool ScalarArray<T>::testOrtho() const {
  static char *test = getenv("HMAT_TEST_ORTHO");
  // code % 2 == 0 means we are in single precision (real or complex)
  static double machine_accuracy = Constants<T>::code % 2 == 0 ? 1.19e-7 : 1.11e-16;
  static double test_accuracy = Constants<T>::code % 2 == 0 ? 1.e-3 : 1.e-7;
  static double ratioMax = 0.;
  double ref = norm();
  if (ref == 0.) return true;
  ScalarArray<T> *sp = new ScalarArray<T>(cols, cols);
  // Compute the scalar products sp = X^H.X
  sp->gemm('C', 'N', Constants<T>::pone, this, this, Constants<T>::zero);
  // Nullify the diagonal elements
  for (int i = 0; i < cols; i++)
    sp->get(i,i) = Constants<T>::zero;
  // The norm of the remainder should be below 'epsilon x norm of this' for the
  // columns to be considered orthogonal, in which case we return true
  double res = sp->norm();
  delete sp;
  if (test) {
    double ratio = res / ref / machine_accuracy / sqrt((double)rows);
    if (ratio > ratioMax) {
      ratioMax = ratio;
      printf("testOrtho[%dx%d] test=%d get=%d res=%g ref=%g res/ref=%g ratio=%g ratioMax=%g\n",
             rows, cols, (res < ref * test_accuracy), getOrtho(), res, ref, res/ref, ratio, ratioMax);
    }
  }
  return (res < ref * test_accuracy);
}

template<typename T>
T Vector<T>::dot(const Vector<T>* x, const Vector<T>* y) {
  assert(x->cols == 1);
  assert(y->cols == 1);
  assert(x->rows == y->rows);
  // TODO: beware of large vectors (>2 billion elements)!
  return proxy_cblas_convenience::dot_c(x->rows, x->const_ptr(), 1, y->const_ptr(), 1);
}

template<typename T>
int Vector<T>::absoluteMaxIndex(int startIndex) const {
  assert(this->cols == 1);
  return startIndex + proxy_cblas::i_amax(this->rows - startIndex, this->const_ptr() + startIndex, 1);
}

// Explicit template instantiations
template class ScalarArray<S_t>;
template class ScalarArray<D_t>;
template class ScalarArray<C_t>;
template class ScalarArray<Z_t>;

template class Vector<S_t>;
template class Vector<D_t>;
template class Vector<C_t>;
template class Vector<Z_t>;
}  // end namespace hmat