1 /* 2 * Copyright (C) 2015 the FFLAS-FFPACK group 3 * Written by Brice Boyer (briceboyer) <boyer.brice@gmail.com> 4 * 5 * This file is Free Software and part of FFLAS-FFPACK. 6 * 7 * ========LICENCE======== 8 * This file is part of the library FFLAS-FFPACK. 9 * 10 * FFLAS-FFPACK is free software: you can redistribute it and/or modify 11 * it under the terms of the GNU Lesser General Public 12 * License as published by the Free Software Foundation; either 13 * version 2.1 of the License, or (at your option) any later version. 14 * 15 * This library is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * Lesser General Public License for more details. 19 * 20 * You should have received a copy of the GNU Lesser General Public 21 * License along with this library; if not, write to the Free Software 22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 23 * ========LICENCE======== 24 * 25 */ 26 27 28 #include "fflas-ffpack/utils/timer.h" 29 #include "fflas-ffpack/fflas/fflas.h" 30 #include "fflas-ffpack/fflas-ffpack-config.h" 31 #include "fflas-ffpack/utils/test-utils.h" 32 #include "assert.h" 33 #include "fflas-ffpack/utils/args-parser.h" 34 #include "fflas-ffpack/utils/flimits.h" 35 36 #include <givaro/udl.h> 37 38 // using namespace FFPACK; 39 #define NEWWINO 40 // #define NOTRANDOM 41 // 42 #define DIVIDE_INTO(x,y) (((x) + (y) - 1)/(y)) 43 44 const int algos = 6 ; 45 const int algos_k = 2 ; 46 47 using Givaro::Modular; 48 using Givaro::ModularBalanced; 49 using Givaro::Timer; 50 using FFLAS::FieldTraits; 51 typedef std::vector<Timer> time_v ; 52 typedef std::vector<int> int_v ; 53 54 const int selec[] = { 55 0 56 ,1 57 ,2 58 ,3 59 ,4 60 ,5 61 }; 62 63 const int selec_k[] = { 64 0 65 ,1 66 }; 67 68 const char * descr[] = { 69 "322 low mem" 70 , "322 first 1" 71 , "322 4 tmp " 72 , "223 low mem" 73 , "232 first 1" 74 , "232 all tmp" 75 , "comp left " 76 , "comp right " 77 // , "322 sqrt " 78 }; 79 80 const char * descr_k[] = { 81 "comp left " 82 , "comp right " 83 }; 84 85 namespace FFLAS { /* compression */ 86 87 template<class Elem, int Num> 88 struct Packer ; 89 90 template<> 91 struct Packer<double,2> { 92 uint64_t bits = (limits<double>::digits()/2) ; 93 double base = (double) (1_ui64 << bits) ; 94 uint64_t mask = (1_ui64 << bits) - 1_ui64 ; 95 96 template<class T> 97 void accu(double * p, T * w) { 98 *p *= base ; 99 *p += (double)*w ; 100 } 101 } ; 102 103 104 /* ****** */ 105 /* pack */ 106 /* ****** */ 107 108 /* pack nb words (a,b,c) -> [a|b|c] */ 109 template<class wide_T, class pack_T, int Nb> 110 void pack_word( pack_T * packed, 111 const wide_T * words, int32_t stride, 112 Packer<pack_T,Nb> & packer) ; 113 114 115 template<class wide_T> 116 void pack_word/*<wide_T,double,2>*/( double * packed, 117 const wide_T * words, int32_t stride, 118 Packer<double,2> & packer) 119 { 120 // std::cout << "pack " << *words << '+' << *(words+stride) << " * " << (uint64_t) packer.base << " = "; 121 // words += stride ; 122 *packed = (double) *words ; 123 words += stride ; 124 packer.accu(packed,words); 125 // std::cout << (uint64_t) *packed << std::endl; 126 } 127 128 /* pack nb words (a,b) -> [a|b|0] filling with zeros */ 129 template<class wide_T, class pack_T, int Nb> 130 void pack_word_part( pack_T * packed, int32_t nb, 131 const wide_T * words, int32_t stride, 132 Packer<pack_T,Nb> & packer) ; 133 134 template<class wide_T> 135 void pack_word_part/* <wide_T,double,2> */( double * packed, int32_t nb, 136 const wide_T * words, int32_t stride, 137 Packer<double,2> & packer) 138 { 139 assert(nb == 1); 140 *packed = (double) *words ; 141 // words += stride ; 142 // packer.accu(packed,words); 143 *packed *= packer.base ; 144 } 145 146 /* ****** */ 147 /* unpack */ 148 /* ****** */ 149 150 template<class wide_T, class pack_T, int Nb> 151 void unpack_word( wide_T * words, int32_t stride, 152 const pack_T * packed, 153 Packer<pack_T,Nb> & packer); 154 155 template<class wide_T> 156 void unpack_word/* <wide_T,double,2> */( wide_T * words, int32_t stride, 157 const double * packed, 158 Packer<double ,2> & packer) 159 { 160 uint64_t pck = (uint64_t) *packed ; 161 words += stride ; 162 *words = (double) (pck & packer.mask) ; 163 words -= stride ; 164 pck >>= packer.bits ; 165 *words = (double) pck /* & packer.mask */ ; 166 } 167 168 169 template<class wide_T, class pack_T, int Nb> 170 void unpack_word_part( wide_T * words, int32_t stride, 171 const pack_T * packed, int32_t nb, 172 Packer<pack_T,Nb> & packer); 173 174 template<class wide_T> 175 void unpack_word_part/* <wide_T,double,2> */( wide_T * words, int32_t stride, 176 const double * packed, int32_t nb, 177 Packer<double,2> & packer) 178 { 179 assert(nb == 1); 180 words += stride ; 181 *words = 0 ; 182 words -= stride ; 183 uint64_t pck = (uint64_t) *packed ; 184 pck >>= packer.bits ; 185 *words = (double)pck /* & packer.mask */ ; 186 } 187 188 /* ****** */ 189 /* pack */ 190 /* ****** */ 191 192 template<class wide_T, class pack_T, int Nb, bool row_packed> 193 void pack_matrix( pack_T * packed, int32_t row_p, int32_t col_p, int32_t ldm_p, 194 const wide_T * elemts, int32_t row_e, int32_t col_e, int32_t ldm_e, 195 Packer<pack_T,Nb> & packer) 196 { 197 if (row_packed == true) { 198 for (int32_t i = 0 ; i < row_e ; i++ ) { 199 const wide_T * e_p = elemts + i * ldm_e ; 200 pack_T * p_p = packed + i * ldm_p ; 201 int32_t j = 0 ; 202 for ( ; j < col_e/Nb*Nb ; j+=Nb, e_p+=Nb, p_p++) { 203 pack_word<wide_T>(p_p,e_p,1,packer); 204 205 } 206 if (j < col_e) 207 pack_word_part<wide_T>(p_p,col_e-j,e_p,1,packer); 208 } 209 } 210 else { /* col_packed */ 211 int32_t i = 0 ; 212 int32_t ii = 0 ; 213 for ( ; i < row_e/Nb*Nb ; i += Nb , ii++) { 214 const wide_T * e_p = elemts + i * ldm_e ; 215 pack_T * p_p = packed + ii * ldm_p ; 216 for (int32_t j = 0 ; j < col_e ; j++, e_p++, p_p++) { 217 pack_word<wide_T>(p_p,e_p,ldm_e,packer); 218 219 } 220 } 221 if (i < row_e) 222 pack_word_part<wide_T>(packed+i*ldm_p,row_e-i,elemts+ii*ldm_e,ldm_e,packer); 223 224 } 225 } 226 227 /* ****** */ 228 /* unpack */ 229 /* ****** */ 230 231 template<class wide_T, class pack_T, int Nb, bool row_packed> 232 void unpack_matrix( wide_T * elemts, int32_t row_e, int32_t col_e, int32_t ldm_e, 233 const pack_T * packed, int32_t row_p, int32_t col_p, int32_t ldm_p, 234 Packer<pack_T,Nb> & packer) 235 { 236 if (row_packed == true) { 237 for (int32_t i = 0 ; i < row_e ; i++ ) { 238 wide_T * e_p = elemts + i * ldm_e ; 239 const pack_T * p_p = packed + i * ldm_p ; 240 int32_t j = 0 ; 241 for ( ; j < col_e/Nb*Nb ; j+=Nb, e_p+=Nb, p_p++) { 242 unpack_word<wide_T>(e_p,1,p_p,packer); 243 244 } 245 if (j < col_e) 246 unpack_word_part<wide_T>(e_p,1,p_p,col_e-j,packer); 247 } 248 } 249 else { /* col_packed */ 250 int32_t i = 0 ; 251 int32_t ii = 0 ; 252 for ( ; i < row_e/Nb*Nb ; i += Nb , ii++) { 253 wide_T * e_p = elemts + i * ldm_e ; 254 const pack_T * p_p = packed + ii * ldm_p ; 255 for (int32_t j = 0 ; j < col_e ; j++, e_p++, p_p++) { 256 unpack_word<wide_T>(e_p,ldm_e,p_p,packer); 257 258 } 259 } 260 if (i < row_e) 261 unpack_word_part<wide_T>(elemts+i*ldm_e,ldm_e,packed+ii*ldm_p,row_e-i,packer); 262 263 } 264 } 265 266 /* compress A */ 267 template<class Field, bool left_compress > 268 void 269 fgemm_compressed(const Field & F, 270 int m, int n, int k, 271 const typename Field::Element * A, int lda, 272 const typename Field::Element * B, int ldb, 273 typename Field::Element * C, int ldc 274 ) 275 { 276 Givaro::ZRing<double> NoField; 277 double * A_k, * B_k, * C_k ; 278 279 typedef typename Field::Element elem_t ; 280 Packer<elem_t,2> packer ; 281 282 int m_k = m , n_k = n , lda_k = lda, ldb_k = ldb, ldc_k = ldc ; 283 if (left_compress) { 284 m_k = DIVIDE_INTO(m,2)*2 ; 285 lda_k = m_k ; 286 ldc_k = n ; 287 288 A_k = FFLAS::fflas_new<double>(m_k*k) ; 289 //!@bug don't zero all, just the "border" 290 FFLAS::fzero(NoField,m_k,k,A_k,k); 291 292 B_k = const_cast<typename Field::Element *>(B) ; 293 294 pack_matrix<elem_t,elem_t,2,false>(A_k,m_k,k,lda_k, 295 A,m,k,lda, 296 packer); 297 } 298 else { 299 n_k = DIVIDE_INTO(n,2)*2 ; 300 ldb_k = n_k ; 301 ldc_k = n_k ; 302 303 A_k = const_cast<typename Field::Element *>(A) ; 304 B_k = FFLAS::fflas_new<double>(k*n_k) ; 305 //!@bug don't zero all, just the "border" 306 FFLAS::fzero(NoField,k,n_k,B_k,n_k); 307 308 pack_matrix<elem_t,elem_t,2,true>(B_k,k,n_k,ldb_k, 309 B,k,n,ldb, 310 packer); 311 } 312 313 C_k = FFLAS::fflas_new<double>(m_k*n_k) ; 314 //!@bug don't zero all, just the "border" 315 FFLAS::fzero(NoField,m_k,n_k,C_k,n_k); 316 317 pack_matrix<elem_t,elem_t,2,!left_compress>(C_k,m_k,n_k,ldc_k, 318 C,m,n,ldc, 319 packer); 320 321 #if 0 322 double * C_e = FFLAS::fflas_new<double>(m*ldc); 323 unpack_matrix<elem_t,elem_t,2,!left_compress>(C_e,m,n,ldc, 324 C_k,m_k,n_k,ldc_k, 325 packer); 326 327 int faux = 0 ; 328 for (int i = 0 ; i < m ; ++i) { 329 for (int j = 0 ; j < n ; ++j) { 330 if (! (C[i*ldc+j] == C_e[i*ldc+j]) ) { 331 ++faux ; 332 } 333 } 334 } 335 if (faux) { 336 std::cout << "bad pack/unpack ; bad/all = " << faux << '/' << m*n << " ~~ " << (double)faux/(double)(m*n) << std::endl; 337 } 338 339 if (faux && (n<20)) { 340 std::cout << "IN " << std::endl; 341 for (int i = 0 ; i < m ; ++i) { 342 for (int j = 0 ; j < n ; ++j) 343 std::cout << C[i*ldc+j] << ' '; 344 std::cout << std::endl; 345 } 346 std::cout << "OUT" << std::endl; 347 for (int i = 0 ; i < m ; ++i) { 348 for (int j = 0 ; j < n ; ++j) 349 std::cout << C_e[i*ldc+j] << ' '; 350 std::cout << std::endl; 351 } 352 353 354 } 355 356 if (faux) 357 exit(-1); 358 #endif 359 360 361 362 363 Givaro::DoubleDomain G ; 364 365 fgemm(G,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans, 366 m_k,n_k,k, 1, A_k,lda_k, B_k,ldb_k, 0, C_k, ldc_k); 367 368 // cblas_dgemm(CblasRowMajor, CblasNoTrans,CblasNoTrans, 369 // m_k,n_k,k, 1, A_k,lda_k, B_k,ldb_k, 0, C_k, ldc_k); 370 371 372 unpack_matrix<elem_t,elem_t,2,!left_compress>(C,m,n,ldc, 373 C_k,m_k,n_k,ldc_k, 374 packer); 375 376 if (left_compress) 377 FFLAS::fflas_delete(A_k); 378 else 379 FFLAS::fflas_delete(B_k); 380 FFLAS::fflas_delete(C_k); 381 } 382 383 } 384 385 namespace FFLAS { /* tools */ 386 387 388 template<class Field> 389 void finit_fuzzy(Field & F, size_t m, size_t n, double * C, size_t ldc) 390 { 391 392 393 if (n == ldc) 394 // FFLAS::vectorised::modp<true,true>(C,C,m*n,p,invp,0,p-1); 395 FFLAS::vectorised::modp<Field,true>(F,C,m*n,C); 396 else 397 for (size_t i = 0 ; i < m ; ++i) 398 // FFLAS::vectorised::modp<true,true>(C+i*ldc,C+i*ldc,n,p,invp,0,p-1); 399 FFLAS::vectorised::modp<Field,true>(F,C+i*ldc,n,C+i*ldc); 400 } 401 402 403 // C = a*A + B 404 void add(const size_t m, const size_t n, 405 double a, 406 const double *A, const size_t lda, 407 const double *B, const size_t ldb, 408 double *C, const size_t ldc) 409 { 410 const double *Ai = A,*Bi = B; 411 double *Ci = C; 412 for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc) 413 for (size_t j = 0 ; j < n ; ++j) 414 Ci[j] = a * Ai[j] + Bi[j]; 415 } 416 417 // C = C-(A+B) 418 void subadd(const size_t m, const size_t n, 419 const double *A, const size_t lda, 420 const double *B, const size_t ldb, 421 double *C, const size_t ldc) 422 { 423 const double *Ai = A,*Bi = B; 424 double *Ci = C; 425 for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc) 426 for (size_t j = 0 ; j < n ; ++j) { 427 Ci[j] = Ci[j] - Ai[j] - Bi[j] ; 428 } 429 430 } 431 432 // C = -(A+B) 433 void negadd(const size_t m, const size_t n, 434 const double *A, const size_t lda, 435 const double *B, const size_t ldb, 436 double *C, const size_t ldc) 437 { 438 const double *Ai = A,*Bi = B; 439 double *Ci = C; 440 for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc) 441 for (size_t j = 0 ; j < n ; ++j) { 442 Ci[j] = - Ai[j] - Bi[j] ; 443 } 444 445 } 446 447 448 // C = C+A-B 449 void addsub(const size_t m, const size_t n, 450 const double *A, const size_t lda, 451 const double *B, const size_t ldb, 452 double *C, const size_t ldc) 453 { 454 const double *Ai = A,*Bi = B; 455 double *Ci = C; 456 for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc) 457 for (size_t j = 0 ; j < n ; ++j) { 458 Ci[j] = Ci[j] + Ai[j] - Bi[j] ; 459 } 460 461 } 462 463 464 // C = (C+B)/e 465 template<class Field> 466 void addscalinf(const Field & F, const size_t m, const size_t n, 467 const double *B, const size_t ldb, 468 double e, 469 double *C, const size_t ldc) 470 { 471 const double * Bi = B; 472 double * Ci = C; 473 for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb) 474 for (size_t j = 0 ; j < n ; ++j) 475 Ci[j]= (Ci[j]+Bi[j])*e ; 476 // F.init( Ci[j], (Ci[j]+Bi[j])/e ); 477 478 } 479 480 // C = (C-B)/e 481 template<class Field> 482 void subscalinf(const Field & F, const size_t m, const size_t n, 483 const double *B, const size_t ldb, 484 double e, 485 double *C, const size_t ldc) 486 { 487 const double * Bi = B; 488 double * Ci = C; 489 for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb) 490 for (size_t j = 0 ; j < n ; ++j) 491 Ci[j]= (Ci[j]-Bi[j])*e ; 492 // F.init( Ci[j], (Ci[j]-Bi[j])/e ); 493 494 } 495 496 // C = (D-B)/e 497 template<class Field> 498 void subscal(const Field & F, const size_t m, const size_t n, 499 const double *D, const size_t ldd, 500 const double *B, const size_t ldb, 501 double e, 502 double *C, const size_t ldc) 503 { 504 const double * Bi = B; 505 const double * Di = D; 506 double * Ci = C; 507 for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb, Di += ldd) 508 for (size_t j = 0 ; j < n ; ++j) 509 Ci[j] = (Di[j]-Bi[j])*e ; 510 511 } 512 513 // C = (D+B)/e 514 template<class Field> 515 void addscal(const Field & F, const size_t m, const size_t n, 516 const double *D, const size_t ldd, 517 const double *B, const size_t ldb, 518 double e, 519 double *C, const size_t ldc) 520 { 521 const double * Bi = B; 522 const double * Di = D; 523 double * Ci = C; 524 for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb, Di += ldd) 525 for (size_t j = 0 ; j < n ; ++j) 526 Ci[j] = (Di[j]+Bi[j])*e ; 527 528 } 529 530 // C = C + (D-B)/e 531 template<class Field> 532 void subscalacc(const Field & F, const size_t m, const size_t n, 533 const double *D, const size_t ldd, 534 const double *B, const size_t ldb, 535 double e, 536 double *C, const size_t ldc) 537 { 538 const double * Bi = B; 539 const double * Di = D; 540 double * Ci = C; 541 for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb, Di += ldd) 542 for (size_t j = 0 ; j < n ; ++j) 543 Ci[j] += (Di[j]-Bi[j])*e ; 544 545 } 546 547 #ifndef TRE 548 // #ifndef NDEBUG 549 // #define TRE 1 550 // #else 551 #define TRE (size_t)(__FFLASFFPACK_WINOTHRESHOLD) 552 // #define TRE (size_t)(__FFLASFFPACK_WINOTHRESHOLD*0.9) 553 // #endif 554 #endif 555 template<class Field> 556 double * gemm_fflas(const Field & F, 557 const size_t m, const size_t n, const size_t k, 558 const double *A, size_t lda, 559 const double *B, size_t ldb, 560 double * C, size_t ldc, 561 int rec = 0) 562 { 563 Givaro::DoubleDomain R; 564 FFLAS::fgemm(R, 565 FFLAS::FflasNoTrans,FFLAS::FflasNoTrans, 566 m,n,k, 567 1, 568 A,lda, B,ldb, 569 0, 570 C, ldc); 571 572 // cblas_dgemm(CblasRowMajor, CblasNoTrans,CblasNoTrans, 573 // m,n,k,1,A,lda,B,ldb,0,C,ldc); 574 575 return C; 576 } 577 } // FFLAS 578 579 namespace FFLAS { namespace Protected { namespace Rec { 580 581 // Field must be Givaro::Modular<double> 582 template<class Field> 583 double * 584 gemm_bini_322_0(const Field & F 585 , const size_t m 586 , const size_t n 587 , const size_t k 588 , const double *A , const size_t lda 589 , const double *B , const size_t ldb 590 , double *C , const size_t ldc 591 , int rec 592 , const double & epsilon 593 ) 594 { 595 Givaro::ZRing<double> NoField; 596 // const double p = (double)F.characteristic(); 597 size_t M = (n>m)?std::min(k,m):std::min(k,n); 598 // std::cout << rec << ',' << M << std::endl; 599 // Field G(p*p); 600 601 if ( M < TRE || rec <= 0) { 602 return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); 603 } 604 605 assert(k/2*2==k); // k divisible par 2 606 assert(n/2*2==n); // n divisible par 2 607 assert(m/3*3==m); // m divisible par 3 608 609 610 size_t n2 = n/2; 611 size_t k2 = k/2; 612 size_t m3 = m/3; 613 614 // std::cout << "€ = " << epsilon << std::endl; 615 616 // sub matrices in A 617 const double * A11 = A; 618 const double * A12 = A +k2; 619 const double * A21 = A +lda*m3; 620 const double * A22 = A21 +k2; 621 const double * A31 = A21 +lda*m3; 622 const double * A32 = A31 +k2; 623 624 // sub matrices in C 625 double * C11 = C; 626 double * C12 = C +n2; 627 double * C21 = C +ldc*m3; 628 double * C22 = C21 +n2; 629 double * C31 = C21 +ldc*m3; 630 double * C32 = C31 +n2; 631 632 // sub matrices in B 633 const double * B11 = B; 634 const double * B12 = B +n2; 635 const double * B21 = B +ldb*k2; 636 const double * B22 = B21 +n2; 637 638 FFLAS::fzero(NoField,m,n,C,ldc); 639 640 /* 641 * Algo : 642 * S1 := A11 +A22; 643 * S4 := e*A12+A22; 644 * S5 := A11 +e*A12; 645 * S6 := A21 +A32; 646 * S9 := A21 +e*A31; 647 * S10 := e*A31+A32; 648 * 649 * T1 := e*B11 +B22; 650 * T2 := B21 +B22; 651 * T4 := -e*B11+B21; 652 * T5 := e*B12 +B22; 653 * T6 := B11 +e*B22; 654 * T7 := B11 +B12; 655 * T9 := B12 -e*B22; 656 * T10 := B11 +e*B21; 657 * 658 * P1 := S1 *T1; 659 * P2 := A22*T2; 660 * P3 := A11*B22; 661 * P4 := S4 *T4; 662 * P5 := S5 *T5; 663 * P6 := S6 *T6; 664 * P7 := A21*T7; 665 * P8 := A32*B11; 666 * P9 := S9 *T9; 667 * P10:= S10*T10; 668 * 669 * C11 := (P1-P2-P3+P4)/e; 670 * C12 := (P3-P5)/(-e) ; 671 * C21 := P4+P6-P10 ; 672 * C22 := P1-P5+P9; 673 * C31 := (-P8+P10)/e; 674 * C32 := (P6-P7-P8+P9)/e; 675 * 676 */ 677 678 double * S1 = FFLAS::fflas_new<double>(m3*k2) ; 679 // double * C11t = FFLAS::fflas_new<double>(n2*m3) ; 680 // S1 := A11 +A22; 681 FFLAS::fadd(NoField,m3,k2,A11,lda,A22,lda,S1,k2); 682 // T1 := e*B11 +B22; 683 double * T1 = FFLAS::fflas_new<double>(n2*k2) ; // ou aire 684 add(k2,n2,epsilon,B11,ldb,B22,ldb,T1,n2); 685 // P1 := S1 *T1; (dans C22) 686 gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,C22,ldc,rec-1,epsilon); 687 // S4 := e*A12+A22; 688 double * eA12 = FFLAS::fflas_new<double >(m3*k2); 689 FFLAS::fscal(NoField,m3,k2,epsilon,A12,lda,eA12,k2) ; 690 FFLAS::fadd(NoField,m3,k2,eA12,k2,A22,lda,S1,k2); 691 // T4 := -e*B11+B21; 692 add(k2,n2,-epsilon,B11,ldb,B21,ldb,T1,n2); 693 // P4 := S4 *T4; (dans C21) 694 gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,C21,ldc,rec-1,epsilon); 695 // C11 = P1+P4 696 FFLAS::fadd(NoField,m3,n2,C21,ldc,C22,ldc,C11,ldc); 697 // T2 := B21 +B22; 698 FFLAS::fadd(NoField,k2,n2,B21,ldb,B22,ldb,T1,n2); 699 // P2 := A22*T2; 700 double * P1 = FFLAS::fflas_new<double>(n2*m3) ; // ou aire 701 gemm_bini_322_0(F,m3,n2,k2,A22,lda,T1,n2,P1,n2,rec-1,epsilon); 702 // P3 := A11*B22; (dans C12) 703 gemm_bini_322_0(F,m3,n2,k2,A11,lda,B22,ldb,C12,ldc,rec-1,epsilon); 704 // C11 -= (P2+P3) 705 subadd(m3,n2,P1,n2,C12,ldc,C11,ldc); 706 // S5 := A11 +e*A12; 707 FFLAS::fadd(NoField,m3,k2,eA12,k2,A11,lda,S1,k2); 708 // T5 := e*B12 +B22; 709 add(k2,n2,epsilon,B12,ldb,B22,ldb,T1,n2); 710 // P5 := S5 *T5; 711 double * P2 = FFLAS::fflas_new<double>(n2*m3) ; // ou aire 712 gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,P2,n2,rec-1,epsilon); 713 // C12 -= P5 714 subscalinf(NoField,m3,n2,P2,n2,-(double)1/epsilon,C12,ldc); 715 // S6 := A21 +A32; 716 FFLAS::fadd(NoField,m3,k2,A21,lda,A32,lda,S1,k2); 717 // T6 := B11 +e*B22; 718 add(k2,n2,epsilon,B22,ldb,B11,ldb,T1,n2); 719 // P6 := S6 *T6; dans C32 720 gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,C32,ldc,rec-1,epsilon); 721 // C21+= P6 722 FFLAS::faddin(NoField,m3,n2,C32,ldc,C21,ldc); 723 // T7 := B11 +B12; 724 FFLAS::fadd(NoField,k2,n2,B11,ldb,B12,ldb,T1,n2); 725 // P7 := A21*T7; !signe 726 gemm_bini_322_0(F,m3,n2,k2,A21,lda,T1,n2,P1,n2,rec-1,epsilon); 727 // P8 := A32*B11; dans C31 !signe 728 gemm_bini_322_0(F,m3,n2,k2,A32,lda,B11,ldb,C31,ldc,rec-1,epsilon); 729 // C32 -= P8+P7 730 subadd(m3,n2,P1,n2,C31,ldc,C32,ldc); 731 // S9 := A21 +e*A31; 732 double * eA31 = eA12 ; 733 FFLAS::fscal(NoField,m3,k2,epsilon,A31,lda,eA31,k2); 734 FFLAS::fadd(NoField,m3,k2,eA31,k2,A21,lda,S1,k2); 735 // T9 := B12 -e*B22; 736 add(k2,n2,-epsilon,B22,ldb,B12,ldb,T1,n2); 737 // P9 := S9 *T9; 738 gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,P1,n2,rec-1,epsilon); 739 // C32= (C32+P9)/p 740 addscalinf(NoField,m3,n2,P1,n2,(double)1/epsilon,C32,ldc); 741 // C22+= P9-P5 742 addsub(m3,n2,P1,n2,P2,n2,C22,ldc); 743 FFLAS::fflas_delete( P2); 744 // S10 := e*A31+A32; 745 FFLAS::fadd(NoField,m3,k2,eA31,k2,A32,lda,S1,k2); 746 FFLAS::fflas_delete( eA12 ); 747 // T10 := B11 +e*B21; 748 add(k2,n2,epsilon,B21,ldb,B11,ldb,T1,n2); 749 // P10:= S10*T10; 750 gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,P1,n2,rec-1,epsilon); 751 FFLAS::fflas_delete( S1); 752 FFLAS::fflas_delete( T1); 753 // C21-= P10 754 FFLAS::fsubin(NoField,m3,n2,P1,n2,C21,ldc); 755 // C31= (C31-P10)/(-epsilon) 756 subscalinf(NoField,m3,n2,P1,n2,-(double)1/epsilon,C31,ldc); 757 FFLAS::fflas_delete( P1); 758 // C11 := (P1+P-P3+P4)/e; 759 FFLAS::fscalin(NoField,m3,n2,(double)1/epsilon,C11,ldc); 760 761 return C; 762 763 } 764 765 // Field must be Givaro::Modular<double> 766 template<class Field> 767 double * 768 gemm_bini_322_mem(const Field & F 769 , const size_t m 770 , const size_t n 771 , const size_t k 772 , const double *A , const size_t lda 773 , const double *B , const size_t ldb 774 , double *C , const size_t ldc 775 , int rec 776 , const double & epsilon 777 ) 778 { 779 Givaro::ZRing<double> NoField; 780 // const double p = (double)F.characteristic(); 781 size_t M = (n>m)?std::min(k,m):std::min(k,n); 782 // std::cout << rec << ',' << M << std::endl; 783 // Field G(p*p); 784 785 if ( M < TRE || rec <= 0) { 786 // std::cout << "ffw" << std::endl; 787 return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); 788 // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); 789 } 790 791 assert(k/2*2==k); // k divisible par 2 792 assert(n/2*2==n); // n divisible par 2 793 assert(m/3*3==m); // m divisible par 3 794 795 // std::cout << "tested" << std::endl; 796 797 size_t n2 = n/2; 798 size_t k2 = k/2; 799 size_t m3 = m/3; 800 801 // std::cout << "€ = " << epsilon << std::endl; 802 803 // sub matrices in A 804 const double * A11 = A; 805 const double * A12 = A +k2; 806 const double * A21 = A +lda*m3; 807 const double * A22 = A21 +k2; 808 const double * A31 = A21 +lda*m3; 809 const double * A32 = A31 +k2; 810 811 // sub matrices in C 812 double * C11 = C; 813 double * C12 = C +n2; 814 double * C21 = C +ldc*m3; 815 double * C22 = C21 +n2; 816 double * C31 = C21 +ldc*m3; 817 double * C32 = C31 +n2; 818 819 // sub matrices in B 820 const double * B11 = B; 821 const double * B12 = B +n2; 822 const double * B21 = B +ldb*k2; 823 const double * B22 = B21 +n2; 824 825 FFLAS::fzero(F,m,n,C,ldc); 826 827 /* 828 * Algo : 829 * S1 := A11 +A22; 830 * S4 := e*A12+A22; 831 * S5 := A11 +e*A12; 832 * S6 := A21 +A32; 833 * S9 := A21 +e*A31; 834 * S3 := e*A31+A32; 835 * 836 * T1 := e*B11 +B22; 837 * T2 := B21 +B22; 838 * T4 := -e*B11+B21; 839 * T5 := e*B12 +B22; 840 * T6 := B11 +e*B22; 841 * T7 := B11 +B12; 842 * T9 := B12 -e*B22; 843 * T3 := B11 +e*B21; 844 * 845 * P1 := S1 *T1; 846 * P2 := A22*T2; 847 * P10 := A11*B22; 848 * P4 := S4 *T4; 849 * P5 := S5 *T5; 850 * P6 := S6 *T6; 851 * P7 := A21*T7; 852 * P8 := A32*B11; 853 * P9 := S9 *T9; 854 * P3:= S3*T3; 855 * 856 * C11 := (P1-P2-P10+P4)/e; 857 * C12 := (P10-P5)/(-e) ; 858 * C21 := P4+P6-P3 ; 859 * C22 := P1-P5+P9; 860 * C31 := (-P8+P3)/e; 861 * C32 := (P6-P7-P8+P9)/e; 862 * 863 */ 864 865 866 // P10 867 gemm_bini_322_mem(F,m3,n2,k2,A11,lda,B22,ldb,C11,ldc,rec-1,epsilon); 868 // S5 869 double * X = FFLAS::fflas_new<double>(m3*k2); 870 add(m3,k2,epsilon,A12,lda,A11,lda,X,k2); 871 // T5 872 // double * Y = FFLAS::fflas_new<double>(std::max(k2,m3)*n2); 873 double * Y = FFLAS::fflas_new<double>(k2*n2); 874 add(k2,n2,epsilon,B12,ldb,B22,ldb,Y,n2); 875 // P5 876 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C22,ldc,rec-1,epsilon); 877 // C12 878 subscal(NoField,m3,n2,C22,ldc,C11,ldc,(double)1/epsilon,C12,ldc); 879 // T2 880 FFLAS::fadd(NoField,k2,n2,B21,ldb,B22,ldb,Y,n2); 881 // P2 882 gemm_bini_322_mem(F,m3,n2,k2,A22,lda,Y,n2,C31,ldc,rec-1,epsilon); 883 // C11 884 FFLAS::faddin(NoField,m3,n2,C31,ldc,C11,ldc); 885 // S1 886 FFLAS::fadd(NoField,m3,k2,A11,lda,A22,lda,X,k2); 887 // T1 888 add(k2,n2,epsilon,B11,ldb,B22,ldb,Y,n2); 889 // P1 890 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C21,ldc,rec-1,epsilon); 891 // C22 892 FFLAS::fsub(NoField,m3,n2,C21,ldc,C22,ldc,C22,ldc); 893 // C11 894 FFLAS::fsub(NoField,m3,n2,C21,ldc,C11,ldc,C11,ldc); 895 // S4 896 add(m3,k2,epsilon,A12,lda,A22,lda,X,k2); 897 // T4 898 add(k2,n2,-epsilon,B11,ldb,B21,ldb,Y,n2); 899 // P4 900 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C21,ldc,rec-1,epsilon); 901 // C11 902 addscalinf(NoField,m3,n2,C21,ldc,(double)1/epsilon,C11,ldc); 903 // S9 904 add(m3,k2,epsilon,A31,lda,A21,lda,X,k2); 905 // T9 906 add(k2,n2,-epsilon,B22,ldb,B12,ldb,Y,n2); 907 // P9 908 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C32,ldc,rec-1,epsilon); 909 // C22 910 FFLAS::faddin(NoField,m3,n2,C32,ldc,C22,ldc); 911 // S6 912 FFLAS::fadd(NoField,m3,k2,A21,lda,A32,lda,X,k2); 913 // T6 914 add(k2,n2,epsilon,B22,ldb,B11,ldb,Y,n2); 915 // P6 916 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C31,ldc,rec-1,epsilon); 917 // C21 918 FFLAS::faddin(NoField,m3,n2,C31,ldc,C21,ldc); 919 // C32 920 FFLAS::faddin(NoField,m3,n2,C31,ldc,C32,ldc); 921 // T7 922 FFLAS::fadd(NoField,k2,n2,B11,ldb,B12,ldb,Y,n2); 923 // P7 924 gemm_bini_322_mem(F,m3,n2,k2,A21,lda,Y,n2,C31,ldc,rec-1,epsilon); 925 // if (epsilon > 1 && rec == 2) { FFLAS::finit(G,m3,n2,C31,ldc);} 926 // C32 927 FFLAS::fsubin(NoField,m3,n2,C31,ldc,C32,ldc); 928 // S3 929 add(m3,k2,epsilon,A31,lda,A32,lda,X,k2); 930 // T3 931 add(k2,n2,epsilon,B21,ldb,B11,ldb,Y,n2); 932 // P3 933 gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C31,ldc,rec-1,epsilon); 934 FFLAS::fflas_delete( X); 935 FFLAS::fflas_delete( Y ); 936 // C21 937 FFLAS::fsubin(NoField,m3,n2,C31,ldc,C21,ldc); 938 // P8 939 Y = FFLAS::fflas_new<double>(m3*n2); 940 gemm_bini_322_mem(F,m3,n2,k2,A32,lda,B11,ldb,Y,n2,rec-1,epsilon); 941 // C31 942 subscalinf(NoField,m3,n2,Y,n2,(double)1/epsilon,C31,ldc); 943 // FFLAS::fsubin(NoField,m3,n2,Y,n2,C31,ldc); 944 // C32 945 subscalinf(NoField,m3,n2,Y,n2,(double)1/epsilon,C32,ldc); 946 // FFLAS::fsubin(NoField,m3,n2,Y,n2,C32,ldc); 947 // FFLAS::fscalin(NoField,m3,n,(double)1/epsilon,C31,ldc); 948 FFLAS::fflas_delete( Y ); 949 950 951 return C; 952 953 } 954 955 // Field must be Givaro::Modular<double> 956 template<class Field> 957 double * 958 gemm_bini_223_mem(const Field & F 959 , const size_t m 960 , const size_t n 961 , const size_t k 962 , const double *A , const size_t lda 963 , const double *B , const size_t ldb 964 , double *C , const size_t ldc 965 , int rec 966 , const double & epsilon 967 ) 968 { 969 Givaro::ZRing<double> NoField; 970 // const double p = (double)F.characteristic(); 971 size_t M = (n>m)?std::min(k,m):std::min(k,n); 972 // std::cout << rec << ',' << M << std::endl; 973 // Field G(p*p); 974 975 if ( M < TRE || rec <= 0) { 976 // std::cout << "ffw" << std::endl; 977 return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); 978 // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); 979 } 980 981 assert(k/2*2==k); // k divisible par 2 982 assert(n/3*3==n); // n divisible par 2 983 assert(m/2*2==m); // m divisible par 3 984 985 // std::cout << "tested" << std::endl; 986 987 size_t m2 = m/2; 988 size_t k2 = k/2; 989 size_t n3 = n/3; 990 991 // std::cout << "€ = " << epsilon << std::endl; 992 993 // sub matrices in A 994 const double * A11 = A; 995 const double * A12 = A +k2; 996 const double * A21 = A +lda*m2; 997 const double * A22 = A21 +k2; 998 999 // sub matrices in C 1000 double * C11 = C; 1001 double * C12 = C +n3; 1002 double * C13 = C +2*n3; 1003 double * C21 = C +ldc*m2; 1004 double * C22 = C21 +n3; 1005 double * C23 = C21 +2*n3; 1006 1007 1008 1009 // sub matrices in B 1010 const double * B11 = B; 1011 const double * B12 = B +n3; 1012 const double * B13 = B +2*n3; 1013 const double * B21 = B +ldb*k2; 1014 const double * B22 = B21 +n3; 1015 const double * B23 = B21 +2*n3; 1016 1017 1018 FFLAS::fzero(F,m,n,C,ldc); 1019 1020 /* 1021 * Algo : 1022 * S1 := B11 +B22; 1023 * S4 := e*B21+B22; 1024 * S5 := B11 +e*B21; 1025 * S6 := B12 +B23; 1026 * S9 := B12 +e*B13; 1027 * S3 := e*B13+B23; 1028 * 1029 * T1 := e*A11 +A22; 1030 * T2 := A12 +A22; 1031 * T4 := -e*A11+A12; 1032 * T5 := e*A21 +A22; 1033 * T6 := A11 +e*A22; 1034 * T7 := A11 +A21; 1035 * T9 := A21 -e*A22; 1036 * T3 := A11 +e*A12; 1037 * 1038 * P1 := S1 *T1; 1039 * P2 := T2 * B22; 1040 * P10 := A22 * B11; 1041 * P4 := S4 *T4; 1042 * P5 := S5 *T5; 1043 * P6 := S6 *T6; 1044 * P7 := T7*B12; 1045 * P8 := A11*B23; 1046 * P9 := S9 *T9; 1047 * P3 := S3*T3; 1048 * 1049 * C11 := (P1-P2-P10+P4)/e; 1050 * C21 := (P10-P5)/(-e) ; 1051 * C12 := P4+P6-P3 ; 1052 * C22 := P1-P5+P9; 1053 * C13 := (-P8+P3)/e; 1054 * C23 := (P6-P7-P8+P9)/e; 1055 * 1056 */ 1057 1058 1059 // P10 1060 gemm_bini_223_mem(F,m2,n3,k2,A22,lda,B11,ldb,C11,ldc,rec-1,epsilon); 1061 // S5 1062 double * Y = FFLAS::fflas_new<double>(k2*n3); 1063 add(k2,n3,epsilon,B21,ldb,B11,ldb,Y,n3); 1064 // T5 1065 double * X = FFLAS::fflas_new<double>(m2*k2); 1066 add(m2,k2,epsilon,A21,lda,A22,lda,X,k2); 1067 // P5 1068 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C22,ldc,rec-1,epsilon); 1069 // C12 1070 subscal(NoField,m2,n3,C22,ldc,C11,ldc,(double)1/epsilon,C21,ldc); 1071 // T2 1072 FFLAS::fadd(NoField,m2,k2,A12,lda,A22,lda,X,k2); 1073 // P2 1074 gemm_bini_223_mem(F,m2,n3,k2,X,k2,B22,ldb,C13,ldc,rec-1,epsilon); 1075 // C11 1076 FFLAS::faddin(NoField,m2,n3,C13,ldc,C11,ldc); 1077 // S1 1078 FFLAS::fadd(NoField,k2,n3,B11,ldb,B22,ldb,Y,n3); 1079 // T1 1080 add(m2,k2,epsilon,A11,lda,A22,lda,X,k2); 1081 // P1 1082 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon); 1083 // C22 1084 FFLAS::fsub(NoField,m2,n3,C12,ldc,C22,ldc,C22,ldc); 1085 // C11 1086 FFLAS::fsub(NoField,m2,n3,C12,ldc,C11,ldc,C11,ldc); 1087 // S4 1088 add(k2,n3,epsilon,B21,ldb,B22,ldb,Y,n3); 1089 // T4 1090 add(m2,k2,-epsilon,A11,lda,A12,lda,X,k2); 1091 // P4 1092 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon); 1093 // C11 1094 addscalinf(NoField,m2,n3,C12,ldc,(double)1/epsilon,C11,ldc); 1095 // S9 1096 add(k2,n3,epsilon,B13,ldb,B12,ldb,Y,n3); 1097 // T9 1098 add(m2,k2,-epsilon,A22,lda,A21,lda,X,k2); 1099 // P9 1100 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C23,ldc,rec-1,epsilon); 1101 // C22 1102 FFLAS::faddin(NoField,m2,n3,C23,ldc,C22,ldc); 1103 // S6 1104 FFLAS::fadd(NoField,k2,n3,B12,ldb,B23,ldb,Y,n3); 1105 // T6 1106 add(m2,k2,epsilon,A22,lda,A11,lda,X,k2); 1107 // P6 1108 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon); 1109 // C21 1110 FFLAS::faddin(NoField,m2,n3,C13,ldc,C12,ldc); 1111 // C32 1112 FFLAS::faddin(NoField,m2,n3,C13,ldc,C23,ldc); 1113 // T7 1114 FFLAS::fadd(NoField,m2,k2,A11,lda,A21,lda,X,k2); 1115 // P7 1116 gemm_bini_223_mem(F,m2,n3,k2,X,k2,B12,ldb,C13,ldc,rec-1,epsilon); 1117 // if (epsilon > 1 && rec == 2) { FFLAS::finit(G,m2,n3,C31,ldc);} 1118 // C32 1119 FFLAS::fsubin(NoField,m2,n3,C13,ldc,C23,ldc); 1120 // S3 1121 add(k2,n3,epsilon,B13,ldb,B23,ldb,Y,n3); 1122 // T3 1123 add(m2,k2,epsilon,A12,lda,A11,lda,X,k2); 1124 // P3 1125 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon); 1126 FFLAS::fflas_delete( Y ); 1127 FFLAS::fflas_delete( X ); 1128 // C21 1129 FFLAS::fsubin(NoField,m2,n3,C13,ldc,C12,ldc); 1130 // P8 1131 Y = FFLAS::fflas_new<double>(m2*n3); 1132 gemm_bini_223_mem(F,m2,n3,k2,A11,lda,B23,ldb,Y,n3,rec-1,epsilon); 1133 // C31 1134 subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C13,ldc); 1135 // C32 1136 subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C23,ldc); 1137 FFLAS::fflas_delete( Y ); 1138 1139 1140 return C; 1141 1142 } 1143 1144 // Field must be Givaro::Modular<double> 1145 template<class Field> 1146 double * 1147 gemm_bini_322_2(const Field & F 1148 , const size_t m 1149 , const size_t n 1150 , const size_t k 1151 , const double *A , const size_t lda 1152 , const double *B , const size_t ldb 1153 , double *C , const size_t ldc 1154 , int rec 1155 , const double & epsilon 1156 ) 1157 { 1158 Givaro::ZRing<double> NoField; 1159 // const double p = (double)F.characteristic(); 1160 size_t M = (n>m)?std::min(k,m):std::min(k,n); 1161 // std::cout << rec << ',' << M << std::endl; 1162 // Field G(p*p); 1163 1164 if ( M < TRE || rec <= 0) { 1165 // std::cout << "ffw" << std::endl; 1166 return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); 1167 // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); 1168 } 1169 1170 assert(k/2*2==k); // k divisible par 2 1171 assert(n/2*2==n); // n divisible par 2 1172 assert(m/3*3==m); // m divisible par 3 1173 1174 // std::cout << "tested" << std::endl; 1175 1176 size_t n2 = n/2; 1177 size_t k2 = k/2; 1178 size_t m3 = m/3; 1179 1180 // std::cout << "€ = " << epsilon << std::endl; 1181 1182 // sub matrices in A 1183 const double * A11 = A; 1184 const double * A12 = A +k2; 1185 const double * A21 = A +lda*m3; 1186 const double * A22 = A21 +k2; 1187 const double * A31 = A21 +lda*m3; 1188 const double * A32 = A31 +k2; 1189 1190 // sub matrices in C 1191 double * C11 = C; 1192 double * C12 = C +n2; 1193 double * C21 = C +ldc*m3; 1194 double * C22 = C21 +n2; 1195 double * C31 = C21 +ldc*m3; 1196 double * C32 = C31 +n2; 1197 1198 // sub matrices in B 1199 const double * B11 = B; 1200 const double * B12 = B +n2; 1201 const double * B21 = B +ldb*k2; 1202 const double * B22 = B21 +n2; 1203 1204 FFLAS::fzero(F,m,n,C,ldc); 1205 1206 /* 1207 * Algo : 1208 * S1 := A11 +A22; 1209 * S4 := e*A12+A22; 1210 * S5 := A11 +e*A12; 1211 * S3 := e*A31+A32; 1212 * S6 := A21 +A32; 1213 * S9 := A21 +e*A31; 1214 * 1215 * T1 := e*B11 +B22; 1216 * T2 := B21 +B22; 1217 * T3 := B11 +e*B21; 1218 * T4 := -e*B11+B21; 1219 * T5 := e*B12 +B22; 1220 * T6 := B11 +e*B22; 1221 * T7 := B11 +B12; 1222 * T9 := B12 -e*B22; 1223 * 1224 * P1 := S1 *T1; 1225 * P2 := A22*T2; 1226 * P10 := A11*B22; 1227 * P4 := S4 *T4; 1228 * P5 := S5 *T5; 1229 * P6 := S6 *T6; 1230 * P7 := A21*T7; 1231 * P8 := A32*B11; 1232 * P9 := S9 *T9; 1233 * P3:= S3*T3; 1234 * 1235 * C11 := (P1-P2-P10+P4)/e; 1236 * C12 := (P10-P5)/(-e) ; 1237 * C21 := P4+P6-P3 ; 1238 * C22 := P1-P5+P9; 1239 * C31 := (-P8+P3)/e; 1240 * C32 := (P6-P7-P8+P9)/e; 1241 * 1242 */ 1243 1244 double * U = FFLAS::fflas_new<double>(m3*n2); 1245 double * V = FFLAS::fflas_new<double>(m3*n2); 1246 double * X = FFLAS::fflas_new<double>(m3*std::max(k2,n2)); 1247 double * Y = FFLAS::fflas_new<double>(std::max(k2,m3)*n2); 1248 1249 // S4 1250 add(m3,k2,epsilon,A12,lda,A22,lda,X,k2); 1251 // T4 1252 add(k2,n2,-epsilon,B11,ldb,B21,ldb,Y,n2); 1253 // P4 1254 gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,U,n2,rec-1,epsilon); 1255 // S9 1256 add(m3,k2,epsilon,A31,lda,A21,lda,X,k2); 1257 // T9 1258 add(k2,n2,-epsilon,B22,ldb,B12,ldb,Y,n2); 1259 // P9 1260 gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,V,n2,rec-1,epsilon); 1261 // S5 1262 add(m3,k2,epsilon,A12,lda,A11,lda,X,k2); 1263 // T5 1264 add(k2,n2,epsilon,B12,ldb,B22,ldb,Y,n2); 1265 // P5 1266 gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,C12,ldc,rec-1,epsilon); 1267 // S3 1268 add(m3,k2,epsilon,A31,lda,A32,lda,X,k2); 1269 // T3 1270 add(k2,n2,epsilon,B21,ldb,B11,ldb,Y,n2); 1271 // P3 1272 gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,C31,ldc,rec-1,epsilon); 1273 // C22 = P9-P5 1274 FFLAS::fsub(NoField,m3,n2,V,n2,C12,ldc,C22,ldc); 1275 // C21 = P4-P3 1276 FFLAS::fsub(NoField,m3,n2,U,n2,C31,ldc,C21,ldc); 1277 // T2 1278 FFLAS::fadd(NoField,k2,n2,B21,ldb,B22,ldb,Y,n2); 1279 // P2 1280 gemm_bini_322_2(F,m3,n2,k2,A22,lda,Y,n2,X,n2,rec-1,epsilon); 1281 // XXX approximate 1282 // C11 = (P4 - P2) / e 1283 subscal(NoField,m3,n2,U,n2,X,n2,1./epsilon,C11,ldc); 1284 // T7 1285 FFLAS::fadd(NoField,k2,n2,B11,ldb,B12,ldb,Y,n2); 1286 // P7 1287 gemm_bini_322_2(F,m3,n2,k2,A21,lda,Y,n2,X,n2,rec-1,epsilon); 1288 // XXX approximate 1289 // C32 = (P9-P7) / e 1290 subscal(NoField,m3,n2,V,n2,X,n2,1./epsilon,C32,ldc); 1291 // S1 1292 FFLAS::fadd(NoField,m3,k2,A11,lda,A22,lda,X,k2); 1293 // T1 1294 add(k2,n2,epsilon,B11,ldb,B22,ldb,Y,n2); 1295 // P1 1296 gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,U,n2,rec-1,epsilon); 1297 // C22 += P1 1298 FFLAS::faddin(NoField,m3,n2,U,n2,C22,ldc); 1299 // P10 1300 gemm_bini_322_2(F,m3,n2,k2,A11,lda,B22,ldb,V,n2,rec-1,epsilon); 1301 // C12 = (P5-P10)/e 1302 subscalinf(NoField,m3,n2,V,n2,1./epsilon,C12,ldc); 1303 // XXX approximate 1304 // C11 = C11 + (P1-P10)/e 1305 subscalacc(NoField,m3,n2,U,n2,V,n2,1./epsilon,C11,ldc); 1306 // S6 1307 FFLAS::fadd(NoField,m3,k2,A21,lda,A32,lda,X,k2); 1308 // T6 1309 add(k2,n2,epsilon,B22,ldb,B11,ldb,Y,n2); 1310 // P6 1311 gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,U,n2,rec-1,epsilon); 1312 // C21 += P6 1313 FFLAS::faddin(NoField,m3,n2,U,n2,C21,ldc); 1314 // P8 1315 gemm_bini_322_2(F,m3,n2,k2,A32,lda,B11,ldb,V,n2,rec-1,epsilon); 1316 // C31 = (P3-P8)/2 1317 subscalinf(NoField,m3,n2,V,n2,1./epsilon,C31,ldc); 1318 // XXX approximate 1319 // C32 = C32 + (P6-P8)/e 1320 subscalacc(NoField,m3,n2,U,n2,V,n2,1./epsilon,C32,ldc); 1321 1322 1323 FFLAS::fflas_delete( X); 1324 FFLAS::fflas_delete( Y ); 1325 FFLAS::fflas_delete( U); 1326 FFLAS::fflas_delete( V); 1327 1328 1329 return C; 1330 1331 } 1332 1333 1334 // Field must be Givaro::Modular<double> 1335 template<class Field> 1336 double * 1337 gemm_bini_232_2(const Field & F 1338 , const size_t m 1339 , const size_t n 1340 , const size_t k 1341 , const double *A , const size_t lda 1342 , const double *B , const size_t ldb 1343 , double *C , const size_t ldc 1344 , int rec 1345 , const double & epsilon 1346 ) 1347 { 1348 Givaro::ZRing<double> NoField; 1349 // const double p = (double)F.characteristic(); 1350 size_t M = (n>m)?std::min(k,m):std::min(k,n); 1351 // Field G(p*p); 1352 1353 if ( M < TRE || rec <= 0) { 1354 // std::cout << "ffw" << std::endl; 1355 return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); 1356 // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); 1357 } 1358 1359 assert(k/3*3==k); // k divisible par 3 1360 assert(n/2*2==n); // n divisible par 2 1361 assert(m/2*2==m); // m divisible par 2 1362 1363 // std::cout << "tested" << std::endl; 1364 1365 size_t n2 = n/2; 1366 size_t k3 = k/3; 1367 size_t m2 = m/2; 1368 1369 // std::cout << "€ = " << epsilon << std::endl; 1370 1371 // sub matrices in B 1372 const double * B11 = B; 1373 const double * B12 = B +n2; 1374 const double * B21 = B +ldb*k3; 1375 const double * B22 = B21 +n2; 1376 const double * B31 = B21 +ldb*k3; 1377 const double * B32 = B31 +n2; 1378 1379 // sub matrices in C 1380 double * C11 = C; 1381 double * C12 = C +n2; 1382 double * C21 = C +ldc*m2; 1383 double * C22 = C21 +n2; 1384 1385 // sub matrices in A 1386 1387 const double * A11 = A; 1388 const double * A12 = A +k3; 1389 const double * A13 = A +2*k3; 1390 const double * A21 = A +lda*m2; 1391 const double * A22 = A21 +k3; 1392 const double * A23 = A21 +2*k3; 1393 1394 1395 FFLAS::fzero(F,m,n,C,ldc); 1396 1397 /* 1398 * Algo : 1399 * 1400 * S1 := A11 +A22*e; 1401 * S3 := -(A11+A21); 1402 * S4 := A11+A12*e; 1403 * S5 := A21 - A22*e; 1404 * S6 := A12*e + A23; 1405 * S8 := -(A13+A23): 1406 * S9 := A22*e + A23; 1407 * S10 := -A12*e+A13; 1408 * 1409 * T1 := B11 +B22; 1410 * T4 := e*B12+B22; 1411 * T5 := B11 +e*B12; 1412 * T6 := B21 +B32; 1413 * T9 := B21 + e*B31; 1414 * T10 := e*B31 +B32; 1415 * 1416 * P1 := Bini232(S1,T1 ,e); 1417 * P2 := Bini232(A11,B22 ,e); 1418 * P3 := Bini232(S3,B11,e); 1419 * P4 := Bini232(S4,T4 ,e); 1420 * P5 := Bini232(S5,T5 ,e); 1421 * P6 := Bini232(S6,T6 ,e); 1422 * P7 := Bini232(A23,B21 ,e); 1423 * P8 := Bini232(S8,B32,e); 1424 * P9 := Bini232(S9,T9 ,e); 1425 * P10:= Bini232(S10,T10,e); 1426 * 1427 * 1428 * C11 := evalm(P1-P4+(P6-P7+P8+P10)/e); 1429 * C12 := evalm((-P2+P4)/e+P10) ; 1430 * C21 := evalm(P5+(-P7+P9)/e) ; 1431 * C22 := evalm((P1-P2+P3+P5)/e+P6-P9); 1432 * 1433 */ 1434 1435 double * U = FFLAS::fflas_new<double>(m2*n2); 1436 double * V = FFLAS::fflas_new<double>(m2*n2); 1437 double * X = FFLAS::fflas_new<double>(m2*k3); 1438 double * Y = FFLAS::fflas_new<double>(k3*n2); 1439 1440 // S1 1441 add(m2,k3,epsilon,A22,lda,A11,lda,X,k3); 1442 // T1 1443 FFLAS::fadd(NoField,k3,n2,B11,ldb,B22,ldb,Y,n2); 1444 // P1 (in U) 1445 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); 1446 // S3 1447 negadd(m2,k3,A11,lda,A21,lda,X,k3); 1448 // P3 (in V) 1449 gemm_bini_232_2(F,m2,n2,k3,X,k3,B11,ldb,V,n2,rec-1,epsilon); 1450 // C22 = (P1+P3)/e 1451 // FFLAS::fadd(NoField,m2,n2,U,n2,V,n2,C22,ldc); // XXX acc 1452 addscal(NoField,m2,n2,U,n2,V,n2,(double)1/epsilon,C22,ldc); 1453 // S6 1454 add(m2,k3,epsilon,A12,lda,A23,lda,X,k3); 1455 // T6 1456 FFLAS::fadd(NoField,k3,n2,B21,ldb,B32,ldb,Y,n2); 1457 // P6 (in V) 1458 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon); 1459 // C22 += P6 1460 FFLAS::faddin(NoField,m2,n2,V,n2,C22,ldc); 1461 // S8 1462 negadd(m2,k3,A13,lda,A23,lda,X,k3); 1463 // P8 (in C11) 1464 gemm_bini_232_2(F,m2,n2,k3,X,k3,B32,ldb,C11,ldc,rec-1,epsilon); 1465 // C11 = (P8+P6)/e 1466 addscalinf(NoField,m2,n2,V,n2,(double)1/epsilon,C11,ldc); 1467 // C11 += P1 1468 FFLAS::faddin(NoField,m2,n2,U,n2,C11,ldc); 1469 // S4 1470 add(m2,k3,epsilon,A12,lda,A11,lda,X,k3); 1471 // T4 1472 add(k3,n2,epsilon,B12,ldb,B22,ldb,Y,n2); 1473 // P4 (in U) 1474 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); 1475 // C11 -= P4 1476 FFLAS::fsubin(NoField,m2,n2,U,n2,C11,ldc); 1477 // P2 (in C12) 1478 gemm_bini_232_2(F,m2,n2,k3,A11,lda,B22,ldb,C12,ldc,rec-1,epsilon); 1479 // S5 1480 add(m2,k3,-epsilon,A22,lda,A21,lda,X,k3); 1481 // T5 1482 add(k3,n2,epsilon,B12,ldb,B11,ldb,Y,n2); 1483 // P5 (in V) 1484 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon); 1485 // C22 += (P5-P2)/e 1486 subscalacc(NoField,m2,n2,V,n2,C12,ldc,(double)1/epsilon,C22,ldc); 1487 // C12 = (P4-P2)/e 1488 subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C12,ldc); 1489 // S9 1490 add(m2,k3,epsilon,A22,lda,A23,lda,X,k3); 1491 // T9 1492 add(k3,n2,epsilon,B31,ldb,B21,ldb,Y,n2); 1493 // P9 (in U) 1494 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); 1495 // C22 -= P9 1496 FFLAS::fsubin(NoField,m2,n2,U,n2,C22,ldc); 1497 // P7 (in C21) 1498 gemm_bini_232_2(F,m2,n2,k3,A23,lda,B21,ldb,C21,ldc,rec-1,epsilon); 1499 // C11 = C11 - P7/e 1500 add(m2,n2,-(double)1/epsilon,C21,ldc,C11,ldc,C11,ldc); 1501 // C21 = (P9-P7)/e 1502 subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C21,ldc); 1503 // C21 += P5 1504 FFLAS::faddin(NoField,m2,n2,V,n2,C21,ldc); 1505 // S10 1506 add(m2,k3,-epsilon,A12,lda,A13,lda,X,k3); 1507 // T10 1508 add(k3,n2,epsilon,B31,ldb,B32,ldb,Y,n2); 1509 // P10 (in U) 1510 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); 1511 // C12 += P10 1512 FFLAS::faddin(NoField,m2,n2,U,n2,C12,ldc); 1513 // C11 += P10/e 1514 add(m2,n2,(double)1/epsilon,U,n2,C11,ldc,C11,ldc); 1515 1516 1517 FFLAS::fflas_delete( X ); 1518 FFLAS::fflas_delete( Y ); 1519 FFLAS::fflas_delete( U ); 1520 FFLAS::fflas_delete( V ); 1521 1522 1523 return C; 1524 1525 } 1526 1527 template<class Field> 1528 double * 1529 gemm_bini_232_3_acc(const Field & F 1530 , const size_t m 1531 , const size_t n 1532 , const size_t k 1533 , const double *A , const size_t lda 1534 , const double *B , const size_t ldb 1535 , double *C , const size_t ldc 1536 , int rec 1537 , const double & epsilon 1538 ) 1539 { 1540 if (rec != 0) 1541 exit(-1); 1542 Givaro::DoubleDomain R; 1543 FFLAS::fgemm(R, 1544 FFLAS::FflasNoTrans,FFLAS::FflasNoTrans, 1545 m,n,k, 1546 1, 1547 A,lda, B,ldb, 1548 1, 1549 C, ldc); 1550 1551 1552 } 1553 1554 template<class Field> 1555 double * 1556 gemm_bini_232_3(const Field & F 1557 , const size_t m 1558 , const size_t n 1559 , const size_t k 1560 , const double *A , const size_t lda 1561 , const double *B , const size_t ldb 1562 , double *C , const size_t ldc 1563 , int rec 1564 , const double & epsilon 1565 ) 1566 { 1567 Givaro::ZRing<double> NoField; 1568 // const double p = (double)F.characteristic(); 1569 size_t M = (n>m)?std::min(k,m):std::min(k,n); 1570 // Field G(p*p); 1571 1572 if ( M < TRE || rec <= 0) { 1573 // std::cout << "ffw" << std::endl; 1574 return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); 1575 // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); 1576 } 1577 1578 assert(k/3*3==k); // k divisible par 3 1579 assert(n/2*2==n); // n divisible par 2 1580 assert(m/2*2==m); // m divisible par 2 1581 1582 // std::cout << "tested" << std::endl; 1583 1584 size_t n2 = n/2; 1585 size_t k3 = k/3; 1586 size_t m2 = m/2; 1587 1588 // std::cout << "€ = " << epsilon << std::endl; 1589 1590 // sub matrices in B 1591 const double * B11 = B; 1592 const double * B12 = B +n2; 1593 const double * B21 = B +ldb*k3; 1594 const double * B22 = B21 +n2; 1595 const double * B31 = B21 +ldb*k3; 1596 const double * B32 = B31 +n2; 1597 1598 // sub matrices in C 1599 double * C11 = C; 1600 double * C12 = C +n2; 1601 double * C21 = C +ldc*m2; 1602 double * C22 = C21 +n2; 1603 1604 // sub matrices in A 1605 1606 const double * A11 = A; 1607 const double * A12 = A +k3; 1608 const double * A13 = A +2*k3; 1609 const double * A21 = A +lda*m2; 1610 const double * A22 = A21 +k3; 1611 const double * A23 = A21 +2*k3; 1612 1613 1614 FFLAS::fzero(F,m,n,C,ldc); 1615 1616 /* 1617 * Algo : 1618 * 1619 * S1 := A11 +A22*e; 1620 * S3 := -(A11+A21); 1621 * S4 := A11+A12*e; 1622 * S5 := A21 - A22*e; 1623 * S6 := A12*e + A23; 1624 * S8 := -(A13+A23): 1625 * S9 := A22*e + A23; 1626 * S10 := -A12*e+A13; 1627 * 1628 * T1 := B11 +B22; 1629 * T4 := e*B12+B22; 1630 * T5 := B11 +e*B12; 1631 * T6 := B21 +B32; 1632 * T9 := B21 + e*B31; 1633 * T10 := e*B31 +B32; 1634 * 1635 * P1 := Bini232(S1,T1 ,e); 1636 * P2 := Bini232(A11,B22 ,e); 1637 * P3 := Bini232(S3,B11,e); 1638 * P4 := Bini232(S4,T4 ,e); 1639 * P5 := Bini232(S5,T5 ,e); 1640 * P6 := Bini232(S6,T6 ,e); 1641 * P7 := Bini232(A23,B21 ,e); 1642 * P8 := Bini232(S8,B32,e); 1643 * P9 := Bini232(S9,T9 ,e); 1644 * P10:= Bini232(S10,T10,e); 1645 * 1646 * 1647 * C11 := evalm(P1-P4+(P6-P7+P8+P10)/e); 1648 * C12 := evalm((-P2+P4)/e+P10) ; 1649 * C21 := evalm(P5+(-P7+P9)/e) ; 1650 * C22 := evalm((P1-P2+P3+P5)/e+P6-P9); 1651 * 1652 */ 1653 1654 // could be just one band for the scalings 1655 1656 1657 1658 double * U = FFLAS::fflas_new<double>(m2*n2); 1659 double * V = FFLAS::fflas_new<double>(std::max(k3,m2)*n2); 1660 double * X = FFLAS::fflas_new<double>(m2*k3); 1661 double * Y = FFLAS::fflas_new<double>(k3*n2); 1662 1663 // S1 1664 double * eA22 = FFLAS::fflas_new<double>(std::max(m2,n2)*k3); 1665 FFLAS::fscal(NoField,m2,k3,epsilon,A22,lda,eA22,k3); 1666 FFLAS::fadd(NoField,m2,k3,eA22,k3,A11,lda,X,k3); 1667 // T1 1668 FFLAS::fadd(NoField,k3,n2,B11,ldb,B22,ldb,Y,n2); 1669 // P1 (in U) 1670 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); 1671 // S3 1672 negadd(m2,k3,A11,lda,A21,lda,X,k3); 1673 // P3 (in V) 1674 gemm_bini_232_2(F,m2,n2,k3,X,k3,B11,ldb,V,n2,rec-1,epsilon); 1675 // C22 = (P1+P3)/e 1676 addscal(NoField,m2,n2,U,n2,V,n2,(double)1/epsilon,C22,ldc); 1677 // S6 1678 double * eA12 = FFLAS::fflas_new<double>(m2*k3); 1679 FFLAS::fscal(NoField,m2,k3,epsilon,A12,lda,eA12,k3); 1680 FFLAS::fadd(NoField,m2,k3,eA12,k3,A23,lda,X,k3); 1681 // T6 1682 FFLAS::fadd(NoField,k3,n2,B21,ldb,B32,ldb,Y,n2); 1683 // P6 (in V) 1684 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon); 1685 // C22 += P6 1686 FFLAS::faddin(NoField,m2,n2,V,n2,C22,ldc); 1687 // S8 1688 negadd(m2,k3,A13,lda,A23,lda,X,k3); 1689 // P8 (in C11) 1690 gemm_bini_232_2(F,m2,n2,k3,X,k3,B32,ldb,C11,ldc,rec-1,epsilon); 1691 // C11 = (P8+P6)/e 1692 addscalinf(NoField,m2,n2,V,n2,(double)1/epsilon,C11,ldc); 1693 // C11 += P1 1694 FFLAS::faddin(NoField,m2,n2,U,n2,C11,ldc); 1695 // S4 1696 FFLAS::fadd(NoField,m2,k3,eA12,k3,A11,lda,X,k3); 1697 // T4 1698 double * eB12 = V ; // FFLAS::fflas_new<double>(n2*k3); 1699 FFLAS::fscal(NoField,k3,n2,epsilon,B12,ldb,eB12,n2); 1700 FFLAS::fadd(NoField,k3,n2,eB12,n2,B22,ldb,Y,n2); 1701 // P4 (in U) 1702 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); 1703 // C11 -= P4 1704 FFLAS::fsubin(NoField,m2,n2,U,n2,C11,ldc); 1705 // P2 (in C12) 1706 gemm_bini_232_2(F,m2,n2,k3,A11,lda,B22,ldb,C12,ldc,rec-1,epsilon); 1707 // S5 1708 FFLAS::fsub(NoField,m2,k3,A21,lda,eA22,k3,X,k3); 1709 // T5 1710 FFLAS::fadd(NoField,k3,n2,eB12,n2,B11,ldb,Y,n2); 1711 // FFLAS::fflas_delete( eB12); 1712 // P5 (in V) 1713 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon); 1714 // C22 += (P5-P2)/e 1715 subscalacc(NoField,m2,n2,V,n2,C12,ldc,(double)1/epsilon,C22,ldc); 1716 // C12 = (P4-P2)/e 1717 subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C12,ldc); 1718 // S9 1719 FFLAS::fadd(NoField,m2,k3,eA22,k3,A23,lda,X,k3); 1720 double * eB31 = eA22 ; 1721 FFLAS::fscal(NoField,k3,n2,epsilon,B31,ldb,eB31,n2); 1722 // T9 1723 FFLAS::fadd(NoField,k3,n2,eB31,n2,B21,ldb,Y,n2); 1724 // P9 (in U) 1725 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); 1726 // C22 -= P9 1727 FFLAS::fsubin(NoField,m2,n2,U,n2,C22,ldc); 1728 // P7 (in C21) 1729 gemm_bini_232_2(F,m2,n2,k3,A23,lda,B21,ldb,C21,ldc,rec-1,epsilon); 1730 // C11 = C11 - P7/e 1731 add(m2,n2,-(double)1/epsilon,C21,ldc,C11,ldc,C11,ldc); 1732 // C21 = (P9-P7)/e 1733 subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C21,ldc); 1734 // C21 += P5 1735 FFLAS::faddin(NoField,m2,n2,V,n2,C21,ldc); 1736 // S10 1737 FFLAS::fsub(NoField,m2,k3,A13,lda,eA12,k3,X,k3); 1738 FFLAS::fflas_delete( eA12); 1739 // T10 1740 FFLAS::fadd(NoField,k3,n2,eB31,n2,B32,ldb,Y,n2); 1741 FFLAS::fflas_delete( eA22); 1742 // P10 (in U) 1743 gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon); 1744 // C12 += P10 1745 FFLAS::faddin(NoField,m2,n2,U,n2,C12,ldc); 1746 // C11 += P10/e 1747 add(m2,n2,(double)1/epsilon,U,n2,C11,ldc,C11,ldc); 1748 1749 1750 FFLAS::fflas_delete( X ); 1751 FFLAS::fflas_delete( Y ); 1752 FFLAS::fflas_delete( U ); 1753 FFLAS::fflas_delete( V ); 1754 1755 1756 return C; 1757 1758 } 1759 1760 #if 0 1761 template<class Field> 1762 double * 1763 gemm_bini_322_sqrt(const Field & F 1764 , const size_t m 1765 , const size_t n 1766 , const size_t k 1767 , const double *A , const size_t lda 1768 , const double *B , const size_t ldb 1769 , double *C , const size_t ldc 1770 , int rec 1771 , const double & epsilon 1772 ) 1773 { 1774 Givaro::ZRing<double> NoField; 1775 // const double p = (double)F.characteristic(); 1776 size_t M = (n>m)?std::min(k,m):std::min(k,n); 1777 // std::cout << rec << ',' << M << std::endl; 1778 // Field G(p*p); 1779 1780 if ( M < TRE || rec <= 0) { 1781 // std::cout << "ffw" << std::endl; 1782 return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc); 1783 // return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc); 1784 } 1785 1786 assert(k/2*2==k); // k divisible par 2 1787 assert(n/3*3==n); // n divisible par 2 1788 assert(m/2*2==m); // m divisible par 3 1789 1790 // std::cout << "tested" << std::endl; 1791 1792 size_t m2 = m/2; 1793 size_t k2 = k/2; 1794 size_t n3 = n/3; 1795 1796 // std::cout << "€ = " << epsilon << std::endl; 1797 1798 // sub matrices in A 1799 const double * A11 = A; 1800 const double * A12 = A +k2; 1801 const double * A21 = A +lda*m2; 1802 const double * A22 = A21 +k2; 1803 1804 // sub matrices in C 1805 double * C11 = C; 1806 double * C12 = C +n3; 1807 double * C13 = C +2*n3; 1808 double * C21 = C +ldc*m2; 1809 double * C22 = C21 +n3; 1810 double * C23 = C21 +2*n3; 1811 1812 1813 1814 // sub matrices in B 1815 const double * B11 = B; 1816 const double * B12 = B +n3; 1817 const double * B13 = B +2*n3; 1818 const double * B21 = B +ldb*k2; 1819 const double * B22 = B21 +n3; 1820 const double * B23 = B21 +2*n3; 1821 1822 1823 FFLAS::fzero(F,m,n,C,ldc); 1824 1825 /* 1826 * Algo : 1827 * S1 := B11 +B22; 1828 * S4 := e*B21+B22; 1829 * S5 := B11 +e*B21; 1830 * S6 := B12 +B23; 1831 * S9 := B12 +e*B13; 1832 * S3 := e*B13+B23; 1833 * 1834 * T1 := e*A11 +A22; 1835 * T2 := A12 +A22; 1836 * T4 := -e*A11+A12; 1837 * T5 := e*A21 +A22; 1838 * T6 := A11 +e*A22; 1839 * T7 := A11 +A21; 1840 * T9 := A21 -e*A22; 1841 * T3 := A11 +e*A12; 1842 * 1843 * P1 := S1 *T1; 1844 * P2 := T2 * B22; 1845 * P10 := A22 * B11; 1846 * P4 := S4 *T4; 1847 * P5 := S5 *T5; 1848 * P6 := S6 *T6; 1849 * P7 := T7*B12; 1850 * P8 := A11*B23; 1851 * P9 := S9 *T9; 1852 * P3 := S3*T3; 1853 * 1854 * C11 := (P1-P2-P10+P4)/e; 1855 * C21 := (P10-P5)/(-e) ; 1856 * C12 := P4+P6-P3 ; 1857 * C22 := P1-P5+P9; 1858 * C13 := (-P8+P3)/e; 1859 * C23 := (P6-P7-P8+P9)/e; 1860 * 1861 */ 1862 1863 1864 // P10 1865 gemm_bini_223_mem(F,m2,n3,k2,A22,lda,B11,ldb,C11,ldc,rec-1,epsilon); 1866 // S5 1867 double * Y = FFLAS::fflas_new<double>(k2*n3); 1868 add(k2,n3,epsilon,B21,ldb,B11,ldb,Y,n3); 1869 // T5 1870 double * X = FFLAS::fflas_new<double>(m2*k2); 1871 add(m2,k2,epsilon,A21,lda,A22,lda,X,k2); 1872 // P5 1873 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C22,ldc,rec-1,epsilon); 1874 // C12 1875 subscal(NoField,m2,n3,C22,ldc,C11,ldc,(double)1/epsilon,C21,ldc); 1876 // T2 1877 FFLAS::fadd(NoField,m2,k2,A12,lda,A22,lda,X,k2); 1878 // P2 1879 gemm_bini_223_mem(F,m2,n3,k2,X,k2,B22,ldb,C13,ldc,rec-1,epsilon); 1880 // C11 1881 FFLAS::faddin(NoField,m2,n3,C13,ldc,C11,ldc); 1882 // S1 1883 FFLAS::fadd(NoField,k2,n3,B11,ldb,B22,ldb,Y,n3); 1884 // T1 1885 add(m2,k2,epsilon,A11,lda,A22,lda,X,k2); 1886 // P1 1887 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon); 1888 // C22 1889 FFLAS::fsub(NoField,m2,n3,C12,ldc,C22,ldc,C22,ldc); 1890 // C11 1891 FFLAS::fsub(NoField,m2,n3,C12,ldc,C11,ldc,C11,ldc); 1892 // S4 1893 add(k2,n3,epsilon,B21,ldb,B22,ldb,Y,n3); 1894 // T4 1895 add(m2,k2,-epsilon,A11,lda,A12,lda,X,k2); 1896 // P4 1897 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon); 1898 // C11 1899 addscalinf(NoField,m2,n3,C12,ldc,(double)1/epsilon,C11,ldc); 1900 // S9 1901 add(k2,n3,epsilon,B13,ldb,B12,ldb,Y,n3); 1902 // T9 1903 add(m2,k2,-epsilon,A22,lda,A21,lda,X,k2); 1904 // P9 1905 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C23,ldc,rec-1,epsilon); 1906 // C22 1907 FFLAS::faddin(NoField,m2,n3,C23,ldc,C22,ldc); 1908 // S6 1909 FFLAS::fadd(NoField,k2,n3,B12,ldb,B23,ldb,Y,n3); 1910 // T6 1911 add(m2,k2,epsilon,A22,lda,A11,lda,X,k2); 1912 // P6 1913 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon); 1914 // C21 1915 FFLAS::faddin(NoField,m2,n3,C13,ldc,C12,ldc); 1916 // C32 1917 FFLAS::faddin(NoField,m2,n3,C13,ldc,C23,ldc); 1918 // T7 1919 FFLAS::fadd(NoField,m2,k2,A11,lda,A21,lda,X,k2); 1920 // P7 1921 gemm_bini_223_mem(F,m2,n3,k2,X,k2,B12,ldb,C13,ldc,rec-1,epsilon); 1922 // if (epsilon > 1 && rec == 2) { FFLAS::finit(G,m2,n3,C31,ldc);} 1923 // C32 1924 FFLAS::fsubin(NoField,m2,n3,C13,ldc,C23,ldc); 1925 // S3 1926 add(k2,n3,epsilon,B13,ldb,B23,ldb,Y,n3); 1927 // T3 1928 add(m2,k2,epsilon,A12,lda,A11,lda,X,k2); 1929 // P3 1930 gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon); 1931 FFLAS::fflas_delete( Y ); 1932 FFLAS::fflas_delete( X ); 1933 // C21 1934 FFLAS::fsubin(NoField,m2,n3,C13,ldc,C12,ldc); 1935 // P8 1936 Y = FFLAS::fflas_new<double>(m2*n3); 1937 gemm_bini_223_mem(F,m2,n3,k2,A11,lda,B23,ldb,Y,n3,rec-1,epsilon); 1938 // C31 1939 subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C13,ldc); 1940 // C32 1941 subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C23,ldc); 1942 FFLAS::fflas_delete( Y ); 1943 1944 1945 return C; 1946 1947 } 1948 #endif 1949 1950 1951 } // Rec 1952 } // Protected 1953 } // FFLAS 1954 1955 namespace FFLAS { namespace Protected { 1956 1957 template<class Field> 1958 typename Field::Element * 1959 gemm_bini_p(const Field &F 1960 , const size_t m 1961 , const size_t n 1962 , const size_t k 1963 , const typename Field::Element *A 1964 , const size_t lda 1965 , const typename Field::Element *B 1966 , const size_t ldb 1967 , typename Field::Element *C 1968 , const size_t ldc 1969 , int rec 1970 , size_t algo 1971 ) 1972 { 1973 1974 assert(k/6*6==k); // k divisible par 6 1975 assert(n/6*6==n); // n divisible par 6 1976 assert(m/6*6==m); // m divisible par 6 1977 1978 // e-formule 1979 double epsilon = (double) F.characteristic() ; 1980 switch(algo) { 1981 case 0 : 1982 Rec::gemm_bini_322_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 1983 FFLAS::finit_fuzzy(F,m,n,C,ldc); 1984 // FFLAS::finit(F,m,n,C,ldc); 1985 break; 1986 case 1 : 1987 Rec::gemm_bini_322_0(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 1988 FFLAS::finit_fuzzy(F,m,n,C,ldc); 1989 // FFLAS::finit(F,m,n,C,ldc); 1990 break; 1991 case 2 : 1992 Rec::gemm_bini_322_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 1993 FFLAS::finit_fuzzy(F,m,n,C,ldc); 1994 break; 1995 case 3 : 1996 Rec::gemm_bini_223_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 1997 FFLAS::finit_fuzzy(F,m,n,C,ldc); 1998 // FFLAS::finit(F,m,n,C,ldc); 1999 break; 2000 case 4 : 2001 Rec::gemm_bini_232_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 2002 FFLAS::finit_fuzzy(F,m,n,C,ldc); 2003 break; 2004 case 5 : 2005 Rec::gemm_bini_232_3(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 2006 FFLAS::finit_fuzzy(F,m,n,C,ldc); 2007 break; 2008 #if 0 2009 case 8 : { 2010 double epsilon2 = sqrt((double)epsilon); 2011 std::cout << epsilon2 << std::endl; 2012 Rec::gemm_bini_322_sqrt(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon2); 2013 // FFLAS::finit_fuzzy(F,m,n,C,ldc); 2014 for(size_t i = 0 ; i < m ; ++i) { 2015 for(size_t j = 0 ; j < n ; ++j) 2016 C[i*ldc+j] = rint(fmod(C[i*ldc+j],epsilon2)); 2017 } 2018 break; 2019 } 2020 #endif 2021 default : 2022 std::cout << " not an algo :" << algo << std::endl;; 2023 exit(-1); 2024 } 2025 2026 2027 2028 return C; 2029 2030 } 2031 2032 template<class Field> 2033 typename Field::Element * 2034 gemm_bini_e(const Field &F 2035 , const size_t m 2036 , const size_t n 2037 , const size_t k 2038 , const typename Field::Element *A 2039 , const size_t lda 2040 , const typename Field::Element *B 2041 , const size_t ldb 2042 , typename Field::Element *C 2043 , const size_t ldc 2044 , int rec 2045 , size_t algo 2046 ) 2047 { 2048 2049 assert(k/2*2==k); // k divisible par 2 2050 assert(n/2*2==n); // n divisible par 2 2051 assert(m/3*3==m); // m divisible par 3 2052 2053 // e-formule 2054 double epsilon = 1./(1<<27); 2055 switch(algo) { 2056 case 0 : 2057 Rec::gemm_bini_322_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 2058 break; 2059 case 1 : 2060 Rec::gemm_bini_322_0(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 2061 break; 2062 case 2 : 2063 Rec::gemm_bini_322_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 2064 break; 2065 case 3 : 2066 Rec::gemm_bini_223_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 2067 break; 2068 case 4 : 2069 Rec::gemm_bini_232_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 2070 break; 2071 case 5 : 2072 Rec::gemm_bini_232_3(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon); 2073 break; 2074 default : 2075 std::cout << " not an algo :" << algo << std::endl;; 2076 exit(-1); 2077 } 2078 2079 2080 // vire les e. 2081 // FFLAS::finit_fuzzy(F,m,n,C,ldc); 2082 FFLAS::finit_fuzzy(F,m,n,C,ldc); 2083 2084 return C; 2085 2086 } 2087 2088 template<class Field> 2089 typename Field::Element * 2090 gemm_compress(const Field &F 2091 , const size_t m 2092 , const size_t n 2093 , const size_t k 2094 , const typename Field::Element *A 2095 , const size_t lda 2096 , const typename Field::Element *B 2097 , const size_t ldb 2098 , typename Field::Element *C 2099 , const size_t ldc 2100 , int rec 2101 , size_t algo 2102 ) 2103 { 2104 2105 assert(k/6*6==k); // k divisible par 6 2106 assert(n/6*6==n); // n divisible par 6 2107 assert(m/6*6==m); // m divisible par 6 2108 2109 switch(algo) { 2110 case 0 : 2111 fgemm_compressed<Field,true>(F,(int)m,(int)n,(int)k,A,(int)lda,B,(int)ldb,C,(int)ldc); 2112 FFLAS::freduce(F,m,n,C,ldc); 2113 break; 2114 case 1 : 2115 fgemm_compressed<Field,false>(F,(int)m,(int)n,(int)k,A,(int)lda,B,(int)ldb,C,(int)ldc); 2116 FFLAS::freduce(F,m,n,C,ldc); 2117 break; 2118 default : 2119 std::cout << " not an algo :" << algo << std::endl;; 2120 exit(-1); 2121 } 2122 2123 2124 2125 return C; 2126 2127 } 2128 2129 } // Protected 2130 } // FFLAS 2131 2132 template<class Field> 2133 void check_equal(const Field & F,int m,int n, 2134 typename Field::Element * D,int ldd, 2135 typename Field::Element * E,int lde, 2136 const char * nomalgo, const char * madescr, int & ok_p) 2137 { 2138 int faux = 0 ; 2139 for (int i = 0 ; i < m ; ++i) { 2140 for (int j = 0 ; j < n ; ++j) { 2141 if (!F.areEqual(D[i*ldd+j],E[i*lde+j])) { 2142 ++faux ; 2143 } 2144 } 2145 } 2146 if (faux) { 2147 std::cout << nomalgo << " " << madescr << " : bad/all = " << faux << '/' << m*n << " ~~ " << (double)faux/(double)(m*n) << std::endl; 2148 } 2149 else ok_p ++ ; 2150 2151 2152 #if 1 2153 if (faux && (n<20)) { 2154 std::cout << "OK" << std::endl; 2155 for (int i = 0 ; i < m ; ++i) { 2156 for (int j = 0 ; j < n ; ++j) 2157 std::cout << D[i*ldd+j] << ' '; 2158 std::cout << std::endl; 2159 } 2160 std::cout << "KO" << std::endl; 2161 for (int i = 0 ; i < m ; ++i) { 2162 for (int j = 0 ; j < n ; ++j) 2163 std::cout << E[i*lde+j] << ' '; 2164 std::cout << std::endl; 2165 } 2166 2167 2168 std::cout << "Diff" << std::endl; 2169 for (int i = 0 ; i < m ; ++i) { 2170 for (int j = 0 ; j < n ; ++j) 2171 std::cout << D[i*ldd+j]-E[i*lde+j] << ' '; 2172 std::cout << std::endl; 2173 } 2174 } 2175 #endif 2176 } 2177 2178 2179 template<class Field> 2180 void test_algos(const Field &F, int m, int n, int k 2181 , const typename Field::Element * A, int lda 2182 , const typename Field::Element * B, int ldb 2183 , int r 2184 , time_v & tim_k, time_v & tim_e , time_v & tim_p 2185 , int_v & ok_k, int_v & ok_e, int_v & ok_p 2186 , FFLAS::Timer & tim_wd, int & nb_wd 2187 , bool with_e 2188 , bool with_k 2189 ) 2190 { 2191 FFLAS::Timer tmp ; 2192 typedef typename Field::Element Element; 2193 2194 Element * D = FFLAS::fflas_new<Element>(m*n); 2195 Element * C = FFLAS::fflas_new<Element>(m*n); 2196 2197 tmp.clear();tmp.start(); 2198 fgemm(F,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans, 2199 m,n,k, 1, A,k, B,n, 0, D, n); 2200 tmp.stop(); tim_wd += tmp ; nb_wd ++; 2201 2202 /* bini_p */ 2203 if (with_e) { 2204 for (int algo = 0 ; algo < algos ; ++algo) { 2205 tmp.clear();tmp.start(); 2206 FFLAS::Protected::gemm_bini_e(F,m,n,k,A,k,B,n,C,n,r,selec[algo]); 2207 tmp.stop(); tim_e[algo] += tmp ; 2208 2209 /* checking */ 2210 check_equal(F,m,n,D,n,C,n,"bini_e",descr[algo],ok_e[algo]) ; 2211 } 2212 } 2213 2214 /* compress */ 2215 if (with_k && std::is_same<typename FieldTraits<Field>::category,FFLAS::FieldCategories::ModularTag>::value && (! FieldTraits<Field>::balanced)) { 2216 for (int algo = 0 ; algo < algos_k ; ++algo) { 2217 tmp.clear();tmp.start(); 2218 FFLAS::Protected::gemm_compress(F,m,n,k,A,k,B,n,C,n,r,selec_k[algo]); 2219 tmp.stop(); tim_k[algo] += tmp ; 2220 2221 /* checking */ 2222 check_equal(F,m,n,D,n,C,n,"compress",descr_k[algo],ok_k[algo]) ; 2223 2224 2225 } 2226 } 2227 2228 /* bini_p */ 2229 for (int algo = 0 ; algo < algos ; ++algo) { 2230 tmp.clear();tmp.start(); 2231 FFLAS::Protected::gemm_bini_p(F,m,n,k,A,k,B,n,C,n,r,selec[algo]); 2232 tmp.stop(); tim_p[algo] += tmp ; 2233 2234 /* checking */ 2235 check_equal(F,m,n,D,n,C,n,"bini_p",descr[algo],ok_p[algo]) ; 2236 2237 2238 } 2239 2240 FFLAS::fflas_delete(C); 2241 FFLAS::fflas_delete(D); 2242 } 2243 2244 template<class T> 2245 struct changeField { 2246 typedef T other ; 2247 }; 2248 2249 template<> 2250 struct changeField<Modular<double> > { 2251 typedef Givaro::Modular<float> other; 2252 }; 2253 2254 template<> 2255 struct changeField<ModularBalanced<double> > { 2256 typedef ModularBalanced<float> other; 2257 }; 2258 2259 double descrip(int algo, int_v & ok_e, time_v & tim_e, int iters, const char ** madescr, const char * nom) 2260 { 2261 int min_e = -1 ; 2262 double bini_e = -1 ; 2263 for (int i = 0 ; i < algo ; ++i){ 2264 if (ok_e[i] == (int)iters) { 2265 double bini1 = tim_e[i].usertime()/(double)ok_e[i] ; 2266 if (bini_e < 0) { 2267 bini_e = bini1; 2268 min_e = (int) i ; 2269 } 2270 else if (bini1 < bini_e) { 2271 min_e = (int)i ; 2272 bini_e = bini1 ; 2273 } 2274 } 2275 } 2276 for (int i = 0 ; i < algo ; ++i){ 2277 if (ok_e[i] == (int)iters) { 2278 double bini1 = tim_e[i].usertime()/(double)ok_e[i] ; 2279 std::cout << nom << " ( " << madescr[i] << " ) : " ; 2280 if ((int)i == min_e) std::cout << " * " ; 2281 else std::cout << " "; 2282 std::cout << bini1 << 's'<< std::endl; 2283 } 2284 } 2285 2286 return bini_e ; 2287 } 2288 2289 2290 template<class Field> 2291 void test(int m, int k, int n, int p, int r, bool with_e, bool with_k, int iters = 4, uint64_t seed=0) 2292 { 2293 2294 typedef typename Field::Element Element; 2295 2296 Element * A = FFLAS::fflas_new<Element>(m*k); 2297 Element * B = FFLAS::fflas_new<Element>(n*k); 2298 2299 2300 Field F(p); 2301 typename Field::RandIter G(F,0,seed); 2302 F.write(std::cout<< " * Field " ) << std::endl; 2303 2304 typedef typename changeField<Field>::other Field_f ; 2305 typedef typename Field_f::Element Element_f ; 2306 Field_f F_f(p); 2307 Element_f * A_f = FFLAS::fflas_new<Element_f>(m*k); 2308 Element_f * B_f = FFLAS::fflas_new<Element_f>(n*k); 2309 Element_f * C_f = FFLAS::fflas_new<Element_f>(m*n); 2310 2311 #if defined(NOTRANDOM) 2312 int i0 ; 2313 int j0 ; 2314 Element p2 ; F.init(p2,(int)F.mOne/2); 2315 std::cout << p2 << std::endl; 2316 #warning "not random" 2317 for (int i = 0 ; i < m ; ++i) 2318 for (int j = 0 ; j < k ; ++j) { 2319 i0 = i/(m/3); 2320 j0 = j/(k/2); 2321 if (i0 == 0 and j0 == 0) A[i*k+j] = F.mOne ; 2322 else if (i0 == 0 and j0 == 1) A[i*k+j] = F.zero ; 2323 else if (i0 == 1 and j0 == 0) A[i*k+j] = F.mOne ; 2324 else if (i0 == 1 and j0 == 1) A[i*k+j] = F.mOne ; 2325 else if (i0 == 2 and j0 == 0) A[i*k+j] = F.mOne ; 2326 else if (i0 == 2 and j0 == 1) A[i*k+j] = F.mOne ; 2327 else A[i*k+j] = F.mOne ; 2328 } 2329 for (int i = 0 ; i < k ; ++i) 2330 for (int j = 0 ; j < n ; ++j) { 2331 i0 = i/(k/2); 2332 j0 = j/(n/2); 2333 if (i0 == 0 and j0 == 0) B[i*n+j] = F.mOne ; 2334 else if (i0 == 0 and j0 == 1) B[i*n+j] = F.mOne ; 2335 else if (i0 == 1 and j0 == 0) B[i*n+j] = F.mOne ; 2336 else if (i0 == 1 and j0 == 1) B[i*n+j] = F.zero ; 2337 else B[i*n+j] = F.mOne ; 2338 2339 } 2340 #endif 2341 2342 time_v tim_e(algos), tim_p(algos), tim_k(algos_k); 2343 FFLAS::Timer tim_wd; tim_wd.clear(); 2344 FFLAS::Timer tim_wf; tim_wf.clear(); 2345 FFLAS::Timer tmp; 2346 for (int i = 0 ; i < algos ; ++i) { 2347 tim_e[i].clear(); 2348 tim_p[i].clear(); 2349 } 2350 for (int i = 0 ; i < algos_k ; ++i) { 2351 tim_k[i].clear(); 2352 } 2353 2354 int_v ok_p(algos,0), ok_e(algos,0), ok_k(algos_k,0); 2355 int nb_wd = 0 , nb_wf = 0 ; 2356 2357 for (int b = 0 ; b < iters ; ++b) { 2358 std::cout << "iter " << b+1 << " of " << iters << std::endl; 2359 #if not defined(NOTRANDOM) 2360 FFPACK::RandomMatrix(F, m, k, A, k, G); 2361 FFPACK::RandomMatrix(F, k, n, B, n, G); 2362 #endif 2363 FFLAS::finit(F_f,m,k,A,k,A_f,k); 2364 FFLAS::finit(F_f,k,n,B,n,B_f,n); 2365 2366 tmp.clear();tmp.start(); 2367 fgemm(F_f,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans, 2368 m,n,k, 1, A_f,k, B_f,n, 0, C_f, n); 2369 tmp.stop(); tim_wf += tmp ; nb_wf ++ ; 2370 2371 test_algos(F,m,n,k,A,k,B,n,r, 2372 tim_k,tim_e,tim_p, 2373 ok_k,ok_e,ok_p, 2374 tim_wd,nb_wd, 2375 with_e,with_k); 2376 } 2377 std::cout << std::endl << "results" << std::endl; 2378 2379 double bini_e = descrip(algos,ok_e,tim_e,iters,descr,"Bini_e"); 2380 double bini_p = descrip(algos,ok_p,tim_p,iters,descr,"Bini_p"); 2381 double bini_k = descrip(algos_k,ok_k,tim_k,iters,descr_k,"Bini_k"); 2382 2383 2384 double t_wd = tim_wd.usertime()/(double)(nb_wd); 2385 double t_wf = tim_wf.usertime()/(double)(nb_wf); 2386 2387 std::cout << "Wino d : " << t_wd << 's'<< std::endl; 2388 std::cout << "Wino f : " << t_wf << 's'<< std::endl; 2389 double wino = std::min(t_wd,t_wf) ; 2390 if (bini_e>=0) 2391 std::cout << "Gain e: " << ((bini_e-wino)/wino)*100 << '%' << std::endl; 2392 if (bini_p>=0) 2393 std::cout << "Gain p: " << ((bini_p-wino)/wino)*100 << '%' << std::endl; 2394 if (bini_k>=0) 2395 std::cout << "Gain k: " << ((bini_k-wino)/wino)*100 << '%' << std::endl; 2396 2397 2398 2399 2400 FFLAS::fflas_delete( A ); 2401 FFLAS::fflas_delete( B); 2402 2403 FFLAS::fflas_delete( A_f ); 2404 FFLAS::fflas_delete( B_f); 2405 FFLAS::fflas_delete( C_f); 2406 } 2407 2408 2409 2410 int main(int ac, char **av) { 2411 static int m = 36 ; 2412 static int n = 12 ; 2413 static int k = 18 ; 2414 static int p = 101; 2415 bool eps = false ; 2416 bool kom = false ; 2417 int r = 1 ; 2418 uint64_t seed = getSeed(); 2419 int iters = 4; 2420 2421 static Argument as[] = { 2422 { 'p', "-p P", "Set the field characteristic.", TYPE_INT , &p }, 2423 { 'n', "-n N", "Set the number of cols in C.", TYPE_INT , &n }, 2424 { 'm', "-m N", "Set the number of rows in C.", TYPE_INT , &m }, 2425 { 'k', "-k N", "Set the number of rows in B.", TYPE_INT , &k }, 2426 { 'r', "-k N", "Set the recursive number Bini.", TYPE_INT , &r }, 2427 { 'i', "-i N", "Set the numebr of iterations.", TYPE_INT , &iters }, 2428 { 's', "-s N", "Set the seed .", TYPE_UINT64 , &seed }, 2429 { 'e', "-e " , "epsilon .", TYPE_NONE , &eps }, 2430 { 'c', "-c " , "compress .", TYPE_NONE , &kom}, 2431 END_OF_ARGUMENTS 2432 }; 2433 FFLAS::parseArguments(ac,av,as); 2434 2435 srand(seed); 2436 srand48(seed); 2437 2438 std::cout << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl; 2439 std::cout << "size: " << m << ',' << k << ',' << n << std::endl; 2440 std::cout << "mod : " << p << std::endl; 2441 std::cout << "rec : " << r << std::endl; 2442 std::cout << "seed: " << seed << std::endl; 2443 std::cout << "thre: " << TRE << std::endl; 2444 std::cout << "=====================================================" << std::endl; 2445 test<Modular<double> > (m,k,n,p,r,eps,kom,iters,seed); 2446 std::cout << "=====================================================" << std::endl; 2447 test<ModularBalanced<double> > (m,k,n,p,r,eps,kom,iters,seed); 2448 std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; 2449 2450 return 0; 2451 } 2452 /* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2453 // vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s 2454