1 #ifndef LINGEN_SUBSTEP_CHARACTERISTICS_HPP_ 2 #define LINGEN_SUBSTEP_CHARACTERISTICS_HPP_ 3 4 #include <iostream> 5 #include <memory> 6 #include <tuple> 7 8 #include "cxx_mpz.hpp" 9 #include "fmt/format.h" 10 #include "fmt/printf.h" 11 #include "subdivision.hpp" 12 #include "lingen_platform.hpp" 13 #include "lingen_substep_schedule.hpp" 14 #include "lingen_tuning_cache.hpp" 15 #include "lingen_matpoly_ft.hpp" 16 #include "lingen_mul_substeps.hpp" 17 #include "lingen_call_companion.hpp" 18 19 20 /* This file is an offspring from lingen_tuning.cpp - some of it is not 21 * really polished to the expected level of a usable interface. It 22 * happens to be also used by the binary 23 * time_matpoly_ft_parallel_${gfp_layer} 24 */ 25 26 template<typename OP> struct microbench_dft; 27 template<typename OP> struct microbench_ift; 28 template<typename OP> struct microbench_conv; 29 30 struct lingen_substep_characteristics { 31 typedef lingen_substep_characteristics ch_t; 32 typedef lingen_platform pc_t; 33 typedef lingen_substep_schedule sc_t; 34 typedef lingen_tuning_cache tc_t; 35 36 abdst_field ab; 37 cxx_mpz p; 38 gmp_randstate_t & rstate; 39 40 /* length of the input (E) for the call under consideration ; this is 41 * not the input length for the overall algorithm ! 42 * 43 * We only use it for printing. 44 * */ 45 size_t input_length; 46 47 op_mul_or_mp_base::op_type_t op_type; 48 49 /* We're talking products (or any other bilinear operation that uses 50 * transforms 1 by 1, e.g. MP) of matrices of size n0*n1 and n1*n2 51 */ 52 unsigned int n0; 53 unsigned int n1; 54 unsigned int n2; 55 56 /* length of the three operands. 57 * A has dimension n0*n1, length asize 58 * B has dimension n1*n2, length bsize 59 * C has dimension n1*n2, length csize 60 */ 61 size_t asize, bsize, csize; 62 63 public: 64 is_valid_schedulelingen_substep_characteristics65 bool is_valid_schedule(lingen_platform, unsigned int mesh, lingen_substep_schedule const & S) const 66 { 67 unsigned int shrink0 = S.shrink0; 68 unsigned int shrink2 = S.shrink2; 69 unsigned int r = mesh; 70 subdivision mpi_split0(n0, r); 71 subdivision mpi_split1(n1, r); 72 subdivision mpi_split2(n2, r); 73 subdivision shrink_split0(mpi_split0.block_size_upper_bound(), shrink0); 74 subdivision shrink_split2(mpi_split2.block_size_upper_bound(), shrink2); 75 unsigned int nrs0(shrink_split0.block_size_upper_bound()); 76 unsigned int nrs2(shrink_split2.block_size_upper_bound()); 77 const unsigned int nr1 = mpi_split1.block_size_upper_bound(); 78 unsigned int b0 = S.batch[0]; 79 unsigned int b1 = S.batch[1]; 80 unsigned int b2 = S.batch[2]; 81 subdivision loop1 = subdivision::by_block_size(nr1, b1); 82 for(unsigned int round = 0 ; round < shrink0 * shrink2 ; round++) { 83 unsigned round0 = round % shrink0; 84 unsigned round2 = round / shrink0; 85 unsigned int i0, i1; 86 unsigned int j0, j1; 87 std::tie(i0, i1) = shrink_split0.nth_block(round0); 88 std::tie(j0, j1) = shrink_split2.nth_block(round2); 89 ASSERT_ALWAYS((i1 - i0) <= nrs0); 90 ASSERT_ALWAYS((j1 - j0) <= nrs2); 91 subdivision loop0 = subdivision::by_block_size(i1 - i0, b0); 92 subdivision loop2 = subdivision::by_block_size(j1 - j0, b2); 93 94 if (loop0.nblocks() != 1 && loop2.nblocks() != 1) 95 return false; 96 bool process_blocks_row_major = b0 == nrs0; 97 98 for(unsigned int iloop1 = 0 ; iloop1 < loop1.nblocks() ; iloop1++) { 99 if (process_blocks_row_major) { 100 if (loop0.nblocks() != 1) return false; 101 if (nrs0 != b0) return false; 102 } else { 103 if (loop2.nblocks() != 1) return false; 104 if (nrs2 != b2) return false; 105 } 106 } 107 } 108 return true; 109 } 110 lingen_substep_characteristicslingen_substep_characteristics111 lingen_substep_characteristics(abdst_field ab, gmp_randstate_t & rstate, size_t input_length, op_mul_or_mp_base::op_type_t op_type, unsigned int n0, unsigned int n1, unsigned int n2, size_t asize, size_t bsize, size_t csize) :/*{{{*/ 112 ab(ab), 113 rstate(rstate), 114 input_length(input_length), 115 op_type(op_type), 116 n0(n0), n1(n1), n2(n2), 117 asize(asize), bsize(bsize), csize(csize) 118 { 119 #ifdef SELECT_MPFQ_LAYER_u64k1 120 mpz_set_ui(p, 2); 121 // op = OP(asize, bsize); 122 #else 123 mpz_set(p, abfield_characteristic_srcptr(ab)); 124 // op = OP(p, asize, bsize, n1); 125 #endif 126 // fft_alloc_sizes = op.fti.get_alloc_sizes(); 127 }/*}}}*/ 128 max_threadslingen_substep_characteristics129 static inline int max_threads() { 130 #ifdef HAVE_OPENMP 131 return omp_get_max_threads(); 132 #else 133 return 1; 134 #endif 135 } 136 137 private: has_cached_timelingen_substep_characteristics138 bool has_cached_time(tc_t const & C, lingen_substep_schedule::fft_type_t fft_type) const {/*{{{*/ 139 lingen_tuning_cache::mul_or_mp_key K { op_type, fft_type, mpz_sizeinbase(p, 2), asize, bsize }; 140 return C.has(K); 141 }/*}}}*/ 142 143 /* tl;dr: array of doubles is input, weighted_double is output. 144 * 145 * A parallelizable timing is created from an array of 146 * micro-measurements that tell how long (WCT) it takes to do k times 147 * the same operation in parallel, for k less than some bound 148 * (typically the number of threads, but it can be less if RAM 149 * allows only so many operations in parallel). 150 * 151 * We _use_ a parallelizable timing in the form of the parent 152 * weighted_double, essentially. This carries the info on the 153 * _amortized time_ (t) it takes to do some number (n) of operations. 154 * Originally t is initialized to what the array of concurrent 155 * timings says of what happens for just one thread. 156 * 157 * The gist of it is that we have the .parallelize() method. It 158 * changes the underlying weighted_double to reflect the amortized 159 * timing, again, but when some number of operations are done in 160 * parallel. 161 * 162 * This is in contrast with operator*(), which counts more operations 163 * to do but not in parallel. 164 */ 165 class parallelizable_timing : public weighted_double/*{{{*/ 166 { 167 std::vector<double> concurrent_timings; 168 bool counted_as_parallel = false; max_parallel() const169 unsigned int max_parallel() const { return concurrent_timings.size() - 1; } 170 private: get_concurrent_time(unsigned int k)171 double get_concurrent_time(unsigned int k) { 172 /* How long does it take (WCT) to do k operations in parallel ? */ 173 /* Ideally, this should be almost constant as long as k is 174 * less than the number of threads, and then it should reach 175 * linear behaviour (but we're restricting to small k 176 * anyway). 177 * In practice, "constant" isn't really constant because 178 * there's some overhead. After all, all cores and 179 * hyperthreads are competing for cache, for example. So we 180 * allow a linear regression. 181 */ 182 ASSERT_ALWAYS(k > 0); 183 ASSERT_ALWAYS(k <= max_parallel()); 184 unsigned int kl=1, kh=1; 185 double tl=t, th=t; 186 for(unsigned int i = 1 ; i <= k ; i++) { 187 if (concurrent_timings[i] < 0) continue; 188 kl = i; 189 tl = concurrent_timings[i]; 190 } 191 for(unsigned int i = max_parallel() ; i >= k ; i--) { 192 if (concurrent_timings[i] < 0) continue; 193 kh = i; 194 th = concurrent_timings[i]; 195 } 196 if (kl == kh) return tl; 197 return tl + (th-tl)*(k-kl)/(kh-kl); 198 } 199 public: parallelizable_timing(std::vector<double> const & c)200 parallelizable_timing(std::vector<double> const & c) : concurrent_timings(c) { 201 // ASSERT_ALWAYS(c.size() == (size_t) max_threads() + 1); 202 this->n = 1; 203 this->t = concurrent_timings[1]; 204 } 205 /* This ctor is only for something that we will _never_ 206 * parallelize, so that we have no use for an array of 207 * concurrent_timings. 208 * In other cases, we insist on providing the 209 * concurrent_timings array, via the other ctor. 210 */ parallelizable_timing(double d)211 parallelizable_timing(double d) 212 : weighted_double { 1, d } 213 , concurrent_timings { 0, d } 214 { 215 } 216 parallelizable_timing()217 parallelizable_timing() : weighted_double { 0, 0 } {} 218 operator *=(unsigned int x)219 parallelizable_timing& operator*=(unsigned int x) { 220 n *= x; 221 return *this; 222 } operator *(unsigned int x) const223 parallelizable_timing operator*(unsigned int x) const { 224 parallelizable_timing X = *this; 225 X *= x; 226 return X; 227 } parallelize(unsigned int k,unsigned int T)228 parallelizable_timing& parallelize(unsigned int k, unsigned int T) { 229 ASSERT_ALWAYS(!counted_as_parallel); 230 counted_as_parallel = true; 231 /* We suppose that on input, *this reprensents n operations 232 * that sequentially take time t, and that this time 233 * represents something single-threaded. This function 234 * replaces the inner operation by the fact of doing k times 235 * this same thing, in parallel. The max level of expected 236 * parallelism is given by T. The field t is replaced then 237 * by the _average time_ that these operation take (that is, 238 * 1/k times the cumulated time it takes to do k operations). 239 * If perfect parallelism is achieved (all k operations done 240 * up to T times in parallel, with linear scaling), this 241 * effectively does 242 * this->n *= k 243 * this->t *= iceildiv(k, T) / (double) k; 244 * However, even with T <= max_threads(), this 245 * process is likely to give somewhat different results 246 * because not everything achieves perfect parallelism at the 247 * CPU level. 248 * 249 * Note that T should really be max_threads(). 250 */ 251 T = std::min(T, max_parallel()); 252 unsigned int qk = k / T; 253 unsigned int rk = k - qk * T; 254 /* get_concurrent_time returns the wall-clock time needed to 255 * do T dfts in parallel 256 */ 257 double tt = qk * get_concurrent_time(T); 258 if (rk) tt += get_concurrent_time(rk); 259 n *= k; 260 t = tt / k; 261 return *this; 262 } operator double() const263 operator double() const { return n * t; } operator +(parallelizable_timing const & b) const264 double operator+(parallelizable_timing const & b) const { return *this + (double) b; } operator +(double y) const265 double operator+(double y) const { return (double) *this + y; } 266 };/*}}}*/ 267 268 269 /* We can measure timings in one of three manners here. 270 * - "cache-hot" do repeatedly the same thing, on identical data. 271 * - "cache-cold", quick: do the calculation only once (malloc, 272 * compute, free). 273 * - "cache-cold", thorough: do the calculation on matrices of 274 * different inputs. 275 * 276 * None is a true winner. "cache-hot" is easily optimistic, and both 277 * "cache-cold" options are somewhat pessimistic. 278 */ 279 280 // #define CALL_MEMBER_FN(object,ptrToMember) ((object).*(ptrToMember)) 281 282 public: 283 // typedef double (lingen_substep_characteristics::*raw_timer_member)(unsigned int, unsigned int, unsigned int) const; 284 285 private: 286 template<typename T> complete_tveclingen_substep_characteristics287 void complete_tvec(std::ostream& os, std::vector<double> & tvec, T & F, unsigned int kl, unsigned int kh) const/*{{{*/ 288 { 289 /* The ideal expected behaviour is that this should be constant. 290 * But it's not, since flint itself uses openmp. We're doing 291 * better, since we're coarser grain. But still, as timing goes, 292 * this means that we should be prepared to have a roughly linear 293 * curve. 294 */ 295 if (kh <= kl + 1) return; 296 double tl = tvec[kl]; 297 double th = tvec[kh]; 298 if (th - tl <= 0.1 * tl) return; 299 unsigned int k = (kl + kh) / 2; 300 double tk = F(k); 301 os << fmt::format(FMT_STRING(" {}:{:.2g}"), k, tk); 302 os.flush(); 303 tvec[k] = tk; 304 double linfit_tk = tl + (th-tl)*(k-kl)/(kh-kl); 305 if ((tk - linfit_tk) <= 0.1*linfit_tk) return; 306 complete_tvec(os, tvec, F, kl, k); 307 complete_tvec(os, tvec, F, k, kh); 308 }/*}}}*/ 309 310 public: 311 template<typename T> fill_tveclingen_substep_characteristics312 std::vector<double> fill_tvec(std::ostream& os, T const & F) const {/*{{{*/ 313 /* can't make my mind as to whether I should just output that to 314 * the terminal, or stow it in a string first... 315 */ 316 317 constexpr const char * Fname = T::name; 318 319 unsigned int TMAX = max_threads(); 320 321 std::vector<double> tvec(TMAX + 1, -1); 322 323 if (Fname == NULL) { 324 tvec[1] = tvec[TMAX] = 0; 325 return tvec; 326 } 327 328 os << fmt::sprintf("# %s%s;%s (@%d) wct for %s by nthreads:", 329 F.mesh > 1 ? "MPI-" : "", 330 F.op.op_name(), 331 lingen_substep_schedule::fft_name(encode_fft_type<typename T::OP::FFT>()), 332 input_length, 333 Fname 334 ); 335 336 if (F.max_parallel() < TMAX) { 337 TMAX = F.max_parallel(); 338 os << fmt::format(FMT_STRING(" [capped to {}]"), TMAX); 339 } 340 341 unsigned int kl = 1; 342 double tl = F(kl); 343 tvec[kl] = tl; 344 os << fmt::format(FMT_STRING(" {}:{:.2g}"), kl, tl); 345 os.flush(); 346 347 348 unsigned int kh = TMAX; 349 if (kh > 1) { 350 double th = F(kh); 351 tvec[kh] = th; 352 os << fmt::format(FMT_STRING(" {}:{:.2g}"), kh, th); 353 os.flush(); 354 } 355 356 complete_tvec(os, tvec, F, kl, kh); 357 358 os << std::endl; 359 return tvec; 360 }/*}}}*/ 361 362 private: 363 template<typename T> get_ft_time_from_cache_or_recomputelingen_substep_characteristics364 parallelizable_timing get_ft_time_from_cache_or_recompute(std::ostream& os, T const & F, tc_t::mul_or_mp_value::value_type & store) const 365 { 366 if (!store.empty()) { 367 /* what do we have in cache, to start with ? We need to know how 368 * many threads max were considered at the time when the tuning 369 * was made. 370 */ 371 unsigned int th_cache = 0; 372 for(auto const & x : store) { 373 if (x.first > th_cache) 374 th_cache = x.first; 375 } 376 /* How many threads do we support max on the current platform ? 377 * This is not necessarily equal. 378 */ 379 if (F.max_parallel() == th_cache) { 380 std::vector<double> tvec(th_cache + 1, -1); 381 for(auto const & x : store) { 382 tvec[x.first] = x.second; 383 } 384 return parallelizable_timing(tvec); 385 } 386 os << fmt::format(FMT_STRING("# ignoring cached entry, computed for up to {} threads (here {} max)\n"), th_cache, F.max_parallel()); 387 store.clear(); 388 } 389 390 std::vector<double> tvec = fill_tvec(os, F); 391 for(unsigned int i = 1 ; i < tvec.size() ; i++) { 392 if (tvec[i] >= 0) store.push_back( { i, tvec[i] } ); 393 } 394 return parallelizable_timing(tvec); 395 } 396 397 template<typename OP> get_ft_timeslingen_substep_characteristics398 std::array<parallelizable_timing, 4> get_ft_times(std::ostream& os, OP const & op, pc_t const& P, unsigned int mesh, lingen_substep_schedule const & S, tc_t & C) const {/*{{{*/ 399 lingen_tuning_cache::mul_or_mp_key K { op_type, S.fft_type, mpz_sizeinbase(p, 2), asize, bsize }; 400 typedef lingen_tuning_cache::mul_or_mp_value cache_value_type; 401 402 std::array<parallelizable_timing, 4> res; 403 cache_value_type cached_res; 404 405 if (C.has(K)) 406 cached_res = C[K]; 407 408 { 409 microbench_dft<OP> F(op, P, mesh, *this); 410 res[0] = get_ft_time_from_cache_or_recompute(os, F, cached_res[0]); 411 res[1] = res[0]; 412 cached_res[1] = cached_res[0]; 413 } 414 415 { 416 microbench_ift<OP> F(op, P, mesh, *this); 417 res[2] = get_ft_time_from_cache_or_recompute(os, F, cached_res[2]); 418 } 419 420 { 421 microbench_conv<OP> F(op, P, mesh, *this); 422 res[3] = get_ft_time_from_cache_or_recompute(os, F, cached_res[3]); 423 } 424 425 /* A priori this is either all-fresh or all-old. But nothing in 426 * the code checks that the situation is indeed the same for the 427 * four items. 428 */ 429 C[K] = cached_res; 430 431 return res; 432 }/*}}}*/ 433 434 public: 435 /* {{{ std::tuple<size_t, size_t, size_t> get_operand_ram() const { 436 return { 437 n0 * n1 * asize * mpz_size(p) * sizeof(mp_limb_t), 438 n1 * n2 * bsize * mpz_size(p) * sizeof(mp_limb_t), 439 n0 * n2 * csize * mpz_size(p) * sizeof(mp_limb_t) }; 440 } 441 *//*}}}*/ 442 #if 0 443 int mesh_inner_size(pc_t const & P) const {/*{{{*/ 444 return P.r; 445 }/*}}}*/ 446 #endif mpi_split0lingen_substep_characteristics447 subdivision mpi_split0(unsigned int mesh) const {/*{{{*/ 448 return subdivision(n0, mesh); 449 }/*}}}*/ mpi_split1lingen_substep_characteristics450 subdivision mpi_split1(unsigned int mesh) const {/*{{{*/ 451 return subdivision(n1, mesh); 452 }/*}}}*/ mpi_split2lingen_substep_characteristics453 subdivision mpi_split2(unsigned int mesh) const {/*{{{*/ 454 return subdivision(n2, mesh); 455 }/*}}}*/ shrink_split0lingen_substep_characteristics456 subdivision shrink_split0(unsigned int mesh, unsigned int shrink0) const {/*{{{*/ 457 unsigned int nr0 = mpi_split0(mesh).block_size_upper_bound(); 458 return subdivision(nr0, shrink0); 459 }/*}}}*/ shrink_split2lingen_substep_characteristics460 subdivision shrink_split2(unsigned int mesh, unsigned int shrink2) const {/*{{{*/ 461 unsigned int nr2 = mpi_split2(mesh).block_size_upper_bound(); 462 return subdivision(nr2, shrink2); 463 }/*}}}*/ shrink_split0lingen_substep_characteristics464 subdivision shrink_split0(unsigned int mesh, sc_t const & S) const {/*{{{*/ 465 return shrink_split0(mesh, S.shrink0); 466 }/*}}}*/ shrink_split2lingen_substep_characteristics467 subdivision shrink_split2(unsigned int mesh, sc_t const & S) const {/*{{{*/ 468 return shrink_split2(mesh, S.shrink2); 469 }/*}}}*/ get_peak_ram_multiplierslingen_substep_characteristics470 std::array<std::array<unsigned int, 3>, 2> get_peak_ram_multipliers(unsigned int mesh, sc_t const & S) const { /* {{{ */ 471 unsigned int nrs0 = shrink_split0(mesh, S).block_size_upper_bound(); 472 unsigned int nrs2 = shrink_split2(mesh, S).block_size_upper_bound(); 473 474 unsigned int b0 = S.batch[0]; 475 unsigned int b1 = S.batch[1]; 476 unsigned int b2 = S.batch[2]; 477 unsigned int mul0 = 0; 478 mul0 += b1 * mesh * b0; 479 mul0 += b1 * mesh * b2; 480 mul0 += nrs0*nrs2; 481 unsigned int mul1 = std::max(b0 * std::max(b1, b2), nrs0 * nrs2); 482 unsigned int mul12 = b0 * b2; 483 484 /* *IF* the pragma omp parallel statements go with an appropriate 485 * num_threads() clause, then yes, it makes sense to do this 486 * min(). Otherwise, the multipliers will be max_threads() in all 487 * cases. 488 */ 489 mul1 = std::min(mul1, (unsigned int) max_threads()); 490 mul12 = std::min(mul12, (unsigned int) max_threads()); 491 492 /* 493 size_t live_transforms = fft_alloc_sizes[0] * mul0; 494 size_t maxtemp_ft = std::min(nrs0 * nrs2, 495 (unsigned int) max_threads()) * fft_alloc_sizes[1]; 496 size_t maxtemp_addmul = std::min(S.batch[0] * S.batch[2], 497 (unsigned int) max_threads()) 498 * (fft_alloc_sizes[1] + fft_alloc_sizes[2]); 499 return live_transforms + std::max(maxtemp_ft, maxtemp_addmul); 500 */ 501 return {{ 502 // if (mul1 * op.get_alloc_sizes()[1] < mul12 * 503 // (op.get_alloc_sizes()[1] + op.get_alloc_sizes()[2])) then 504 // the largest amount of ram is with the first of the two 505 // options below. 506 {{ mul0, mul12, mul12 }}, 507 {{ mul0, mul1, 0 }} 508 }}; 509 } 510 /*}}}*/ get_peak_ramlingen_substep_characteristics511 size_t get_peak_ram(op_mul_or_mp_base const & op, unsigned int mesh, sc_t const & S) const { /* {{{ */ 512 auto multipliers = get_peak_ram_multipliers(mesh, S); 513 size_t rpeak = 0; 514 for(auto const & M : multipliers) { 515 size_t r = 0; 516 for(unsigned int i = 0 ; i < 3 ; i++) 517 r += M[i] * op.get_alloc_sizes()[i]; 518 if (r > rpeak) rpeak = r; 519 } 520 return rpeak; 521 } 522 /*}}}*/ get_peak_ramlingen_substep_characteristics523 size_t get_peak_ram(unsigned int mesh, sc_t const & S) const { /* {{{ */ 524 std::shared_ptr<op_mul_or_mp_base> op = instantiate(S.fft_type); 525 return get_peak_ram(*op, mesh, S); 526 } 527 /*}}}*/ 528 fft_type_validlingen_substep_characteristics529 bool fft_type_valid(lingen_substep_schedule::fft_type_t fft_type) const { 530 try { 531 std::shared_ptr<op_mul_or_mp_base> op = instantiate(fft_type); 532 #ifdef SELECT_MPFQ_LAYER_u64k1 533 if (op_type == op_mul_or_mp_base::OP_MUL) { 534 auto x = dynamic_cast<op_mul<gf2x_ternary_fft_info> const *>(op.get()); 535 if (x) 536 return x->fti.K != 0; 537 } 538 if (op_type == op_mul_or_mp_base::OP_MP) { 539 auto x = dynamic_cast<op_mp<gf2x_ternary_fft_info> const *>(op.get()); 540 if (x) 541 return x->fti.K != 0; 542 } 543 #endif 544 } catch (std::exception const &) { 545 return false; 546 } 547 return true; 548 } 549 550 551 private: 552 553 struct call_time_digest { 554 parallelizable_timing T_dft0; 555 parallelizable_timing T_dft2; 556 parallelizable_timing T_conv; 557 parallelizable_timing T_ift; 558 parallelizable_timing T_comm0; 559 parallelizable_timing T_comm2; 560 std::array<size_t, 3> fft_alloc_sizes; totallingen_substep_characteristics::call_time_digest561 double total() const { 562 double tt = 0; 563 tt += T_dft0; 564 tt += T_dft2; 565 tt += T_conv; 566 tt += T_ift; 567 tt += T_comm0; 568 tt += T_comm2; 569 return tt; 570 } 571 }; 572 573 template<typename OP> get_call_time_backendlingen_substep_characteristics574 call_time_digest get_call_time_backend( 575 std::ostream& os, 576 OP const & op, 577 pc_t const & P, 578 unsigned int mesh, 579 sc_t const & S, 580 tc_t & C, 581 bool do_timings) const 582 { /* {{{ */ 583 ASSERT_FOR_STATIC_ANALYZER(mesh != 0); 584 585 /* XXX Any change here must also be reflected in the mp_or_mul 586 * structure in lingen_matpoly_bigmatpoly_ft_common.hpp 587 */ 588 parallelizable_timing T_dft0 { -1 }; 589 parallelizable_timing T_dft2 { -1 }; 590 parallelizable_timing T_ift { -1 }; 591 parallelizable_timing T_conv { -1 }; 592 593 if (do_timings) { 594 auto ft = get_ft_times(os, op, P, mesh, S, C); 595 /* These are just base values, we'll multiply them later on */ 596 T_dft0 = ft[0]; 597 T_dft2 = ft[1]; 598 T_ift = ft[2]; 599 T_conv = ft[3]; 600 } 601 602 /* Each of the r*r nodes has local matrices size (at most) 603 * nr0*nr1, nr1*nr2, and nr0*nr2. For the actual computations, we 604 * care more about the shrunk submatrices, though */ 605 unsigned int nr1 = mpi_split1(mesh).block_size_upper_bound(); 606 607 /* The shrink parameters will divide the size of the local 608 * matrices we consider by numbers shrink0 and shrink2. This 609 * increases the time, and decreases the memory footprint */ 610 unsigned int nrs0 = shrink_split0(mesh, S).block_size_upper_bound(); 611 unsigned int nrs2 = shrink_split2(mesh, S).block_size_upper_bound(); 612 613 /* The 1-threaded timing will be obtained by setting T=batch=1. */ 614 615 /* Locally, we'll add together r matrices of size nrs0*nrs2, 616 * which will be computed as products of vectors of size 617 * batch[0]*batch[1] and batch[1]*batch[2]. (with wither 618 * batch[0]==nrs0 or batch[2]==nrs2). The operation will be 619 * repeated nrs0/batch[0] * nr1/batch[1] * nrs2/batch[2] times 620 * (not all multipliers apply to everyone). 621 */ 622 623 T_dft0.parallelize(S.batch[0] * S.batch[1], P.T); 624 T_dft0 *= iceildiv(nrs0, S.batch[0]); 625 T_dft0 *= iceildiv(nr1, S.batch[1]); 626 627 T_dft2.parallelize(S.batch[1] * S.batch[2], P.T); 628 T_dft2 *= iceildiv(nrs2, S.batch[2]); 629 T_dft2 *= iceildiv(nr1, S.batch[1]); 630 631 T_conv.parallelize(S.batch[0] * S.batch[2], P.T); 632 T_conv *= mesh; 633 T_conv *= iceildiv(nr1, S.batch[1]) * S.batch[1]; 634 T_conv *= iceildiv(nrs0, S.batch[0]); 635 T_conv *= iceildiv(nrs2, S.batch[2]); 636 637 T_ift.parallelize(nrs0 * nrs2, P.T); 638 639 /* Now the communication time _per call_ */ 640 /* TODO: maybe include some model of latency... */ 641 /* FIXME: This is wrong with the non-caching approach */ 642 parallelizable_timing T = op.get_alloc_sizes()[0] / P.mpi_xput; 643 T *= S.batch[1] * iceildiv(nr1, S.batch[1]); 644 /* allgather over n nodes */ 645 T *= mesh - 1; 646 parallelizable_timing T_comm0 = T; 647 parallelizable_timing T_comm2 = T; 648 T_comm0 *= S.batch[0] * iceildiv(nrs0, S.batch[0]); 649 T_comm2 *= S.batch[2] * iceildiv(nrs2, S.batch[2]); 650 651 call_time_digest ret; 652 653 ret.T_dft0 = T_dft0 *= S.shrink0 * S.shrink2; 654 ret.T_dft2 = T_dft2 *= S.shrink0 * S.shrink2; 655 ret.T_conv = T_conv *= S.shrink0 * S.shrink2; 656 ret.T_ift = T_ift *= S.shrink0 * S.shrink2; 657 ret.T_comm0 = T_comm0 *= S.shrink0 * S.shrink2; 658 ret.T_comm2 = T_comm2 *= S.shrink0 * S.shrink2; 659 ret.fft_alloc_sizes = op.get_alloc_sizes(); 660 661 /* This is for _one_ call only (one node in the tree). 662 * We must apply multiplier coefficients (typically 1<<depth) */ 663 return ret; 664 }/*}}}*/ 665 instantiatelingen_substep_characteristics666 std::shared_ptr<op_mul_or_mp_base> instantiate(lingen_substep_schedule::fft_type_t fft_type) const 667 { 668 /* must create the op type from the lingen_substep_schedule 669 * (a.k.a. sc_t) type */ 670 switch(op_type) { 671 case op_mul_or_mp_base::OP_MP: 672 switch(fft_type) { 673 case lingen_substep_schedule::FFT_NONE: 674 return std::make_shared<op_mp<void>>(p, asize, bsize, n1); 675 #ifndef SELECT_MPFQ_LAYER_u64k1 676 case lingen_substep_schedule::FFT_FLINT: 677 return std::make_shared<op_mp<fft_transform_info>>(p, asize, bsize, n1); 678 #else 679 case lingen_substep_schedule::FFT_CANTOR: 680 return std::make_shared<op_mp<gf2x_cantor_fft_info>>(p, asize, bsize, n1); 681 case lingen_substep_schedule::FFT_TERNARY: 682 return std::make_shared<op_mp<gf2x_ternary_fft_info>>(p, asize, bsize, n1); 683 #endif 684 default: 685 throw std::runtime_error("invalid data (fft_type)"); 686 } 687 case op_mul_or_mp_base::OP_MUL: 688 switch(fft_type) { 689 case lingen_substep_schedule::FFT_NONE: 690 return std::make_shared<op_mul<void>>(p, asize, bsize, n1); 691 #ifndef SELECT_MPFQ_LAYER_u64k1 692 case lingen_substep_schedule::FFT_FLINT: 693 return std::make_shared<op_mul<fft_transform_info>>(p, asize, bsize, n1); 694 #else 695 case lingen_substep_schedule::FFT_CANTOR: 696 return std::make_shared<op_mul<gf2x_cantor_fft_info>>(p, asize, bsize, n1); 697 case lingen_substep_schedule::FFT_TERNARY: 698 return std::make_shared<op_mul<gf2x_ternary_fft_info>>(p, asize, bsize, n1); 699 #endif 700 default: 701 throw std::runtime_error("invalid data (fft_type)"); 702 } 703 default: 704 throw std::runtime_error("invalid data (op)"); 705 } 706 } 707 get_call_time_backendlingen_substep_characteristics708 call_time_digest get_call_time_backend(std::ostream& os, pc_t const & P, unsigned int mesh, sc_t const & S, tc_t & C, bool do_timings) const { /* {{{ */ 709 switch(op_type) { 710 case op_mul_or_mp_base::OP_MP: 711 switch(S.fft_type) { 712 case lingen_substep_schedule::FFT_NONE: 713 return get_call_time_backend(os, op_mp<void>(p, asize, bsize, n1), P, mesh, S, C, do_timings); 714 #ifndef SELECT_MPFQ_LAYER_u64k1 715 case lingen_substep_schedule::FFT_FLINT: 716 return get_call_time_backend(os, op_mp<fft_transform_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings); 717 #else 718 case lingen_substep_schedule::FFT_CANTOR: 719 return get_call_time_backend(os, op_mp<gf2x_cantor_fft_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings); 720 case lingen_substep_schedule::FFT_TERNARY: 721 return get_call_time_backend(os, op_mp<gf2x_ternary_fft_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings); 722 #endif 723 default: 724 throw std::runtime_error("invalid data (fft_type)"); 725 } 726 case op_mul_or_mp_base::OP_MUL: 727 switch(S.fft_type) { 728 case lingen_substep_schedule::FFT_NONE: 729 return get_call_time_backend(os, op_mul<void>(p, asize, bsize, n1), P, mesh, S, C, do_timings); 730 #ifndef SELECT_MPFQ_LAYER_u64k1 731 case lingen_substep_schedule::FFT_FLINT: 732 return get_call_time_backend(os, op_mul<fft_transform_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings); 733 #else 734 case lingen_substep_schedule::FFT_CANTOR: 735 return get_call_time_backend(os, op_mul<gf2x_cantor_fft_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings); 736 case lingen_substep_schedule::FFT_TERNARY: 737 return get_call_time_backend(os, op_mul<gf2x_ternary_fft_info>(p, asize, bsize, n1), P, mesh, S, C, do_timings); 738 #endif 739 default: 740 throw std::runtime_error("invalid data (fft_type)"); 741 } 742 default: 743 throw std::runtime_error("invalid data (op)"); 744 } 745 746 }/*}}}*/ 747 748 public: 749 get_companionlingen_substep_characteristics750 lingen_call_companion::mul_or_mp_times get_companion(std::ostream& os, pc_t const & P, unsigned int mesh, sc_t const & S, tc_t & C, size_t reserved_ram, bool do_timings) const { /* {{{ */ 751 lingen_call_companion::mul_or_mp_times D { op_type }; 752 D.S = S; 753 auto A = get_call_time_backend(os, P, mesh, S, C, do_timings); 754 double tt = A.total(); 755 D.tt = { 1, tt }; 756 D.t_dft_A = A.T_dft0; 757 D.t_dft_B = A.T_dft2; 758 D.t_conv = A.T_conv; 759 D.t_ift_C = A.T_ift; 760 D.t_dft_A_comm = A.T_comm0; 761 D.t_dft_B_comm = A.T_comm2; 762 D.fft_alloc_sizes = A.fft_alloc_sizes; 763 D.peak_ram_multipliers = get_peak_ram_multipliers(mesh, S); 764 D.asize = asize; 765 D.bsize = bsize; 766 D.csize = csize; 767 D.reserved_ram = reserved_ram; 768 return D; 769 }/*}}}*/ get_call_timelingen_substep_characteristics770 double get_call_time(std::ostream& os, pc_t const & P, unsigned int mesh, sc_t const & S, tc_t & C, bool do_timings) const {/*{{{*/ 771 return get_call_time_backend(os, P, mesh, S, C, do_timings).total(); 772 }/*}}}*/ 773 774 private: 775 struct sortop { operator ()lingen_substep_characteristics::sortop776 bool operator()(std::tuple<double, unsigned int, lingen_substep_schedule> const & a, std::tuple<double, unsigned int, lingen_substep_schedule> const & b) const { 777 if (std::get<0>(a) != std::get<0>(b)) 778 return std::get<0>(a) < std::get<0>(b); 779 else 780 return std::get<1>(a) > std::get<1>(b); 781 } 782 }; 783 784 public: sort_scheduleslingen_substep_characteristics785 void sort_schedules( 786 std::ostream& os, 787 std::vector<lingen_substep_schedule>& schedules, 788 lingen_platform const & P, 789 unsigned int mesh, 790 lingen_tuning_cache & C, 791 bool do_timings) const 792 {/*{{{*/ 793 std::vector<std::tuple<double, unsigned int, lingen_substep_schedule>> precomp; 794 for(auto & S : schedules) { 795 /* This should ensure that all timings are obtained from cache */ 796 /* This may print timing info to the output stream */ 797 double t = do_timings ? get_call_time(os, P, mesh, S, C, do_timings) : 0; 798 unsigned int B = S.batch[1] * std::min(S.batch[0], S.batch[2]); 799 precomp.emplace_back(t, B, std::move(S)); 800 } 801 sort(precomp.begin(), precomp.end(), sortop()); 802 schedules.clear(); 803 for(auto & P : precomp) { 804 schedules.emplace_back(std::move(std::get<2>(P))); 805 } 806 }/*}}}*/ 807 get_and_report_call_timelingen_substep_characteristics808 double get_and_report_call_time(std::ostream& os, pc_t const & P, unsigned int mesh, sc_t const & S, tc_t & C, bool do_timings) const { /* {{{ */ 809 bool cached = has_cached_time(C, S.fft_type); 810 std::shared_ptr<op_mul_or_mp_base> op = instantiate(S.fft_type); 811 os << fmt::sprintf("# %s%s;%s (@%zu) [shrink=(%u,%u) batch=(%u,%u,%u)] %s, ", 812 mesh > 1 ? "MPI-" : "", 813 op_mul_or_mp_base::op_name(op_type), 814 S.fft_name(), 815 input_length, 816 S.shrink0, 817 S.shrink2, 818 S.batch[0], 819 S.batch[1], 820 S.batch[2], 821 size_disp(get_peak_ram(*op, mesh, S))); 822 os << std::flush; 823 double tt = get_call_time(os, P, mesh, S, C, do_timings); 824 if (do_timings) { 825 os << fmt::sprintf("%.2f%s\n", 826 tt, 827 cached ? " [from cache]" : ""); 828 } else { 829 os << "not timed\n"; 830 tt = -1; 831 } 832 return tt; 833 }/*}}}*/ report_op_winnerlingen_substep_characteristics834 void report_op_winner(std::ostream& os, unsigned int mesh, sc_t const & S) const 835 { 836 std::shared_ptr<op_mul_or_mp_base> op = instantiate(S.fft_type); 837 os << fmt::sprintf("# %s%s;%s wins : %s\n", 838 mesh > 1 ? "MPI-" : "", 839 op_mul_or_mp_base::op_name(op_type), 840 S.fft_name(), 841 op->explain()); 842 } 843 844 #if 0 845 void report_size_stats_human(std::ostream& os, sc_t const & S) const {/*{{{*/ 846 std::shared_ptr<op_mul_or_mp_base> op = instantiate(S.fft_type); 847 os << fmt::sprintf("# %s (per op): %s+%s+%s, transforms 3*%s\n", 848 step_name(), 849 size_disp(asize*mpz_size(p)*sizeof(mp_limb_t)), 850 size_disp(bsize*mpz_size(p)*sizeof(mp_limb_t)), 851 size_disp(csize*mpz_size(p)*sizeof(mp_limb_t)), 852 size_disp(op->get_alloc_sizes()[0])); 853 os << fmt::sprintf("# %s (total for %u*%u * %u*%u): %s, transforms %s\n", 854 step_name(), 855 n0,n1,n1,n2, 856 size_disp((n0*n1*asize+n1*n2*bsize+n0*n2*csize)*mpz_size(p)*sizeof(mp_limb_t)), 857 size_disp((n0*n1+n1*n2+n0*n2)*op->get_alloc_sizes()[0])); 858 }/*}}}*/ 859 #endif 860 }; 861 862 /* the three structs below return the WCT needed to do the 863 * operation named (name) doing it with (nparallel) threads working 864 * at the same time (at our level -- we don't preclude the use of 865 * more omp threads at the level below). The computation is repeated 866 * (n) times, and we work with allocated space that is (k) times 867 * bigger than what is necessary to accomodate the (nparallel) 868 * parallel calls. 869 */ 870 871 template<typename OP_T> 872 struct microbench_dft { /*{{{*/ 873 typedef lingen_substep_characteristics ch_t; 874 typedef lingen_platform pc_t; 875 typedef OP_T OP; 876 OP op; 877 pc_t P; 878 unsigned int mesh; 879 ch_t const & U; 880 static constexpr const char * name = "dft"; microbench_dftmicrobench_dft881 microbench_dft(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {} max_parallelmicrobench_dft882 unsigned int max_parallel() const { 883 size_t R = 0; 884 /* storage for one input coeff and one output transform */ 885 constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1; 886 R += iceildiv(U.asize, simd) * mpz_size(U.p) * sizeof(mp_limb_t); 887 R += op.get_alloc_sizes()[0]; 888 /* plus the temp memory for the dft operation */ 889 R += op.get_alloc_sizes()[1]; 890 size_t n = P.available_ram / R; 891 /* TODO: we probably want to ask P about max_threads. */ 892 n = std::min(n, (size_t) U.max_threads()); 893 if (n == 0) throw std::runtime_error("not enough RAM"); 894 return n; 895 } operator ()microbench_dft896 double operator()(unsigned int nparallel) const 897 { 898 matpoly a(U.ab, nparallel, 1, U.asize); 899 a.zero_pad(U.asize); 900 a.fill_random(0, U.asize, U.rstate); 901 matpoly_ft<typename OP::FFT> ta(a.m, a.n, op.fti); 902 double tt = -wct_seconds(); 903 matpoly_ft<typename OP::FFT>::dft(ta, a); 904 return tt + wct_seconds(); 905 } 906 };/*}}}*/ 907 template<typename OP_T> 908 struct microbench_ift { /*{{{*/ 909 typedef lingen_substep_characteristics ch_t; 910 typedef lingen_platform pc_t; 911 typedef OP_T OP; 912 OP op; 913 pc_t P; 914 unsigned int mesh; 915 ch_t const & U; 916 static constexpr const char * name = "ift"; microbench_iftmicrobench_ift917 microbench_ift(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {} max_parallelmicrobench_ift918 unsigned int max_parallel() const { 919 size_t R = 0; 920 /* storage for one input transform and one output coeff */ 921 constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1; 922 R += iceildiv(U.csize, simd) * mpz_size(U.p) * sizeof(mp_limb_t); 923 R += op.get_alloc_sizes()[0]; 924 /* plus the temp memory for the ift operation */ 925 R += op.get_alloc_sizes()[1]; 926 size_t n = P.available_ram / R; 927 /* TODO: we probably want to ask P about max_threads. */ 928 n = std::min(n, (size_t) U.max_threads()); 929 if (n == 0) throw std::runtime_error("not enough RAM"); 930 return n; 931 } operator ()microbench_ift932 double operator()(unsigned int nparallel) const { 933 matpoly c(U.ab, nparallel, 1, U.csize); 934 matpoly_ft<typename OP::FFT> tc(c.m, c.n, op.fti); 935 /* This is important, since otherwise the inverse transform won't 936 * work */ 937 c.set_size(U.csize); 938 tc.zero(); /* would be .fill_random(rstate) if we had it */ 939 double tt = -wct_seconds(); 940 matpoly_ft<typename OP::FFT>::ift(c, tc); 941 return tt + wct_seconds(); 942 } 943 };/*}}}*/ 944 template<typename OP_T> 945 struct microbench_conv { /*{{{*/ 946 typedef lingen_substep_characteristics ch_t; 947 typedef lingen_platform pc_t; 948 typedef OP_T OP; 949 OP op; 950 pc_t P; 951 unsigned int mesh; 952 ch_t const & U; 953 static constexpr const char * name = "conv"; microbench_convmicrobench_conv954 microbench_conv(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {} max_parallelmicrobench_conv955 unsigned int max_parallel() const { 956 /* for k = nparallel, we need 2k+1 transforms in ram */ 957 size_t R = 0; 958 /* storage for two input transform */ 959 R += 2 * op.get_alloc_sizes()[0]; 960 /* plus the temp memory for the addmul operation */ 961 R += op.get_alloc_sizes()[1]; 962 R += op.get_alloc_sizes()[2]; 963 /* take into account the +1 transform */ 964 size_t n = (P.available_ram - op.get_alloc_sizes()[0]) / R; 965 /* TODO: we probably want to ask P about max_threads. */ 966 n = std::min(n, (size_t) U.max_threads()); 967 if (n == 0) throw std::runtime_error("not enough RAM"); 968 return n; 969 } operator ()microbench_conv970 double operator()(unsigned int nparallel) const { 971 matpoly_ft<typename OP::FFT> tc( nparallel, 1, op.fti); 972 matpoly_ft<typename OP::FFT> tc0(nparallel, 1, op.fti); 973 matpoly_ft<typename OP::FFT> tc1(1, 1, op.fti); 974 tc0.zero(); /* would be .fill_random(rstate) if we had it */ 975 tc1.zero(); /* would be .fill_random(rstate) if we had it */ 976 double tt = -wct_seconds(); 977 matpoly_ft<typename OP::FFT>::addcompose(tc, tc0, tc1); 978 return tt + wct_seconds(); 979 } 980 };/*}}}*/ 981 template<template<typename> class META_OP> 982 struct microbench_dft<META_OP<void>> { /*{{{*/ 983 typedef lingen_substep_characteristics ch_t; 984 typedef lingen_platform pc_t; 985 typedef META_OP<void> OP; 986 OP op; 987 pc_t P; 988 unsigned int mesh; 989 ch_t const & U; 990 static constexpr const char * name = NULL; microbench_dftmicrobench_dft991 microbench_dft(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {} max_parallelmicrobench_dft992 unsigned int max_parallel() const { return U.max_threads(); } operator ()microbench_dft993 double operator()(unsigned int) const { return 0; } 994 };/*}}}*/ 995 template<template<typename> class META_OP> 996 struct microbench_ift<META_OP<void>> { /*{{{*/ 997 typedef lingen_substep_characteristics ch_t; 998 typedef lingen_platform pc_t; 999 typedef META_OP<void> OP; 1000 OP op; 1001 pc_t P; 1002 unsigned int mesh; 1003 ch_t const & U; 1004 static constexpr const char * name = NULL; microbench_iftmicrobench_ift1005 microbench_ift(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {} max_parallelmicrobench_ift1006 unsigned int max_parallel() const { return U.max_threads(); } operator ()microbench_ift1007 double operator()(unsigned int) const { return 0; } 1008 };/*}}}*/ 1009 template<template<typename> class META_OP> 1010 struct microbench_conv<META_OP<void>> { /*{{{*/ 1011 typedef lingen_substep_characteristics ch_t; 1012 typedef lingen_platform pc_t; 1013 typedef META_OP<void> OP; 1014 OP op; 1015 pc_t P; 1016 unsigned int mesh; 1017 ch_t const & U; 1018 static constexpr const char * name = "product"; microbench_convmicrobench_conv1019 microbench_conv(OP const & op, pc_t const& P, unsigned int mesh, ch_t const & U) : op(op), P(P), mesh(mesh), U(U) {} max_parallelmicrobench_conv1020 unsigned int max_parallel() const { return U.max_threads(); } operator ()microbench_conv1021 double operator()(unsigned int nparallel) const { 1022 constexpr const unsigned int splitwidth = matpoly::over_gf2 ? 64 : 1; 1023 matpoly a(U.ab, nparallel, splitwidth, U.asize); 1024 matpoly b(U.ab, splitwidth, 1, U.asize); 1025 a.zero_pad(U.asize); 1026 a.fill_random(0, U.asize, U.rstate); 1027 b.zero_pad(U.bsize); 1028 b.fill_random(0, U.bsize, U.rstate); 1029 matpoly c(U.ab, nparallel, 1, U.csize); 1030 c.zero_pad(U.csize); 1031 double tt = -wct_seconds(); 1032 OP::addcompose(c, a, b); 1033 return (tt + wct_seconds()) / splitwidth; 1034 } 1035 };/*}}}*/ 1036 1037 #endif /* LINGEN_SUBSTEP_CHARACTERISTICS_HPP_ */ 1038