1 /*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
11 *
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
16 *
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 *
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
34 *
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
37 */
38 #include "gmxpre.h"
39
40 #include "wallcycle.h"
41
42 #include "config.h"
43
44 #include <cstdlib>
45
46 #include <array>
47 #include <vector>
48
49 #include "gromacs/math/functions.h"
50 #include "gromacs/mdtypes/commrec.h"
51 #include "gromacs/timing/cyclecounter.h"
52 #include "gromacs/timing/gpu_timing.h"
53 #include "gromacs/timing/wallcyclereporting.h"
54 #include "gromacs/utility/cstringutil.h"
55 #include "gromacs/utility/gmxassert.h"
56 #include "gromacs/utility/gmxmpi.h"
57 #include "gromacs/utility/logger.h"
58 #include "gromacs/utility/smalloc.h"
59 #include "gromacs/utility/snprintf.h"
60
61 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
62
63 #ifndef DEBUG_WCYCLE
64 /*! \brief Enables consistency checking for the counters.
65 *
66 * If the macro is set to 1, code checks if you stop a counter different from the last
67 * one that was opened and if you do nest too deep.
68 */
69 # define DEBUG_WCYCLE 0
70 #endif
71 //! Whether wallcycle debugging is enabled
72 constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
73 //! True if only the master rank should print debugging output
74 constexpr bool gmx_unused onlyMasterDebugPrints = true;
75 //! True if cycle counter nesting depth debuggin prints are enabled
76 constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
77
78 #if DEBUG_WCYCLE
79 # include "gromacs/utility/fatalerror.h"
80 #endif
81
82 typedef struct
83 {
84 int n;
85 gmx_cycles_t c;
86 gmx_cycles_t start;
87 } wallcc_t;
88
89 struct gmx_wallcycle
90 {
91 wallcc_t* wcc;
92 /* did we detect one or more invalid cycle counts */
93 gmx_bool haveInvalidCount;
94 /* variables for testing/debugging */
95 gmx_bool wc_barrier;
96 wallcc_t* wcc_all;
97 int wc_depth;
98 #if DEBUG_WCYCLE
99 # define DEPTH_MAX 6
100 int counterlist[DEPTH_MAX];
101 int count_depth;
102 bool isMasterRank;
103 #endif
104 int ewc_prev;
105 gmx_cycles_t cycle_prev;
106 int64_t reset_counters;
107 #if GMX_MPI
108 MPI_Comm mpi_comm_mygroup;
109 #endif
110 wallcc_t* wcsc;
111 };
112
113 /* Each name should not exceed 19 printing characters
114 (ie. terminating null can be twentieth) */
115 static const char* wcn[ewcNR] = { "Run",
116 "Step",
117 "PP during PME",
118 "Domain decomp.",
119 "DD comm. load",
120 "DD comm. bounds",
121 "Vsite constr.",
122 "Send X to PME",
123 "Neighbor search",
124 "Launch GPU ops.",
125 "Comm. coord.",
126 "Force",
127 "Wait + Comm. F",
128 "PME mesh",
129 "PME redist. X/F",
130 "PME spread",
131 "PME gather",
132 "PME 3D-FFT",
133 "PME 3D-FFT Comm.",
134 "PME solve LJ",
135 "PME solve Elec",
136 "PME wait for PP",
137 "Wait + Recv. PME F",
138 "Wait PME GPU spread",
139 "PME 3D-FFT",
140 "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
141 "Wait PME GPU gather",
142 "Wait Bonded GPU",
143 "Reduce GPU PME F",
144 "Wait GPU NB nonloc.",
145 "Wait GPU NB local",
146 "Wait GPU state copy",
147 "NB X/F buffer ops.",
148 "Vsite spread",
149 "COM pull force",
150 "AWH",
151 "Write traj.",
152 "Update",
153 "Constraints",
154 "Comm. energies",
155 "Enforced rotation",
156 "Add rot. forces",
157 "Position swapping",
158 "IMD",
159 "Test" };
160
161 static const char* wcsn[ewcsNR] = {
162 "DD redist.",
163 "DD NS grid + sort",
164 "DD setup comm.",
165 "DD make top.",
166 "DD make constr.",
167 "DD top. other",
168 "DD GPU ops.",
169 "NS grid local",
170 "NS grid non-loc.",
171 "NS search local",
172 "NS search non-loc.",
173 "Bonded F",
174 "Bonded-FEP F",
175 "Restraints F",
176 "Listed buffer ops.",
177 "Nonbonded pruning",
178 "Nonbonded F kernel",
179 "Nonbonded F clear",
180 "Nonbonded FEP",
181 "Launch NB GPU tasks",
182 "Launch Bonded GPU tasks",
183 "Launch PME GPU tasks",
184 "Launch state copy",
185 "Ewald F correction",
186 "NB X buffer ops.",
187 "NB F buffer ops.",
188 "Clear force buffer",
189 "Launch GPU NB X buffer ops.",
190 "Launch GPU NB F buffer ops.",
191 "Launch GPU Comm. coord.",
192 "Launch GPU Comm. force.",
193 "Launch GPU update",
194 "Test subcounter",
195 };
196
197 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
198 static const char* PMEStageNames[] = {
199 "PME spline", "PME spread", "PME spline + spread", "PME 3D-FFT r2c",
200 "PME solve", "PME 3D-FFT c2r", "PME gather",
201 };
202
wallcycle_have_counter()203 gmx_bool wallcycle_have_counter()
204 {
205 return gmx_cycles_have_counter();
206 }
207
wallcycle_init(FILE * fplog,int resetstep,t_commrec gmx_unused * cr)208 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
209 {
210 gmx_wallcycle_t wc;
211
212
213 if (!wallcycle_have_counter())
214 {
215 return nullptr;
216 }
217
218 snew(wc, 1);
219
220 wc->haveInvalidCount = FALSE;
221 wc->wc_barrier = FALSE;
222 wc->wcc_all = nullptr;
223 wc->wc_depth = 0;
224 wc->ewc_prev = -1;
225 wc->reset_counters = resetstep;
226
227
228 #if GMX_MPI
229 if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
230 {
231 if (fplog)
232 {
233 fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
234 }
235 wc->wc_barrier = TRUE;
236 wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
237 }
238 #endif
239
240 snew(wc->wcc, ewcNR);
241 if (getenv("GMX_CYCLE_ALL") != nullptr)
242 {
243 if (fplog)
244 {
245 fprintf(fplog, "\nWill time all the code during the run\n\n");
246 }
247 snew(wc->wcc_all, ewcNR * ewcNR);
248 }
249
250 if (useCycleSubcounters)
251 {
252 snew(wc->wcsc, ewcsNR);
253 }
254
255 #if DEBUG_WCYCLE
256 wc->count_depth = 0;
257 wc->isMasterRank = MASTER(cr);
258 #endif
259
260 return wc;
261 }
262
wallcycle_destroy(gmx_wallcycle_t wc)263 void wallcycle_destroy(gmx_wallcycle_t wc)
264 {
265 if (wc == nullptr)
266 {
267 return;
268 }
269
270 if (wc->wcc != nullptr)
271 {
272 sfree(wc->wcc);
273 }
274 if (wc->wcc_all != nullptr)
275 {
276 sfree(wc->wcc_all);
277 }
278 if (wc->wcsc != nullptr)
279 {
280 sfree(wc->wcsc);
281 }
282 sfree(wc);
283 }
284
wallcycle_all_start(gmx_wallcycle_t wc,int ewc,gmx_cycles_t cycle)285 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
286 {
287 wc->ewc_prev = ewc;
288 wc->cycle_prev = cycle;
289 }
290
wallcycle_all_stop(gmx_wallcycle_t wc,int ewc,gmx_cycles_t cycle)291 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
292 {
293 wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
294 wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
295 }
296
297
298 #if DEBUG_WCYCLE
debug_start_check(gmx_wallcycle_t wc,int ewc)299 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
300 {
301 if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
302 {
303 gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
304 }
305 wc->counterlist[wc->count_depth] = ewc;
306 wc->count_depth++;
307
308 if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
309 {
310 std::string indentStr(4 * wc->count_depth, ' ');
311 fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
312 }
313 }
314
debug_stop_check(gmx_wallcycle_t wc,int ewc)315 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
316 {
317 if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
318 {
319 std::string indentStr(4 * wc->count_depth, ' ');
320 fprintf(stderr, "%swcycle_stop depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
321 }
322
323 wc->count_depth--;
324
325 if (wc->count_depth < 0)
326 {
327 gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc],
328 wc->count_depth);
329 }
330 if (wc->counterlist[wc->count_depth] != ewc)
331 {
332 gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
333 wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
334 }
335 }
336 #endif
337
wallcycle_start(gmx_wallcycle_t wc,int ewc)338 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
339 {
340 gmx_cycles_t cycle;
341
342 if (wc == nullptr)
343 {
344 return;
345 }
346
347 #if GMX_MPI
348 if (wc->wc_barrier)
349 {
350 MPI_Barrier(wc->mpi_comm_mygroup);
351 }
352 #endif
353
354 #if DEBUG_WCYCLE
355 debug_start_check(wc, ewc);
356 #endif
357
358 cycle = gmx_cycles_read();
359 wc->wcc[ewc].start = cycle;
360 if (wc->wcc_all != nullptr)
361 {
362 wc->wc_depth++;
363 if (ewc == ewcRUN)
364 {
365 wallcycle_all_start(wc, ewc, cycle);
366 }
367 else if (wc->wc_depth == 3)
368 {
369 wallcycle_all_stop(wc, ewc, cycle);
370 }
371 }
372 }
373
wallcycle_increment_event_count(gmx_wallcycle_t wc,int ewc)374 void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
375 {
376 if (wc == nullptr)
377 {
378 return;
379 }
380 wc->wcc[ewc].n++;
381 }
382
wallcycle_start_nocount(gmx_wallcycle_t wc,int ewc)383 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
384 {
385 if (wc == nullptr)
386 {
387 return;
388 }
389
390 wallcycle_start(wc, ewc);
391 wc->wcc[ewc].n--;
392 }
393
wallcycle_stop(gmx_wallcycle_t wc,int ewc)394 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
395 {
396 gmx_cycles_t cycle, last;
397
398 if (wc == nullptr)
399 {
400 return 0;
401 }
402
403 #if GMX_MPI
404 if (wc->wc_barrier)
405 {
406 MPI_Barrier(wc->mpi_comm_mygroup);
407 }
408 #endif
409
410 #if DEBUG_WCYCLE
411 debug_stop_check(wc, ewc);
412 #endif
413
414 /* When processes or threads migrate between cores, the cycle counting
415 * can get messed up if the cycle counter on different cores are not
416 * synchronized. When this happens we expect both large negative and
417 * positive cycle differences. We can detect negative cycle differences.
418 * Detecting too large positive counts if difficult, since count can be
419 * large, especially for ewcRUN. If we detect a negative count,
420 * we will not print the cycle accounting table.
421 */
422 cycle = gmx_cycles_read();
423 if (cycle >= wc->wcc[ewc].start)
424 {
425 last = cycle - wc->wcc[ewc].start;
426 }
427 else
428 {
429 last = 0;
430 wc->haveInvalidCount = TRUE;
431 }
432 wc->wcc[ewc].c += last;
433 wc->wcc[ewc].n++;
434 if (wc->wcc_all)
435 {
436 wc->wc_depth--;
437 if (ewc == ewcRUN)
438 {
439 wallcycle_all_stop(wc, ewc, cycle);
440 }
441 else if (wc->wc_depth == 2)
442 {
443 wallcycle_all_start(wc, ewc, cycle);
444 }
445 }
446
447 return last;
448 }
449
wallcycle_get(gmx_wallcycle_t wc,int ewc,int * n,double * c)450 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
451 {
452 *n = wc->wcc[ewc].n;
453 *c = static_cast<double>(wc->wcc[ewc].c);
454 }
455
wallcycle_reset_all(gmx_wallcycle_t wc)456 void wallcycle_reset_all(gmx_wallcycle_t wc)
457 {
458 int i;
459
460 if (wc == nullptr)
461 {
462 return;
463 }
464
465 for (i = 0; i < ewcNR; i++)
466 {
467 wc->wcc[i].n = 0;
468 wc->wcc[i].c = 0;
469 }
470 wc->haveInvalidCount = FALSE;
471
472 if (wc->wcc_all)
473 {
474 for (i = 0; i < ewcNR * ewcNR; i++)
475 {
476 wc->wcc_all[i].n = 0;
477 wc->wcc_all[i].c = 0;
478 }
479 }
480 if (wc->wcsc)
481 {
482 for (i = 0; i < ewcsNR; i++)
483 {
484 wc->wcsc[i].n = 0;
485 wc->wcsc[i].c = 0;
486 }
487 }
488 }
489
is_pme_counter(int ewc)490 static gmx_bool is_pme_counter(int ewc)
491 {
492 return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
493 }
494
is_pme_subcounter(int ewc)495 static gmx_bool is_pme_subcounter(int ewc)
496 {
497 return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
498 }
499
500 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
subtract_cycles(wallcc_t * wcc,int ewc_main,int ewc_sub)501 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
502 {
503 if (wcc[ewc_sub].n > 0)
504 {
505 if (wcc[ewc_main].c >= wcc[ewc_sub].c)
506 {
507 wcc[ewc_main].c -= wcc[ewc_sub].c;
508 }
509 else
510 {
511 /* Something is wrong with the cycle counting */
512 wcc[ewc_main].c = 0;
513 }
514 }
515 }
516
wallcycle_scale_by_num_threads(gmx_wallcycle_t wc,bool isPmeRank,int nthreads_pp,int nthreads_pme)517 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
518 {
519 if (wc == nullptr)
520 {
521 return;
522 }
523
524 for (int i = 0; i < ewcNR; i++)
525 {
526 if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
527 {
528 wc->wcc[i].c *= nthreads_pme;
529
530 if (wc->wcc_all)
531 {
532 for (int j = 0; j < ewcNR; j++)
533 {
534 wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
535 }
536 }
537 }
538 else
539 {
540 wc->wcc[i].c *= nthreads_pp;
541
542 if (wc->wcc_all)
543 {
544 for (int j = 0; j < ewcNR; j++)
545 {
546 wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
547 }
548 }
549 }
550 }
551 if (useCycleSubcounters && wc->wcsc && !isPmeRank)
552 {
553 for (int i = 0; i < ewcsNR; i++)
554 {
555 wc->wcsc[i].c *= nthreads_pp;
556 }
557 }
558 }
559
560 /* TODO Make an object for this function to return, containing some
561 * vectors of something like wallcc_t for the summed wcc, wcc_all and
562 * wcsc, AND the original wcc for rank 0.
563 *
564 * The GPU timing is reported only for rank 0, so we want to preserve
565 * the original wcycle on that rank. Rank 0 also reports the global
566 * counts before that, so needs something to contain the global data
567 * without over-writing the rank-0 data. The current implementation
568 * uses cycles_sum to manage this, which works OK now because wcsc and
569 * wcc_all are unused by the GPU reporting, but it is not satisfactory
570 * for the future. Also, there's no need for MPI_Allreduce, since
571 * only MASTERRANK uses any of the results. */
wallcycle_sum(const t_commrec * cr,gmx_wallcycle_t wc)572 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
573 {
574 WallcycleCounts cycles_sum;
575 wallcc_t* wcc;
576 double cycles[int(ewcNR) + int(ewcsNR)];
577 #if GMX_MPI
578 double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
579 #endif
580 int i;
581 int nsum;
582
583 if (wc == nullptr)
584 {
585 /* Default construction of std::array of non-class T can leave
586 the values indeterminate, just like a C array, and icc
587 warns about it. */
588 cycles_sum.fill(0);
589 return cycles_sum;
590 }
591
592 wcc = wc->wcc;
593
594 subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
595 subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
596
597 subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
598
599 if (cr->npmenodes == 0)
600 {
601 /* All nodes do PME (or no PME at all) */
602 subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
603 }
604 else
605 {
606 /* The are PME-only nodes */
607 if (wcc[ewcPMEMESH].n > 0)
608 {
609 /* This must be a PME only node, calculate the Wait + Comm. time */
610 GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
611 "Total run ticks must be greater than PME-only ticks");
612 wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
613 }
614 }
615
616 /* Store the cycles in a double buffer for summing */
617 for (i = 0; i < ewcNR; i++)
618 {
619 #if GMX_MPI
620 cycles_n[i] = static_cast<double>(wcc[i].n);
621 #endif
622 cycles[i] = static_cast<double>(wcc[i].c);
623 }
624 nsum = ewcNR;
625 if (wc->wcsc)
626 {
627 for (i = 0; i < ewcsNR; i++)
628 {
629 #if GMX_MPI
630 cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
631 #endif
632 cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
633 }
634 nsum += ewcsNR;
635 }
636
637 #if GMX_MPI
638 if (cr->nnodes > 1)
639 {
640 double buf[int(ewcNR) + int(ewcsNR) + 1];
641
642 // TODO this code is used only at the end of the run, so we
643 // can just do a simple reduce of haveInvalidCount in
644 // wallcycle_print, and avoid bugs
645 cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
646 // TODO Use MPI_Reduce
647 MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
648 for (i = 0; i < ewcNR; i++)
649 {
650 wcc[i].n = gmx::roundToInt(buf[i]);
651 }
652 wc->haveInvalidCount = (buf[nsum] > 0);
653 if (wc->wcsc)
654 {
655 for (i = 0; i < ewcsNR; i++)
656 {
657 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
658 }
659 }
660
661 // TODO Use MPI_Reduce
662 MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
663
664 if (wc->wcc_all != nullptr)
665 {
666 double *buf_all, *cyc_all;
667
668 snew(cyc_all, ewcNR * ewcNR);
669 snew(buf_all, ewcNR * ewcNR);
670 for (i = 0; i < ewcNR * ewcNR; i++)
671 {
672 cyc_all[i] = wc->wcc_all[i].c;
673 }
674 // TODO Use MPI_Reduce
675 MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
676 for (i = 0; i < ewcNR * ewcNR; i++)
677 {
678 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
679 }
680 sfree(buf_all);
681 sfree(cyc_all);
682 }
683 }
684 else
685 #endif
686 {
687 for (i = 0; i < nsum; i++)
688 {
689 cycles_sum[i] = cycles[i];
690 }
691 }
692
693 return cycles_sum;
694 }
695
696 static void
print_cycles(FILE * fplog,double c2t,const char * name,int nnodes,int nthreads,int ncalls,double c_sum,double tot)697 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
698 {
699 char nnodes_str[STRLEN];
700 char nthreads_str[STRLEN];
701 char ncalls_str[STRLEN];
702 double wallt;
703 double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
704
705 if (c_sum > 0)
706 {
707 if (ncalls > 0)
708 {
709 snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
710 if (nnodes < 0)
711 {
712 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
713 }
714 else
715 {
716 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
717 }
718 if (nthreads < 0)
719 {
720 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
721 }
722 else
723 {
724 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
725 }
726 }
727 else
728 {
729 nnodes_str[0] = 0;
730 nthreads_str[0] = 0;
731 ncalls_str[0] = 0;
732 }
733 /* Convert the cycle count to wallclock time for this task */
734 wallt = c_sum * c2t;
735
736 fprintf(fplog, " %-19.19s %4s %4s %10s %10.3f %14.3f %5.1f\n", name, nnodes_str,
737 nthreads_str, ncalls_str, wallt, c_sum * 1e-9, percentage);
738 }
739 }
740
print_gputimes(FILE * fplog,const char * name,int n,double t,double tot_t)741 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
742 {
743 char num[11];
744 char avg_perf[11];
745
746 if (n > 0)
747 {
748 snprintf(num, sizeof(num), "%10d", n);
749 snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
750 }
751 else
752 {
753 sprintf(num, " ");
754 sprintf(avg_perf, " ");
755 }
756 if (t != tot_t && tot_t > 0)
757 {
758 fprintf(fplog, " %-29s %10s%12.3f %s %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
759 }
760 else
761 {
762 fprintf(fplog, " %-29s %10s%12.3f %s %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
763 }
764 }
765
print_header(FILE * fplog,int nrank_pp,int nth_pp,int nrank_pme,int nth_pme)766 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
767 {
768 int nrank_tot = nrank_pp + nrank_pme;
769 if (0 == nrank_pme)
770 {
771 fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
772 if (nth_pp > 1)
773 {
774 fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
775 }
776 /* Don't report doing PP+PME, because we can't tell here if
777 * this is RF, etc. */
778 }
779 else
780 {
781 fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
782 if (nth_pp > 1)
783 {
784 fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
785 }
786 fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
787 if (nth_pme > 1)
788 {
789 fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
790 }
791 }
792
793 fprintf(fplog, "\n\n");
794 fprintf(fplog, " Computing: Num Num Call Wall time Giga-Cycles\n");
795 fprintf(fplog, " Ranks Threads Count (s) total sum %%\n");
796 }
797
798
wallcycle_print(FILE * fplog,const gmx::MDLogger & mdlog,int nnodes,int npme,int nth_pp,int nth_pme,double realtime,gmx_wallcycle_t wc,const WallcycleCounts & cyc_sum,const gmx_wallclock_gpu_nbnxn_t * gpu_nbnxn_t,const gmx_wallclock_gpu_pme_t * gpu_pme_t)799 void wallcycle_print(FILE* fplog,
800 const gmx::MDLogger& mdlog,
801 int nnodes,
802 int npme,
803 int nth_pp,
804 int nth_pme,
805 double realtime,
806 gmx_wallcycle_t wc,
807 const WallcycleCounts& cyc_sum,
808 const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
809 const gmx_wallclock_gpu_pme_t* gpu_pme_t)
810 {
811 double tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
812 double c2t, c2t_pp, c2t_pme = 0;
813 int i, j, npp, nth_tot;
814 char buf[STRLEN];
815 const char* hline =
816 "-----------------------------------------------------------------------------";
817
818 if (wc == nullptr)
819 {
820 return;
821 }
822
823 GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
824 GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
825 GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
826 GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
827 npp = nnodes - npme;
828 /* npme is the number of PME-only ranks used, and we always do PP work */
829 GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
830
831 nth_tot = npp * nth_pp + npme * nth_pme;
832
833 /* When using PME-only nodes, the next line is valid for both
834 PP-only and PME-only nodes because they started ewcRUN at the
835 same time. */
836 tot = cyc_sum[ewcRUN];
837 tot_for_pp = 0;
838
839 if (tot <= 0.0)
840 {
841 /* TODO This is heavy handed, but until someone reworks the
842 code so that it is provably robust with respect to
843 non-positive values for all possible timer and cycle
844 counters, there is less value gained from printing whatever
845 timing data might still be sensible for some non-Jenkins
846 run, than is lost from diagnosing Jenkins FP exceptions on
847 runs about whose execution time we don't care. */
848 GMX_LOG(mdlog.warning)
849 .asParagraph()
850 .appendTextFormatted(
851 "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
852 "time accounting",
853 tot);
854 return;
855 }
856
857 if (wc->haveInvalidCount)
858 {
859 GMX_LOG(mdlog.warning)
860 .asParagraph()
861 .appendText(
862 "NOTE: Detected invalid cycle counts, probably because threads moved "
863 "between CPU cores that do not have synchronized cycle counters. Will not "
864 "print the cycle accounting.");
865 return;
866 }
867
868
869 /* Conversion factor from cycles to seconds */
870 c2t = realtime / tot;
871 c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
872 if (npme > 0)
873 {
874 c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
875 }
876 else
877 {
878 c2t_pme = 0;
879 }
880
881 fprintf(fplog, "\n R E A L C Y C L E A N D T I M E A C C O U N T I N G\n\n");
882
883 print_header(fplog, npp, nth_pp, npme, nth_pme);
884
885 fprintf(fplog, "%s\n", hline);
886 for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
887 {
888 if (is_pme_subcounter(i))
889 {
890 /* Do not count these at all */
891 }
892 else if (npme > 0 && is_pme_counter(i))
893 {
894 /* Print timing information for PME-only nodes, but add an
895 * asterisk so the reader of the table can know that the
896 * walltimes are not meant to add up. The asterisk still
897 * fits in the required maximum of 19 characters. */
898 char buffer[STRLEN];
899 snprintf(buffer, STRLEN, "%s *", wcn[i]);
900 print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
901 }
902 else
903 {
904 /* Print timing information when it is for a PP or PP+PME
905 node */
906 print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
907 tot_for_pp += cyc_sum[i];
908 }
909 }
910 if (wc->wcc_all != nullptr)
911 {
912 for (i = 0; i < ewcNR; i++)
913 {
914 for (j = 0; j < ewcNR; j++)
915 {
916 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
917 print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i * ewcNR + j].n,
918 wc->wcc_all[i * ewcNR + j].c, tot);
919 }
920 }
921 }
922 tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
923 print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
924 fprintf(fplog, "%s\n", hline);
925 print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
926 fprintf(fplog, "%s\n", hline);
927
928 if (npme > 0)
929 {
930 fprintf(fplog,
931 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
932 " twice the total reported, but the cycle count total and %% are correct.\n"
933 "%s\n",
934 hline);
935 }
936
937 if (wc->wcc[ewcPMEMESH].n > 0)
938 {
939 // A workaround to not print breakdown when no subcounters were recorded.
940 // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
941 std::vector<int> validPmeSubcounterIndices;
942 for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
943 {
944 if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
945 {
946 validPmeSubcounterIndices.push_back(i);
947 }
948 }
949
950 if (!validPmeSubcounterIndices.empty())
951 {
952 fprintf(fplog, " Breakdown of PME mesh computation\n");
953 fprintf(fplog, "%s\n", hline);
954 for (auto i : validPmeSubcounterIndices)
955 {
956 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp,
957 nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
958 }
959 fprintf(fplog, "%s\n", hline);
960 }
961 }
962
963 if (useCycleSubcounters && wc->wcsc)
964 {
965 fprintf(fplog, " Breakdown of PP computation\n");
966 fprintf(fplog, "%s\n", hline);
967 for (i = 0; i < ewcsNR; i++)
968 {
969 print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
970 }
971 fprintf(fplog, "%s\n", hline);
972 }
973
974 /* print GPU timing summary */
975 double tot_gpu = 0.0;
976 if (gpu_pme_t)
977 {
978 for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
979 {
980 tot_gpu += gpu_pme_t->timing[k].t;
981 }
982 }
983 if (gpu_nbnxn_t)
984 {
985 const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
986 { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
987 tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
988
989 /* add up the kernel timings */
990 for (i = 0; i < 2; i++)
991 {
992 for (j = 0; j < 2; j++)
993 {
994 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
995 }
996 }
997 tot_gpu += gpu_nbnxn_t->pruneTime.t;
998
999 tot_cpu_overlap = wc->wcc[ewcFORCE].c;
1000 if (wc->wcc[ewcPMEMESH].n > 0)
1001 {
1002 tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
1003 }
1004 tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
1005
1006 fprintf(fplog, "\n GPU timings\n%s\n", hline);
1007 fprintf(fplog,
1008 " Computing: Count Wall t (s) ms/step %c\n", '%');
1009 fprintf(fplog, "%s\n", hline);
1010 print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
1011 print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
1012
1013 for (i = 0; i < 2; i++)
1014 {
1015 for (j = 0; j < 2; j++)
1016 {
1017 if (gpu_nbnxn_t->ktime[i][j].c)
1018 {
1019 print_gputimes(fplog, k_log_str[i][j], gpu_nbnxn_t->ktime[i][j].c,
1020 gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
1021 }
1022 }
1023 }
1024 if (gpu_pme_t)
1025 {
1026 for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
1027 {
1028 if (gpu_pme_t->timing[k].c)
1029 {
1030 print_gputimes(fplog, PMEStageNames[k], gpu_pme_t->timing[k].c,
1031 gpu_pme_t->timing[k].t, tot_gpu);
1032 }
1033 }
1034 }
1035 if (gpu_nbnxn_t->pruneTime.c)
1036 {
1037 print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c,
1038 gpu_nbnxn_t->pruneTime.t, tot_gpu);
1039 }
1040 print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
1041 fprintf(fplog, "%s\n", hline);
1042 print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
1043 fprintf(fplog, "%s\n", hline);
1044 if (gpu_nbnxn_t->dynamicPruneTime.c)
1045 {
1046 /* We print the dynamic pruning kernel timings after a separator
1047 * and avoid adding it to tot_gpu as this is not in the force
1048 * overlap. We print the fraction as relative to the rest.
1049 */
1050 print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c,
1051 gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
1052 fprintf(fplog, "%s\n", hline);
1053 }
1054 gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
1055 if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
1056 {
1057 fprintf(fplog,
1058 "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
1059 "%.3f\n",
1060 tot_gpu / gpu_nbnxn_t->nb_c, tot_cpu_overlap / wc->wcc[ewcFORCE].n, gpu_cpu_ratio);
1061 }
1062
1063 /* only print notes related to CPU-GPU load balance with PME */
1064 if (wc->wcc[ewcPMEMESH].n > 0)
1065 {
1066 fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1067
1068 /* print note if the imbalance is high with PME case in which
1069 * CPU-GPU load balancing is possible */
1070 if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1071 {
1072 /* Only the sim master calls this function, so always print to stderr */
1073 if (gpu_cpu_ratio < 0.8)
1074 {
1075 if (npp > 1)
1076 {
1077 /* The user could have used -notunepme,
1078 * but we currently can't check that here.
1079 */
1080 GMX_LOG(mdlog.warning)
1081 .asParagraph()
1082 .appendText(
1083 "NOTE: The CPU has >25% more load than the GPU. This "
1084 "imbalance wastes\n"
1085 " GPU resources. Maybe the domain decomposition "
1086 "limits the PME tuning.\n"
1087 " In that case, try setting the DD grid manually "
1088 "(-dd) or lowering -dds.");
1089 }
1090 else
1091 {
1092 /* We should not end up here, unless the box is
1093 * too small for increasing the cut-off for PME tuning.
1094 */
1095 GMX_LOG(mdlog.warning)
1096 .asParagraph()
1097 .appendText(
1098 "NOTE: The CPU has >25% more load than the GPU. This "
1099 "imbalance wastes\n"
1100 " GPU resources.");
1101 }
1102 }
1103 if (gpu_cpu_ratio > 1.25)
1104 {
1105 GMX_LOG(mdlog.warning)
1106 .asParagraph()
1107 .appendText(
1108 "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1109 "wastes\n"
1110 " CPU resources.");
1111 }
1112 }
1113 }
1114 }
1115
1116 if (wc->wc_barrier)
1117 {
1118 GMX_LOG(mdlog.warning)
1119 .asParagraph()
1120 .appendText(
1121 "MPI_Barrier was called before each cycle start/stop\n"
1122 "call, so timings are not those of real runs.");
1123 }
1124
1125 if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1126 {
1127 /* Only the sim master calls this function, so always print to stderr */
1128 if (wc->wcc[ewcDOMDEC].n == 0)
1129 {
1130 GMX_LOG(mdlog.warning)
1131 .asParagraph()
1132 .appendTextFormatted(
1133 "NOTE: %d %% of the run time was spent in pair search,\n"
1134 " you might want to increase nstlist (this has no effect on "
1135 "accuracy)\n",
1136 gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1137 }
1138 else
1139 {
1140 GMX_LOG(mdlog.warning)
1141 .asParagraph()
1142 .appendTextFormatted(
1143 "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1144 " %d %% of the run time was spent in pair search,\n"
1145 " you might want to increase nstlist (this has no effect on "
1146 "accuracy)\n",
1147 gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1148 gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1149 }
1150 }
1151
1152 if (cyc_sum[ewcMoveE] > tot * 0.05)
1153 {
1154 GMX_LOG(mdlog.warning)
1155 .asParagraph()
1156 .appendTextFormatted(
1157 "NOTE: %d %% of the run time was spent communicating energies,\n"
1158 " you might want to increase some nst* mdp options\n",
1159 gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1160 }
1161 }
1162
wcycle_get_reset_counters(gmx_wallcycle_t wc)1163 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1164 {
1165 if (wc == nullptr)
1166 {
1167 return -1;
1168 }
1169
1170 return wc->reset_counters;
1171 }
1172
wcycle_set_reset_counters(gmx_wallcycle_t wc,int64_t reset_counters)1173 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1174 {
1175 if (wc == nullptr)
1176 {
1177 return;
1178 }
1179
1180 wc->reset_counters = reset_counters;
1181 }
1182
wallcycle_sub_start(gmx_wallcycle_t wc,int ewcs)1183 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1184 {
1185 if (useCycleSubcounters && wc != nullptr)
1186 {
1187 wc->wcsc[ewcs].start = gmx_cycles_read();
1188 }
1189 }
1190
wallcycle_sub_start_nocount(gmx_wallcycle_t wc,int ewcs)1191 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1192 {
1193 if (useCycleSubcounters && wc != nullptr)
1194 {
1195 wallcycle_sub_start(wc, ewcs);
1196 wc->wcsc[ewcs].n--;
1197 }
1198 }
1199
wallcycle_sub_stop(gmx_wallcycle_t wc,int ewcs)1200 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1201 {
1202 if (useCycleSubcounters && wc != nullptr)
1203 {
1204 wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1205 wc->wcsc[ewcs].n++;
1206 }
1207 }
1208