/* -----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 1998-2005
 *
 * Statistics and timing-related functions.
 *
 * ---------------------------------------------------------------------------*/

#include "PosixSource.h"
#include "Rts.h"

#include "RtsFlags.h"
#include "RtsUtils.h"
#include "Schedule.h"
#include "Stats.h"
#include "Profiling.h"
#include "GetTime.h"
#include "sm/Storage.h"
#include "sm/GCThread.h"
#include "sm/BlockAlloc.h"

// for spin/yield counters
#include "sm/GC.h"
#include "ThreadPaused.h"
#include "Messages.h"

#include <string.h> // for memset

#if defined(THREADED_RTS)
// Protects all statistics below
Mutex stats_mutex;
#endif

static Time
    start_init_cpu, start_init_elapsed,
    end_init_cpu,   end_init_elapsed,
    start_exit_cpu, start_exit_elapsed,
    start_exit_gc_elapsed, start_exit_gc_cpu,
    end_exit_cpu,   end_exit_elapsed,
    start_nonmoving_gc_cpu, start_nonmoving_gc_elapsed,
    start_nonmoving_gc_sync_elapsed;

#if defined(PROFILING)
static Time RP_start_time  = 0, RP_tot_time  = 0;  // retainer prof user time
static Time RPe_start_time = 0, RPe_tot_time = 0;  // retainer prof elap time

static Time HC_start_time, HC_tot_time = 0;     // heap census prof user time
static Time HCe_start_time, HCe_tot_time = 0;   // heap census prof elap time
#endif

#if defined(PROF_SPIN)
volatile StgWord64 whitehole_lockClosure_spin = 0;
volatile StgWord64 whitehole_lockClosure_yield = 0;
volatile StgWord64 whitehole_threadPaused_spin = 0;
volatile StgWord64 whitehole_executeMessage_spin = 0;
#endif

//
// All the stats!
//
// This is where we accumulate all the stats during execution, and it's also
// in a convenient form that we can copy over to a caller of getRTSStats().
//
static RTSStats stats;
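// For illustration, a program can take a snapshot of this structure through
// the public API declared in RtsAPI.h. A minimal sketch (the fields are real
// RTSStats fields; the printf formatting is just an example):
//
//     RTSStats s;
//     getRTSStats(&s);
//     printf("%" PRIu32 " GCs, %" PRIu64 " bytes allocated\n",
//            s.gcs, s.allocated_bytes);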

static W_ GC_end_faults = 0;

static Time *GC_coll_cpu = NULL;
static Time *GC_coll_elapsed = NULL;
static Time *GC_coll_max_pause = NULL;

static void statsPrintf( char *s, ... ) GNUC3_ATTRIBUTE(format (PRINTF, 1, 2));
static void statsFlush( void );
static void statsClose( void );

/* -----------------------------------------------------------------------------
   Current elapsed time
   ------------------------------------------------------------------------- */

Time stat_getElapsedTime(void)
{
    return getProcessElapsedTime() - start_init_elapsed;
}

/* ---------------------------------------------------------------------------
   Measure the current MUT time, for profiling
   ------------------------------------------------------------------------ */

static double
mut_user_time_until( Time t )
{
    ACQUIRE_LOCK(&stats_mutex);
    double ret = TimeToSecondsDbl(t - stats.gc_cpu_ns - stats.nonmoving_gc_cpu_ns);
    RELEASE_LOCK(&stats_mutex);
    return ret;
    // heapCensus() time is included in GC_tot_cpu, so we don't need
    // to subtract it here.

    // TODO: This seems wrong to me. Surely we should be subtracting
    // (at least) start_init_cpu?
}

double
mut_user_time( void )
{
    Time cpu = getProcessCPUTime();
    return mut_user_time_until(cpu);
}

#if defined(PROFILING)
/*
  mut_user_time_during_RP() returns the MUT time during retainer profiling.
  The same applies to mut_user_time_during_HC().
 */
static double
mut_user_time_during_RP( void )
{
    return TimeToSecondsDbl(RP_start_time - stats.gc_cpu_ns - RP_tot_time);
}

#endif /* PROFILING */

/* ---------------------------------------------------------------------------
   initStats0() has no dependencies, it can be called right at the beginning
   ------------------------------------------------------------------------ */

void
initStats0(void)
{
#if defined(THREADED_RTS)
    initMutex(&stats_mutex);
#endif

    start_init_cpu     = 0;
    start_init_elapsed = 0;
    end_init_cpu       = 0;
    end_init_elapsed   = 0;

    start_nonmoving_gc_cpu = 0;
    start_nonmoving_gc_elapsed = 0;
    start_nonmoving_gc_sync_elapsed = 0;

    start_exit_cpu        = 0;
    start_exit_elapsed    = 0;
    start_exit_gc_cpu     = 0;
    start_exit_gc_elapsed = 0;
    end_exit_cpu          = 0;
    end_exit_elapsed      = 0;

#if defined(PROFILING)
    RP_start_time  = 0;
    RP_tot_time    = 0;
    RPe_start_time = 0;
    RPe_tot_time   = 0;

    HC_start_time  = 0;
    HC_tot_time    = 0;
    HCe_start_time = 0;
    HCe_tot_time   = 0;
#endif

    GC_end_faults = 0;

    stats = (RTSStats) {
        .gcs = 0,
        .major_gcs = 0,
        .allocated_bytes = 0,
        .max_live_bytes = 0,
        .max_large_objects_bytes = 0,
        .max_compact_bytes = 0,
        .max_slop_bytes = 0,
        .max_mem_in_use_bytes = 0,
        .cumulative_live_bytes = 0,
        .copied_bytes = 0,
        .par_copied_bytes = 0,
        .cumulative_par_max_copied_bytes = 0,
        .cumulative_par_balanced_copied_bytes = 0,
        .gc_spin_spin = 0,
        .gc_spin_yield = 0,
        .mut_spin_spin = 0,
        .mut_spin_yield = 0,
        .any_work = 0,
        .no_work = 0,
        .scav_find_work = 0,
        .init_cpu_ns = 0,
        .init_elapsed_ns = 0,
        .mutator_cpu_ns = 0,
        .mutator_elapsed_ns = 0,
        .gc_cpu_ns = 0,
        .gc_elapsed_ns = 0,
        .cpu_ns = 0,
        .elapsed_ns = 0,
        .nonmoving_gc_cpu_ns = 0,
        .nonmoving_gc_elapsed_ns = 0,
        .nonmoving_gc_max_elapsed_ns = 0,
        .nonmoving_gc_sync_elapsed_ns = 0,
        .nonmoving_gc_sync_max_elapsed_ns = 0,
        .gc = {
            .gen = 0,
            .threads = 0,
            .allocated_bytes = 0,
            .live_bytes = 0,
            .large_objects_bytes = 0,
            .compact_bytes = 0,
            .slop_bytes = 0,
            .mem_in_use_bytes = 0,
            .copied_bytes = 0,
            .par_max_copied_bytes = 0,
            .par_balanced_copied_bytes = 0,
            .sync_elapsed_ns = 0,
            .cpu_ns = 0,
            .elapsed_ns = 0,
            .nonmoving_gc_cpu_ns = 0,
            .nonmoving_gc_elapsed_ns = 0,
            .nonmoving_gc_sync_elapsed_ns = 0,
        }
    };
}

/* ---------------------------------------------------------------------------
   initStats1() can be called after setupRtsFlags()
   ------------------------------------------------------------------------ */

void initGenerationStats(void);

void
initStats1 (void)
{
    if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
        statsPrintf("    Alloc    Copied     Live     GC     GC      TOT      TOT  Page Flts\n");
        statsPrintf("    bytes     bytes     bytes   user   elap     user     elap\n");
    }
    GC_coll_cpu =
        (Time *)stgMallocBytes(
            sizeof(Time)*RtsFlags.GcFlags.generations,
            "initStats");
    GC_coll_elapsed =
        (Time *)stgMallocBytes(
            sizeof(Time)*RtsFlags.GcFlags.generations,
            "initStats");
    GC_coll_max_pause =
        (Time *)stgMallocBytes(
            sizeof(Time)*RtsFlags.GcFlags.generations,
            "initStats");
    initGenerationStats();
}

void
initGenerationStats(void)
{
    for (uint32_t i = 0; i < RtsFlags.GcFlags.generations; i++) {
        GC_coll_cpu[i] = 0;
        GC_coll_elapsed[i] = 0;
        GC_coll_max_pause[i] = 0;
    }
}

/* ---------------------------------------------------------------------------
   Reset stats of child process after fork()
   ------------------------------------------------------------------------ */

void resetChildProcessStats(void)
{
    initStats0();
    initGenerationStats();
}

/* -----------------------------------------------------------------------------
   Initialisation time...
   -------------------------------------------------------------------------- */

void
stat_startInit(void)
{
    getProcessTimes(&start_init_cpu, &start_init_elapsed);
}

void
stat_endInit(void)
{
    getProcessTimes(&end_init_cpu, &end_init_elapsed);
    stats.init_cpu_ns = end_init_cpu - start_init_cpu;
    stats.init_elapsed_ns = end_init_elapsed - start_init_elapsed;
}

/* -----------------------------------------------------------------------------
   stat_startExit and stat_endExit

   These two measure the time taken in shutdownHaskell().
   -------------------------------------------------------------------------- */

void
stat_startExit(void)
{
    ACQUIRE_LOCK(&stats_mutex);
    getProcessTimes(&start_exit_cpu, &start_exit_elapsed);
    start_exit_gc_elapsed = stats.gc_elapsed_ns;
    start_exit_gc_cpu = stats.gc_cpu_ns;
    RELEASE_LOCK(&stats_mutex);
}

void
stat_endExit(void)
{
    ACQUIRE_LOCK(&stats_mutex);
    getProcessTimes(&end_exit_cpu, &end_exit_elapsed);
    RELEASE_LOCK(&stats_mutex);
}

void
stat_startGCSync (gc_thread *gct)
{
    gct->gc_sync_start_elapsed = getProcessElapsedTime();
}

/* -----------------------------------------------------------------------------
   Nonmoving (concurrent) collector statistics

   These functions measure the time taken in the concurrent mark & sweep
   collector and in its final synchronisation phase.
   -------------------------------------------------------------------------- */

void
stat_startNonmovingGc (void)
{
    ACQUIRE_LOCK(&stats_mutex);
    start_nonmoving_gc_cpu = getCurrentThreadCPUTime();
    start_nonmoving_gc_elapsed = getProcessElapsedTime();
    RELEASE_LOCK(&stats_mutex);
}

void
stat_endNonmovingGc (void)
{
    Time cpu = getCurrentThreadCPUTime();
    Time elapsed = getProcessElapsedTime();

    ACQUIRE_LOCK(&stats_mutex);
    stats.gc.nonmoving_gc_elapsed_ns = elapsed - start_nonmoving_gc_elapsed;
    stats.nonmoving_gc_elapsed_ns += stats.gc.nonmoving_gc_elapsed_ns;

    stats.gc.nonmoving_gc_cpu_ns = cpu - start_nonmoving_gc_cpu;
    stats.nonmoving_gc_cpu_ns += stats.gc.nonmoving_gc_cpu_ns;

    stats.nonmoving_gc_max_elapsed_ns =
      stg_max(stats.gc.nonmoving_gc_elapsed_ns,
              stats.nonmoving_gc_max_elapsed_ns);
    RELEASE_LOCK(&stats_mutex);
}

void
stat_startNonmovingGcSync (void)
{
    ACQUIRE_LOCK(&stats_mutex);
    start_nonmoving_gc_sync_elapsed = getProcessElapsedTime();
    RELEASE_LOCK(&stats_mutex);
    traceConcSyncBegin();
}

void
stat_endNonmovingGcSync (void)
{
    Time end_elapsed = getProcessElapsedTime();
    ACQUIRE_LOCK(&stats_mutex);
    stats.gc.nonmoving_gc_sync_elapsed_ns = end_elapsed - start_nonmoving_gc_sync_elapsed;
    stats.nonmoving_gc_sync_elapsed_ns += stats.gc.nonmoving_gc_sync_elapsed_ns;
    stats.nonmoving_gc_sync_max_elapsed_ns =
      stg_max(stats.gc.nonmoving_gc_sync_elapsed_ns,
              stats.nonmoving_gc_sync_max_elapsed_ns);
    RELEASE_LOCK(&stats_mutex);
    traceConcSyncEnd();
}

/* -----------------------------------------------------------------------------
   Called at the beginning of each GC
   -------------------------------------------------------------------------- */

/*
 * Note [Time accounting]
 * ~~~~~~~~~~~~~~~~~~~~~~
 * In the "vanilla" configuration (using the standard copying GC) GHC keeps
 * track of a few different sinks of elapsed and CPU time:
 *
 *  - time spent synchronising to initiate garbage collection
 *  - garbage collection (per generation)
 *  - mutation
 *
 * When using the (concurrent) non-moving garbage collector (see Note
 * [Non-moving garbage collector]) we also track a few more sinks:
 *
 *  - minor GC
 *  - major GC (namely time spent in the preparatory phase)
 *  - concurrent mark
 *  - final synchronization (elapsed only)
 *  - mutation
 *
 * To keep track of these CPU times we rely on the system's per-thread CPU time
 * clock (exposed via the runtime's getCurrentThreadCPUTime utility).
 *
 * CPU time spent in the copying garbage collector is tracked in each GC
 * worker's gc_thread struct. At the beginning of scavenging each worker
 * records its OS thread's CPU time in its gc_thread (in stat_startGCWorker).
 * At the end of scavenging we again record the CPU time (in stat_endGCWorker).
 * The differences are then summed by the thread leading the GC at the end of
 * collection in stat_endGC. By contrast, the elapsed time is recorded only by
 * the leader.
 *
 * Mutator time is derived from the process's CPU time, subtracting out
 * contributions from stop-the-world and concurrent GCs.
 *
 * Time spent in concurrent marking is recorded by stat_{start,end}NonmovingGc.
 * Likewise, elapsed time spent in the final synchronization is recorded by
 * stat_{start,end}NonmovingGcSync.
 */
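
/* A condensed sketch of the per-worker CPU accounting described above (the
 * field and function names are the real ones used below; the control flow is
 * simplified for illustration):
 *
 *     // each worker, around scavenging:
 *     gct->gc_start_cpu = getCurrentThreadCPUTime();
 *     ... scavenge ...
 *     gct->gc_end_cpu   = getCurrentThreadCPUTime();
 *
 *     // the GC leader, in stat_endGC:
 *     for (i = 0; i < par_n_threads; i++)
 *         stats.gc.cpu_ns += gc_threads[i]->gc_end_cpu
 *                          - gc_threads[i]->gc_start_cpu;
 */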

void
stat_startGCWorker (Capability *cap STG_UNUSED, gc_thread *gct)
{
    bool stats_enabled =
        RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
        rtsConfig.gcDoneHook != NULL;

    if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
        gct->gc_start_cpu = getCurrentThreadCPUTime();
    }
}

void
stat_endGCWorker (Capability *cap STG_UNUSED, gc_thread *gct)
{
    bool stats_enabled =
        RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
        rtsConfig.gcDoneHook != NULL;

    if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
        gct->gc_end_cpu = getCurrentThreadCPUTime();
        ASSERT(gct->gc_end_cpu >= gct->gc_start_cpu);
    }
}

void
stat_startGC (Capability *cap, gc_thread *gct)
{
    if (RtsFlags.GcFlags.ringBell) {
        debugBelch("\007");
    }

    bool stats_enabled =
        RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
        rtsConfig.gcDoneHook != NULL;

    if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
        gct->gc_start_cpu = getCurrentThreadCPUTime();
    }

    gct->gc_start_elapsed = getProcessElapsedTime();

    // Post EVENT_GC_START with the same timestamp as used for stats
    // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
    // Here, as opposed to other places, the event is emitted on the cap
    // that initiates the GC and external tools expect it to have the same
    // timestamp as used in +RTS -s calculations.
    traceEventGcStartAtT(cap,
                         TimeToNS(gct->gc_start_elapsed - start_init_elapsed));

    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS)
    {
        gct->gc_start_faults = getPageFaults();
    }

    updateNurseriesStats();
}

/* -----------------------------------------------------------------------------
   Called at the end of each GC
   -------------------------------------------------------------------------- */

void
stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ slop,
            uint32_t gen, uint32_t par_n_threads, gc_thread **gc_threads,
            W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
            W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work,
            W_ scav_find_work)
{
    ACQUIRE_LOCK(&stats_mutex);

    // -------------------------------------------------
    // Collect all the stats about this GC in stats.gc. We always do this since
    // it's relatively cheap and we need allocated_bytes to catch heap
    // overflows.

    stats.gc.gen = gen;
    stats.gc.threads = par_n_threads;

    uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);

    // allocated since the last GC
    stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;

    stats.gc.live_bytes = live * sizeof(W_);
    stats.gc.large_objects_bytes = calcTotalLargeObjectsW() * sizeof(W_);
    stats.gc.compact_bytes = calcTotalCompactW() * sizeof(W_);
    stats.gc.slop_bytes = slop * sizeof(W_);
    stats.gc.mem_in_use_bytes = mblocks_allocated * MBLOCK_SIZE;
    stats.gc.copied_bytes = copied * sizeof(W_);
    stats.gc.par_max_copied_bytes = par_max_copied * sizeof(W_);
    stats.gc.par_balanced_copied_bytes = par_balanced_copied * sizeof(W_);

    bool stats_enabled =
        RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
        rtsConfig.gcDoneHook != NULL;

    if (stats_enabled
      || RtsFlags.ProfFlags.doHeapProfile) // heap profiling needs GC_tot_time
    {
        // We only update the times when stats are explicitly enabled since
        // getProcessTimes (which may require a system call) can be expensive
        // on some platforms.
        Time current_cpu, current_elapsed;
        getProcessTimes(&current_cpu, &current_elapsed);
        stats.cpu_ns = current_cpu - start_init_cpu;
        stats.elapsed_ns = current_elapsed - start_init_elapsed;

        stats.gc.sync_elapsed_ns =
            initiating_gct->gc_start_elapsed - initiating_gct->gc_sync_start_elapsed;
        stats.gc.elapsed_ns = current_elapsed - initiating_gct->gc_start_elapsed;
        stats.gc.cpu_ns = 0;
        // see Note [n_gc_threads]
        // par_n_threads is set to n_gc_threads at the single callsite of this
        // function
        if (par_n_threads == 1) {
            ASSERT(initiating_gct->gc_end_cpu >= initiating_gct->gc_start_cpu);
            stats.gc.cpu_ns += initiating_gct->gc_end_cpu - initiating_gct->gc_start_cpu;
        } else {
            for (unsigned int i=0; i < par_n_threads; i++) {
                gc_thread *gct = gc_threads[i];
                ASSERT(gct->gc_end_cpu >= gct->gc_start_cpu);
                stats.gc.cpu_ns += gct->gc_end_cpu - gct->gc_start_cpu;
            }
        }
    }
    // -------------------------------------------------
    // Update the cumulative stats

    stats.gcs++;
    stats.allocated_bytes = tot_alloc_bytes;
    stats.max_mem_in_use_bytes = peak_mblocks_allocated * MBLOCK_SIZE;

    GC_coll_cpu[gen] += stats.gc.cpu_ns;
    GC_coll_elapsed[gen] += stats.gc.elapsed_ns;
    if (GC_coll_max_pause[gen] < stats.gc.elapsed_ns) {
        GC_coll_max_pause[gen] = stats.gc.elapsed_ns;
    }

    stats.copied_bytes += stats.gc.copied_bytes;
    if (par_n_threads > 1) {
        stats.par_copied_bytes += stats.gc.copied_bytes;
        stats.cumulative_par_max_copied_bytes +=
            stats.gc.par_max_copied_bytes;
        stats.cumulative_par_balanced_copied_bytes +=
            stats.gc.par_balanced_copied_bytes;
        stats.any_work += any_work;
        stats.no_work += no_work;
        stats.scav_find_work += scav_find_work;
        stats.gc_spin_spin += gc_spin_spin;
        stats.gc_spin_yield += gc_spin_yield;
        stats.mut_spin_spin += mut_spin_spin;
        stats.mut_spin_yield += mut_spin_yield;
    }
    stats.gc_cpu_ns += stats.gc.cpu_ns;
    stats.gc_elapsed_ns += stats.gc.elapsed_ns;

    if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
        stats.major_gcs++;
        if (stats.gc.live_bytes > stats.max_live_bytes) {
            stats.max_live_bytes = stats.gc.live_bytes;
        }
        if (stats.gc.large_objects_bytes > stats.max_large_objects_bytes) {
            stats.max_large_objects_bytes = stats.gc.large_objects_bytes;
        }
        if (stats.gc.compact_bytes > stats.max_compact_bytes) {
            stats.max_compact_bytes = stats.gc.compact_bytes;
        }
        if (stats.gc.slop_bytes > stats.max_slop_bytes) {
            stats.max_slop_bytes = stats.gc.slop_bytes;
        }
        stats.cumulative_live_bytes += stats.gc.live_bytes;
    }

    // -------------------------------------------------
    // Do the more expensive bits only when stats are enabled.

    if (stats_enabled)
    {
        // -------------------------------------------------
        // Emit events to the event log

        // Has to be emitted while all caps stopped for GC, but before GC_END.
        // See trac.haskell.org/ThreadScope/wiki/RTSsummaryEvents
        // for a detailed design rationale of the current setup
        // of GC eventlog events.
        traceEventGcGlobalSync(cap);

        // Emitted before GC_END on all caps, which simplifies tools code.
        traceEventGcStats(cap,
                          CAPSET_HEAP_DEFAULT,
                          stats.gc.gen,
                          stats.gc.copied_bytes,
                          stats.gc.slop_bytes,
                          /* current loss due to fragmentation */
                          (mblocks_allocated * BLOCKS_PER_MBLOCK
                           - n_alloc_blocks) * BLOCK_SIZE,
                          par_n_threads,
                          stats.gc.par_max_copied_bytes,
                          stats.gc.copied_bytes,
                          stats.gc.par_balanced_copied_bytes);

        // Post EVENT_GC_END with the same timestamp as used for stats
        // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
        // Here, as opposed to other places, the event is emitted on the cap
        // that initiates the GC and external tools expect it to have the same
        // timestamp as used in +RTS -s calculations.
        traceEventGcEndAtT(cap, TimeToNS(stats.elapsed_ns));

        if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
            traceEventHeapLive(cap,
                               CAPSET_HEAP_DEFAULT,
                               stats.gc.live_bytes);
        }

        // -------------------------------------------------
        // Print GC stats to stdout or a file (+RTS -S/-s)

        if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) {
            W_ faults = getPageFaults();

            statsPrintf("%9" FMT_Word64 " %9" FMT_Word64 " %9" FMT_Word64,
                        stats.gc.allocated_bytes, stats.gc.copied_bytes,
                        stats.gc.live_bytes);

            statsPrintf(" %6.3f %6.3f %8.3f %8.3f %4"
                        FMT_Word " %4" FMT_Word "  (Gen: %2d)\n",
                    TimeToSecondsDbl(stats.gc.cpu_ns),
                    TimeToSecondsDbl(stats.gc.elapsed_ns),
                    TimeToSecondsDbl(stats.cpu_ns),
                    TimeToSecondsDbl(stats.elapsed_ns),
                    faults - initiating_gct->gc_start_faults,
                    initiating_gct->gc_start_faults - GC_end_faults,
                    gen);

            GC_end_faults = faults;
            statsFlush();
        }


        if (rtsConfig.gcDoneHook != NULL) {
            rtsConfig.gcDoneHook(&stats.gc);
        }

        traceEventHeapSize(cap,
                           CAPSET_HEAP_DEFAULT,
                           mblocks_allocated * MBLOCK_SIZE);
    }
    RELEASE_LOCK(&stats_mutex);
}

/* -----------------------------------------------------------------------------
   Called at the beginning of each Retainer Profiling
   -------------------------------------------------------------------------- */
#if defined(PROFILING)
void
stat_startRP(void)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    ACQUIRE_LOCK(&stats_mutex);
    RP_start_time = user;
    RPe_start_time = elapsed;
    RELEASE_LOCK(&stats_mutex);
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the end of each Retainer Profiling
   -------------------------------------------------------------------------- */

#if defined(PROFILING)
void
stat_endRP(
  uint32_t retainerGeneration,
  int maxStackSize,
  double averageNumVisit)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    ACQUIRE_LOCK(&stats_mutex);
    RP_tot_time += user - RP_start_time;
    RPe_tot_time += elapsed - RPe_start_time;
    double mut_time_during_RP = mut_user_time_during_RP();
    RELEASE_LOCK(&stats_mutex);

    fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
            retainerGeneration, mut_time_during_RP);
    fprintf(prof_file, "\tMax auxiliary stack size = %d\n", maxStackSize);
    fprintf(prof_file, "\tAverage number of visits per object = %f\n",
            averageNumVisit);
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the beginning of each heap census
   -------------------------------------------------------------------------- */
#if defined(PROFILING)
void
stat_startHeapCensus(void)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    ACQUIRE_LOCK(&stats_mutex);
    HC_start_time = user;
    HCe_start_time = elapsed;
    RELEASE_LOCK(&stats_mutex);
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the end of each heap census
   -------------------------------------------------------------------------- */
#if defined(PROFILING)
void
stat_endHeapCensus(void)
{
    Time user, elapsed;
    getProcessTimes( &user, &elapsed );

    ACQUIRE_LOCK(&stats_mutex);
    HC_tot_time += user - HC_start_time;
    HCe_tot_time += elapsed - HCe_start_time;
    RELEASE_LOCK(&stats_mutex);
}
#endif /* PROFILING */

/* -----------------------------------------------------------------------------
   Called at the end of execution

   NOTE: number of allocations is not entirely accurate: it doesn't
   take into account the few bytes at the end of the heap that
   were left unused when the heap-check failed.
   -------------------------------------------------------------------------- */

#if defined(DEBUG)
#define TICK_VAR_INI(arity) \
  StgInt SLOW_CALLS_##arity = 1; \
  StgInt RIGHT_ARITY_##arity = 1; \
  StgInt TAGGED_PTR_##arity = 0;

TICK_VAR_INI(1)
TICK_VAR_INI(2)

StgInt TOTAL_CALLS=1;
#endif
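
// For reference, TICK_VAR_INI(1) above expands (via token pasting) to:
//   StgInt SLOW_CALLS_1 = 1;
//   StgInt RIGHT_ARITY_1 = 1;
//   StgInt TAGGED_PTR_1 = 0;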

/* Report the value of a counter */
#define REPORT(counter) \
  { \
    showStgWord64(counter,temp,true/*commas*/); \
    statsPrintf("  (" #counter ")  : %s\n",temp); \
  }

/* Report the value of a counter as a percentage of another counter */
#define REPORT_PCT(counter,countertot) \
  statsPrintf("  (" #counter ") %% of (" #countertot ") : %.1f%%\n", \
              counter*100.0/countertot)

#define TICK_PRINT(arity) \
  REPORT(SLOW_CALLS_##arity); \
  REPORT_PCT(RIGHT_ARITY_##arity,SLOW_CALLS_##arity); \
  REPORT_PCT(TAGGED_PTR_##arity,RIGHT_ARITY_##arity); \
  REPORT(RIGHT_ARITY_##arity); \
  REPORT(TAGGED_PTR_##arity)

#define TICK_PRINT_TOT(arity) \
  statsPrintf("  (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \
              SLOW_CALLS_##arity * 100.0/TOTAL_CALLS)
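
// Example output of the macros above (illustrative value): with
// TOTAL_CALLS == 1234, REPORT(TOTAL_CALLS) prints
//   (TOTAL_CALLS)  : 1,234
// and TICK_PRINT_TOT(1) prints SLOW_CALLS_1 as a percentage of TOTAL_CALLS.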

/*
Note [RTS Stats Reporting]
==========================

There are currently three reporting functions:
  * report_summary:
      Responsible for producing '+RTS -s' output.
      Will report internal counters if the RTS flag --internal-counters is
      passed. See Note [Internal Counter Stats].
  * report_machine_readable:
      Responsible for producing '+RTS -t --machine-readable' output.
  * report_one_line:
      Responsible for producing '+RTS -t' output.

Stats are accumulated into the global variable 'stats' as the program runs,
then in 'stat_exitReport' we do the following:
  * Finalise 'stats'. This involves setting final running times and allocations
    that have not yet been accounted for.
  * Create an RTSSummaryStats. This contains all data for reports that is not
    included in stats (because it does not make sense before the program has
    completed) or in a global variable.
  * Call the appropriate report_* function, passing the newly constructed
    RTSSummaryStats.

To ensure that the data in the different reports is kept consistent, the
report_* functions should not do any calculation, excepting unit changes and
formatting. If you need to add a new calculated field, add it to
RTSSummaryStats.

*/
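
/* For illustration, report_machine_readable (below) emits a Haskell-readable
 * association list, one pair per line; the field names here are real, the
 * values invented:
 *
 *    [("bytes allocated", "2437548")
 *    ,("num_GCs", "3")
 *    ,("average_bytes_used", "98736")
 *    ...
 *    ]
 */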


static void init_RTSSummaryStats(RTSSummaryStats* sum)
{
    const size_t sizeof_gc_summary_stats =
      RtsFlags.GcFlags.generations * sizeof(GenerationSummaryStats);

    memset(sum, 0, sizeof(RTSSummaryStats));
    sum->gc_summary_stats =
      stgMallocBytes(sizeof_gc_summary_stats,
                     "alloc_RTSSummaryStats.gc_summary_stats");
    memset(sum->gc_summary_stats, 0, sizeof_gc_summary_stats);
}

static void free_RTSSummaryStats(RTSSummaryStats * sum)
{
    stgFree(sum->gc_summary_stats);
    sum->gc_summary_stats = NULL;
}

// Must hold stats_mutex.
static void report_summary(const RTSSummaryStats* sum)
{
    // We should do no calculation, other than unit changes and formatting, and
    // we should not use any data from outside of globals, sum and stats
    // here. See Note [RTS Stats Reporting]

    uint32_t g;
    char temp[512];
    showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
    statsPrintf("%16s bytes allocated in the heap\n", temp);

    showStgWord64(stats.copied_bytes, temp, true/*commas*/);
    statsPrintf("%16s bytes copied during GC\n", temp);

    if ( stats.major_gcs > 0 ) {
        showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
        statsPrintf("%16s bytes maximum residency (%" FMT_Word32
                    " sample(s))\n",
                    temp, stats.major_gcs);
    }

    showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
    statsPrintf("%16s bytes maximum slop\n", temp);

    statsPrintf("%16" FMT_Word64 " MiB total memory in use (%"
                FMT_Word64 " MiB lost due to fragmentation)\n\n",
                stats.max_mem_in_use_bytes  / (1024 * 1024),
                sum->fragmentation_bytes / (1024 * 1024));

    /* Print garbage collections in each gen */
    statsPrintf("                                     Tot time (elapsed)  Avg pause  Max pause\n");
    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
        const GenerationSummaryStats * gen_stats =
            &sum->gc_summary_stats[g];
        statsPrintf("  Gen %2d     %5d colls"
                    ", %5d par   %6.3fs  %6.3fs     %3.4fs    %3.4fs\n",
                    g, // REVIEWERS: this used to be gen->no; can it ever
                       // differ from g here?
                    gen_stats->collections,
                    gen_stats->par_collections,
                    TimeToSecondsDbl(gen_stats->cpu_ns),
                    TimeToSecondsDbl(gen_stats->elapsed_ns),
                    TimeToSecondsDbl(gen_stats->avg_pause_ns),
                    TimeToSecondsDbl(gen_stats->max_pause_ns));
    }
    if (RtsFlags.GcFlags.useNonmoving) {
        const int n_major_colls = sum->gc_summary_stats[RtsFlags.GcFlags.generations-1].collections;
        statsPrintf("  Gen  1     %5d syncs"
                    ",                      %6.3fs     %3.4fs    %3.4fs\n",
                    n_major_colls,
                    TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns),
                    TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns) / n_major_colls,
                    TimeToSecondsDbl(stats.nonmoving_gc_sync_max_elapsed_ns));
        statsPrintf("  Gen  1      concurrent"
                    ",             %6.3fs  %6.3fs     %3.4fs    %3.4fs\n",
                    TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns),
                    TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns),
                    TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns) / n_major_colls,
                    TimeToSecondsDbl(stats.nonmoving_gc_max_elapsed_ns));
    }

    statsPrintf("\n");

#if defined(THREADED_RTS)
    if (RtsFlags.ParFlags.parGcEnabled && sum->work_balance > 0) {
        // See Note [Work Balance]
        statsPrintf("  Parallel GC work balance: "
                    "%.2f%% (serial 0%%, perfect 100%%)\n\n",
                    sum->work_balance * 100);
    }

    statsPrintf("  TASKS: %d "
                "(%d bound, %d peak workers (%d total), using -N%d)\n\n",
                taskCount, sum->bound_task_count,
                peakWorkerCount, workerCount,
                n_capabilities);

    statsPrintf("  SPARKS: %" FMT_Word64
                " (%" FMT_Word " converted, %" FMT_Word " overflowed, %"
                FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
                sum->sparks_count,
                sum->sparks.converted, sum->sparks.overflowed,
                sum->sparks.dud, sum->sparks.gcd,
                sum->sparks.fizzled);
#endif

    statsPrintf("  INIT    time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(stats.init_cpu_ns),
                TimeToSecondsDbl(stats.init_elapsed_ns));

    statsPrintf("  MUT     time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(stats.mutator_cpu_ns),
                TimeToSecondsDbl(stats.mutator_elapsed_ns));
    statsPrintf("  GC      time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(stats.gc_cpu_ns),
                TimeToSecondsDbl(stats.gc_elapsed_ns));
    if (RtsFlags.GcFlags.useNonmoving) {
        statsPrintf(
                "  CONC GC time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns),
                TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns));
    }

#if defined(PROFILING)
    statsPrintf("  RP      time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(sum->rp_cpu_ns),
                TimeToSecondsDbl(sum->rp_elapsed_ns));
    statsPrintf("  PROF    time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(sum->hc_cpu_ns),
                TimeToSecondsDbl(sum->hc_elapsed_ns));
#endif
    statsPrintf("  EXIT    time  %7.3fs  (%7.3fs elapsed)\n",
                TimeToSecondsDbl(sum->exit_cpu_ns),
                TimeToSecondsDbl(sum->exit_elapsed_ns));
    statsPrintf("  Total   time  %7.3fs  (%7.3fs elapsed)\n\n",
                TimeToSecondsDbl(stats.cpu_ns),
                TimeToSecondsDbl(stats.elapsed_ns));
#if !defined(THREADED_RTS)
    statsPrintf("  %%GC     time     %5.1f%%  (%.1f%% elapsed)\n\n",
                sum->gc_cpu_percent * 100,
                sum->gc_elapsed_percent * 100);
#endif

    showStgWord64(sum->alloc_rate, temp, true/*commas*/);

    statsPrintf("  Alloc rate    %s bytes per MUT second\n\n", temp);

    statsPrintf("  Productivity %5.1f%% of total user, "
                "%.1f%% of total elapsed\n\n",
                sum->productivity_cpu_percent * 100,
                sum->productivity_elapsed_percent * 100);

    // See Note [Internal Counter Stats] for a description of the
    // following counters. If you add a counter here, please remember
    // to update the Note.
    if (RtsFlags.MiscFlags.internalCounters) {
#if defined(THREADED_RTS) && defined(PROF_SPIN)
        const int32_t col_width[] = {4, -30, 14, 14};
        statsPrintf("Internal Counters:\n");
        statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
                    , col_width[0], ""
                    , col_width[1], "SpinLock"
                    , col_width[2], "Spins"
                    , col_width[3], "Yields");
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "gc_alloc_block_sync"
                    , col_width[2], gc_alloc_block_sync.spin
                    , col_width[3], gc_alloc_block_sync.yield);
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "gc_spin"
                    , col_width[2], stats.gc_spin_spin
                    , col_width[3], stats.gc_spin_yield);
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "mut_spin"
                    , col_width[2], stats.mut_spin_spin
                    , col_width[3], stats.mut_spin_yield);
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
                    , col_width[0], ""
                    , col_width[1], "whitehole_gc"
                    , col_width[2], whitehole_gc_spin
                    , col_width[3], "n/a");
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
                    , col_width[0], ""
                    , col_width[1], "whitehole_threadPaused"
                    , col_width[2], whitehole_threadPaused_spin
                    , col_width[3], "n/a");
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
                    , col_width[0], ""
                    , col_width[1], "whitehole_executeMessage"
                    , col_width[2], whitehole_executeMessage_spin
                    , col_width[3], "n/a");
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "whitehole_lockClosure"
                    , col_width[2], whitehole_lockClosure_spin
                    , col_width[3], whitehole_lockClosure_yield);
        // waitForGcThreads isn't really spin-locking (see the function)
        // but these numbers still seem useful.
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "waitForGcThreads"
                    , col_width[2], waitForGcThreads_spin
                    , col_width[3], waitForGcThreads_yield);

        for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
            int prefix_length = 0;
            statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
                        col_width[0], "", g, &prefix_length);
            prefix_length -= col_width[0];
            int suffix_length = col_width[1] + prefix_length;
            suffix_length =
                  suffix_length > 0 ? col_width[1] : suffix_length;

            statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                        , suffix_length, "].sync"
                        , col_width[2], generations[g].sync.spin
                        , col_width[3], generations[g].sync.yield);
        }
        statsPrintf("\n");
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "any_work"
                    , col_width[2], stats.any_work);
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "no_work"
                    , col_width[2], stats.no_work);
        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
                    , col_width[0], ""
                    , col_width[1], "scav_find_work"
                    , col_width[2], stats.scav_find_work);
#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
        statsPrintf("Internal Counters require the RTS to be built "
                "with PROF_SPIN"); // PROF_SPIN is not #defined here
#else // THREADED_RTS
        statsPrintf("Internal Counters require the threaded RTS");
#endif
    }
}

static void report_machine_readable (const RTSSummaryStats * sum)
{
    // We should do no calculation, other than unit changes and formatting, and
    // we should not use any data from outside of globals, sum and stats
    // here. See Note [RTS Stats Reporting]
    uint32_t g;

#define MR_STAT(field_name,format,value) \
    statsPrintf(" ,(\"" field_name "\", \"%" format "\")\n", value)
#define MR_STAT_GEN(gen,field_name,format,value) \
    statsPrintf(" ,(\"gen_%" FMT_Word32 "_" field_name "\", \"%" \
      format "\")\n", g, value)

    // These first values are for backwards compatibility.
    // Some of these first fields are duplicated with more machine-readable
    // names, or to match the name in RtsStats.

    // We don't use the helper macro for the first field because its prefix
    // is different.
    statsPrintf(" [(\"%s\", \"%" FMT_Word64 "\")\n", "bytes allocated",
                stats.allocated_bytes);
    MR_STAT("num_GCs", FMT_Word32, stats.gcs);
    MR_STAT("average_bytes_used", FMT_Word64, sum->average_bytes_used);
    MR_STAT("max_bytes_used", FMT_Word64, stats.max_live_bytes);
    MR_STAT("num_byte_usage_samples", FMT_Word32, stats.major_gcs);
    MR_STAT("peak_megabytes_allocated", FMT_Word64,
      stats.max_mem_in_use_bytes / (1024 * 1024));

    MR_STAT("init_cpu_seconds", "f", TimeToSecondsDbl(stats.init_cpu_ns));
    MR_STAT("init_wall_seconds", "f", TimeToSecondsDbl(stats.init_elapsed_ns));
    MR_STAT("mut_cpu_seconds", "f", TimeToSecondsDbl(stats.mutator_cpu_ns));
    MR_STAT("mut_wall_seconds", "f",
            TimeToSecondsDbl(stats.mutator_elapsed_ns));
    MR_STAT("GC_cpu_seconds", "f", TimeToSecondsDbl(stats.gc_cpu_ns));
    MR_STAT("GC_wall_seconds", "f", TimeToSecondsDbl(stats.gc_elapsed_ns));

    // end backward compatibility

    // First, the rest of the times

    MR_STAT("exit_cpu_seconds", "f", TimeToSecondsDbl(sum->exit_cpu_ns));
    MR_STAT("exit_wall_seconds", "f", TimeToSecondsDbl(sum->exit_elapsed_ns));
#if defined(PROFILING)
    MR_STAT("rp_cpu_seconds", "f", TimeToSecondsDbl(sum->rp_cpu_ns));
    MR_STAT("rp_wall_seconds", "f", TimeToSecondsDbl(sum->rp_elapsed_ns));
    MR_STAT("hc_cpu_seconds", "f", TimeToSecondsDbl(sum->hc_cpu_ns));
    MR_STAT("hc_wall_seconds", "f", TimeToSecondsDbl(sum->hc_elapsed_ns));
#endif
    MR_STAT("total_cpu_seconds", "f", TimeToSecondsDbl(stats.cpu_ns));
    MR_STAT("total_wall_seconds", "f",
            TimeToSecondsDbl(stats.elapsed_ns));

    // next, the remainder of the fields of RTSStats, except internal counters

    // The first two are duplicates of those above, but have more machine-
    // readable names that match the field names in RTSStats.


    // gcs has been done as num_GCs above
    MR_STAT("major_gcs", FMT_Word32, stats.major_gcs);
    MR_STAT("allocated_bytes", FMT_Word64, stats.allocated_bytes);
    MR_STAT("max_live_bytes", FMT_Word64, stats.max_live_bytes);
    MR_STAT("max_large_objects_bytes", FMT_Word64,
            stats.max_large_objects_bytes);
    MR_STAT("max_compact_bytes", FMT_Word64, stats.max_compact_bytes);
    MR_STAT("max_slop_bytes", FMT_Word64, stats.max_slop_bytes);
    // This duplicates, except for unit, peak_megabytes_allocated above
    MR_STAT("max_mem_in_use_bytes", FMT_Word64, stats.max_mem_in_use_bytes);
    MR_STAT("cumulative_live_bytes", FMT_Word64, stats.cumulative_live_bytes);
    MR_STAT("copied_bytes", FMT_Word64, stats.copied_bytes);
    MR_STAT("par_copied_bytes", FMT_Word64, stats.par_copied_bytes);
    MR_STAT("cumulative_par_max_copied_bytes", FMT_Word64,
            stats.cumulative_par_max_copied_bytes);
    MR_STAT("cumulative_par_balanced_copied_bytes", FMT_Word64,
            stats.cumulative_par_balanced_copied_bytes);

    // next, the computed fields in RTSSummaryStats
#if !defined(THREADED_RTS)
    MR_STAT("gc_cpu_percent", "f", sum->gc_cpu_percent);
    MR_STAT("gc_wall_percent", "f", sum->gc_elapsed_percent);
#endif
    MR_STAT("fragmentation_bytes", FMT_Word64, sum->fragmentation_bytes);
    // average_bytes_used is done above
    MR_STAT("alloc_rate", FMT_Word64, sum->alloc_rate);
    MR_STAT("productivity_cpu_percent", "f", sum->productivity_cpu_percent);
    MR_STAT("productivity_wall_percent", "f",
            sum->productivity_elapsed_percent);

    // next, the THREADED_RTS fields in RTSSummaryStats

#if defined(THREADED_RTS)
    MR_STAT("bound_task_count", FMT_Word32, sum->bound_task_count);
    MR_STAT("sparks_count", FMT_Word64, sum->sparks_count);
    MR_STAT("sparks_converted", FMT_Word, sum->sparks.converted);
    MR_STAT("sparks_overflowed", FMT_Word, sum->sparks.overflowed);
    MR_STAT("sparks_dud", FMT_Word, sum->sparks.dud);
    MR_STAT("sparks_gcd", FMT_Word, sum->sparks.gcd);
    MR_STAT("sparks_fizzled", FMT_Word, sum->sparks.fizzled);
    MR_STAT("work_balance", "f", sum->work_balance);

    // next, globals (other than internal counters)
    MR_STAT("n_capabilities", FMT_Word32, n_capabilities);
    MR_STAT("task_count", FMT_Word32, taskCount);
    MR_STAT("peak_worker_count", FMT_Word32, peakWorkerCount);
    MR_STAT("worker_count", FMT_Word32, workerCount);

    // next, internal counters
#if defined(PROF_SPIN)
    MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
    MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
            gc_alloc_block_sync.yield);
    MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
    MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
    MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
    MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield);
    MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
    MR_STAT("waitForGcThreads_yield", FMT_Word64,
            waitForGcThreads_yield);
    MR_STAT("whitehole_gc_spin", FMT_Word64, whitehole_gc_spin);
    MR_STAT("whitehole_lockClosure_spin", FMT_Word64,
            whitehole_lockClosure_spin);
    MR_STAT("whitehole_lockClosure_yield", FMT_Word64,
            whitehole_lockClosure_yield);
    MR_STAT("whitehole_executeMessage_spin", FMT_Word64,
            whitehole_executeMessage_spin);
    MR_STAT("whitehole_threadPaused_spin", FMT_Word64,
            whitehole_threadPaused_spin);
    MR_STAT("any_work", FMT_Word64,
            stats.any_work);
    MR_STAT("no_work", FMT_Word64,
            stats.no_work);
    MR_STAT("scav_find_work", FMT_Word64,
            stats.scav_find_work);
#endif // PROF_SPIN
#endif // THREADED_RTS

    // finally, per-generation stats. Named as, for example for generation 0,
    // gen_0_collections
    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
        const GenerationSummaryStats* gc_sum = &sum->gc_summary_stats[g];
        MR_STAT_GEN(g, "collections", FMT_Word32, gc_sum->collections);
        MR_STAT_GEN(g, "par_collections", FMT_Word32, gc_sum->par_collections);
        MR_STAT_GEN(g, "cpu_seconds", "f", TimeToSecondsDbl(gc_sum->cpu_ns));
        MR_STAT_GEN(g, "wall_seconds", "f",
                    TimeToSecondsDbl(gc_sum->elapsed_ns));
        MR_STAT_GEN(g, "max_pause_seconds", "f",
                    TimeToSecondsDbl(gc_sum->max_pause_ns));
        MR_STAT_GEN(g, "avg_pause_seconds", "f",
                    TimeToSecondsDbl(gc_sum->avg_pause_ns));
#if defined(THREADED_RTS) && defined(PROF_SPIN)
        MR_STAT_GEN(g, "sync_spin", FMT_Word64, gc_sum->sync_spin);
        MR_STAT_GEN(g, "sync_yield", FMT_Word64, gc_sum->sync_yield);
#endif
    }

    statsPrintf(" ]\n");
}
1219 
1220 // Must hold stats_mutex.
report_one_line(const RTSSummaryStats * sum)1221 static void report_one_line(const RTSSummaryStats * sum)
1222 {
1223     // We should do no calculation, other than unit changes and formatting, and
1224     // we should not use any data from outside of globals, sum and stats
1225     // here. See Note [RTS Stats Reporting]
1226 
1227     /* print the long long separately to avoid bugginess on mingwin (2001-07-02,
1228     mingw-0.5) */
1229     statsPrintf("<<ghc: %" FMT_Word64 " bytes, "
1230                 "%" FMT_Word32 " GCs, "
1231                 "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency "
1232                 "(%" FMT_Word32 " samples), "
1233                 "%" FMT_Word64 "M in use, "
1234                 "%.3f INIT (%.3f elapsed), "
1235                 "%.3f MUT (%.3f elapsed), "
1236                 "%.3f GC (%.3f elapsed) :ghc>>\n",
1237                 stats.allocated_bytes,
1238                 stats.gcs,
1239                 sum->average_bytes_used,
1240                 stats.max_live_bytes,
1241                 stats.major_gcs,
1242                 stats.max_mem_in_use_bytes / (1024 * 1024),
1243                 TimeToSecondsDbl(stats.init_cpu_ns),
1244                 TimeToSecondsDbl(stats.init_elapsed_ns),
1245                 TimeToSecondsDbl(stats.mutator_cpu_ns),
1246                 TimeToSecondsDbl(stats.mutator_elapsed_ns),
1247                 TimeToSecondsDbl(stats.gc_cpu_ns),
1248                 TimeToSecondsDbl(stats.gc_elapsed_ns));
1249 }
1250 
1251 void
stat_exitReport(void)1252 stat_exitReport (void)
1253 {
1254     RTSSummaryStats sum;
1255     init_RTSSummaryStats(&sum);
1256     // We'll need to refer to task counters later
1257     ACQUIRE_LOCK(&all_tasks_mutex);
1258 
1259     if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
1260         // First we tidy the times in stats, and populate the times in sum.
1261         // In particular, we adjust the gc_* time in stats to remove
1262         // profiling times.
1263         {
1264             Time now_cpu_ns, now_elapsed_ns;
1265             getProcessTimes(&now_cpu_ns, &now_elapsed_ns);
1266 
1267             ACQUIRE_LOCK(&stats_mutex);
1268             stats.cpu_ns = now_cpu_ns - start_init_cpu;
1269             stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
1270             /* avoid divide by zero if stats.total_cpu_ns is measured as 0.00
1271                seconds -- SDM */
1272             if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
1273             if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }
1274 
1275 #if defined(PROFILING)
1276             sum.rp_cpu_ns = RP_tot_time;
1277             sum.rp_elapsed_ns = RPe_tot_time;
1278             sum.hc_cpu_ns = HC_tot_time;
1279             sum.hc_elapsed_ns = HCe_tot_time;
1280 #endif // PROFILING
1281 
1282             // We do a GC during the EXIT phase. We'll attribute the cost of
1283             // that to GC instead of EXIT, so carefully subtract it from the
1284             // EXIT time.
1285             // Note that exit_gc includes RP and HC for the exit GC too.
1286             Time exit_gc_cpu     = stats.gc_cpu_ns - start_exit_gc_cpu;
1287             Time exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
1288 
1289             ASSERT(exit_gc_elapsed > 0);
1290 
1291             sum.exit_cpu_ns     = end_exit_cpu
1292                                       - start_exit_cpu
1293                                       - exit_gc_cpu;
1294             sum.exit_elapsed_ns = end_exit_elapsed
1295                                        - start_exit_elapsed
1296                                        - exit_gc_elapsed;
1297 
1298             ASSERT(sum.exit_elapsed_ns >= 0);
1299 
            stats.mutator_cpu_ns     = start_exit_cpu
                                 - end_init_cpu
                                 - (stats.gc_cpu_ns - exit_gc_cpu)
                                 - stats.nonmoving_gc_cpu_ns;
            stats.mutator_elapsed_ns = start_exit_elapsed
                                 - end_init_elapsed
                                 - (stats.gc_elapsed_ns - exit_gc_elapsed);

            ASSERT(stats.mutator_elapsed_ns >= 0);

            if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }

            // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding
            // and subtracting, so the parts should add up to the total exactly.
            // Note that stats->total_ns is captured a tiny bit later than
            // end_exit_elapsed, so we don't use it here.
            ASSERT(stats.init_elapsed_ns // INIT
                   + stats.mutator_elapsed_ns // MUT
                   + stats.gc_elapsed_ns // GC
                   + sum.exit_elapsed_ns // EXIT
                   == end_exit_elapsed - start_init_elapsed);

            // heapCensus() is called by the GC, so RP and HC time are
            // included in the GC stats.  We therefore subtract them to
            // obtain the actual GC cpu time.
            Time prof_cpu     = sum.rp_cpu_ns + sum.hc_cpu_ns;
            Time prof_elapsed = sum.rp_elapsed_ns + sum.hc_elapsed_ns;

            stats.gc_cpu_ns      -=  prof_cpu;
            stats.gc_elapsed_ns  -=  prof_elapsed;

            // This assertion is probably not necessary; make sure the
            // subdivision with PROF also makes sense
            ASSERT(stats.init_elapsed_ns // INIT
                   + stats.mutator_elapsed_ns // MUT
                   + stats.gc_elapsed_ns // GC
                   + sum.exit_elapsed_ns // EXIT
                   + (sum.rp_elapsed_ns + sum.hc_elapsed_ns) // PROF
                   == end_exit_elapsed - start_init_elapsed);
        }

        // REVIEWERS: it's not clear to me why the following isn't done in
        // stat_endGC of the last garbage collection.

        // We account for the last garbage collection
        {
            uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
            stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
            stats.allocated_bytes = tot_alloc_bytes;
            if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
                statsPrintf("%9" FMT_Word " %9.9s %9.9s",
                            (W_)stats.gc.allocated_bytes, "", "");
                statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
            }
        }

        // We populate the remainder (non-time elements) of sum
        {
    #if defined(THREADED_RTS)
            sum.bound_task_count = taskCount - workerCount;

            for (uint32_t i = 0; i < n_capabilities; i++) {
                sum.sparks.created   += capabilities[i]->spark_stats.created;
                sum.sparks.dud       += capabilities[i]->spark_stats.dud;
                sum.sparks.overflowed+=
                  capabilities[i]->spark_stats.overflowed;
                sum.sparks.converted +=
                  capabilities[i]->spark_stats.converted;
                sum.sparks.gcd       += capabilities[i]->spark_stats.gcd;
                sum.sparks.fizzled   += capabilities[i]->spark_stats.fizzled;
            }

            sum.sparks_count = sum.sparks.created
                + sum.sparks.dud
                + sum.sparks.overflowed;

            if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
                // See Note [Work Balance]
                sum.work_balance =
                    (double)stats.cumulative_par_balanced_copied_bytes
                    / (double)stats.par_copied_bytes;
            } else {
                sum.work_balance = 0;
            }


    #else // THREADED_RTS
            // Convert to double before dividing; integer division of two
            // Time values would truncate the ratio to zero.
            sum.gc_cpu_percent     = TimeToSecondsDbl(stats.gc_cpu_ns)
                                  / TimeToSecondsDbl(stats.cpu_ns);
            sum.gc_elapsed_percent = TimeToSecondsDbl(stats.gc_elapsed_ns)
                                  / TimeToSecondsDbl(stats.elapsed_ns);
    #endif // THREADED_RTS

            sum.fragmentation_bytes =
                (uint64_t)(peak_mblocks_allocated
                         * BLOCKS_PER_MBLOCK
                         * BLOCK_SIZE_W
                         - hw_alloc_blocks * BLOCK_SIZE_W)
                * (uint64_t)sizeof(W_);
            sum.average_bytes_used = stats.major_gcs == 0 ? 0 :
                 stats.cumulative_live_bytes/stats.major_gcs;

            sum.alloc_rate = stats.mutator_cpu_ns == 0 ? 0 :
                (uint64_t)((double)stats.allocated_bytes
                / TimeToSecondsDbl(stats.mutator_cpu_ns));
            // REVIEWERS: These two values didn't previously include the
            // exit times.
            sum.productivity_cpu_percent =
                TimeToSecondsDbl(stats.cpu_ns
                                - stats.gc_cpu_ns
                                - stats.init_cpu_ns
                                - sum.exit_cpu_ns)
                / TimeToSecondsDbl(stats.cpu_ns);

            ASSERT(sum.productivity_cpu_percent >= 0);

            sum.productivity_elapsed_percent =
                TimeToSecondsDbl(stats.elapsed_ns
                                - stats.gc_elapsed_ns
                                - stats.init_elapsed_ns
                                - sum.exit_elapsed_ns)
                / TimeToSecondsDbl(stats.elapsed_ns);

            ASSERT(sum.productivity_elapsed_percent >= 0);

            for (uint32_t g = 0; g < RtsFlags.GcFlags.generations; ++g) {
                const generation* gen = &generations[g];
                GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
                gen_stats->collections = gen->collections;
                gen_stats->par_collections = gen->par_collections;
                gen_stats->cpu_ns = GC_coll_cpu[g];
                gen_stats->elapsed_ns = GC_coll_elapsed[g];
                gen_stats->max_pause_ns = GC_coll_max_pause[g];
                gen_stats->avg_pause_ns = gen->collections == 0 ?
                    0 : (GC_coll_elapsed[g] / gen->collections);
    #if defined(THREADED_RTS) && defined(PROF_SPIN)
                gen_stats->sync_spin = gen->sync.spin;
                gen_stats->sync_yield = gen->sync.yield;
    #endif // PROF_SPIN
            }
        }

        // Now we generate the report
        if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
            report_summary(&sum);
        }

        if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
            if (RtsFlags.MiscFlags.machineReadable) {
                report_machine_readable(&sum);
            }
            else {
                report_one_line(&sum);
            }
        }
        RELEASE_LOCK(&stats_mutex);

        statsFlush();
        statsClose();
    }

    free_RTSSummaryStats(&sum);

    if (GC_coll_cpu) {
      stgFree(GC_coll_cpu);
      GC_coll_cpu = NULL;
    }
    if (GC_coll_elapsed) {
      stgFree(GC_coll_elapsed);
      GC_coll_elapsed = NULL;
    }
    if (GC_coll_max_pause) {
      stgFree(GC_coll_max_pause);
      GC_coll_max_pause = NULL;
    }

    RELEASE_LOCK(&all_tasks_mutex);
}

void stat_exit()
{
#if defined(THREADED_RTS)
        closeMutex(&stats_mutex);
#endif
}

/* Note [Work Balance]
----------------------
Work balance is a measure of how evenly the work done during parallel garbage
collection is spread across threads. To compute work balance we must take care
to account for the number of GC threads changing between GCs. The statistics we
track must have the number of GC threads "integrated out".

We accumulate two values from each garbage collection:
* par_copied: a measure of the total work done during parallel garbage
  collection
* par_balanced_copied: a measure of the balanced work done during parallel
  garbage collection.

par_copied is simple to compute, but par_balanced_copied is somewhat more
complicated:

For a given garbage collection:
Let gc_copied := total copies during the gc
    gc_copied_i := copies by the ith thread during the gc
    num_gc_threads := the number of threads participating in the gc
    balance_limit := (gc_copied / num_gc_threads)

If we were to graph gc_copied_i, sorted from largest to smallest, we would see
something like:

       |X
  ^    |X X
  |    |X X X            X: unbalanced copies
copies |-----------      Y: balanced copies by the busiest GC thread
       |Y Z Z            Z: other balanced copies
       |Y Z Z Z
       |Y Z Z Z Z
       |Y Z Z Z Z Z
       |===========
       |1 2 3 4 5 6
          i ->

where the --- line is at balance_limit. Balanced copies are those under the ---
line, i.e. the area of the Ys and Zs. Note that the area occupied by the Ys will
always equal balance_limit. A completely balanced gc has every thread copying
balance_limit, and a completely unbalanced gc has a single thread copying
gc_copied.

One could define par_balanced_copied as the area of the Ys and Zs in the graph
above; however, we would like the ratio (par_balanced_copied / gc_copied) to
range from 0 to 1, so that work_balance will be a nice percentage, also ranging
from 0 to 1. We therefore define par_balanced_copied as:

                                                        (  num_gc_threads  )
{Sum[Min(gc_copied_i,balance_limit)] - balance_limit} * (------------------)
  i                                                     (num_gc_threads - 1)
                                          vvv                  vvv
                                           S                    T

where the S and T terms serve to remove the area of the Ys, and
to normalize the result to lie between 0 and gc_copied.

Note that the implementation orders these operations differently to minimize
error due to integer rounding. A short sketch of this computation follows
this note.

Then cumulative work balance is computed as
(cumulative_par_balanced_copied_bytes / par_copied_bytes)

Previously, cumulative work balance was computed as:

(cumulative_par_max_copied_bytes)
(-------------------------------) - 1
(       par_copied_bytes        )
-------------------------------------
        (n_capabilities - 1)

This was less accurate than the current method, and invalid whenever a garbage
collection had occurred with num_gc_threads /= n_capabilities, which can happen
when setNumCapabilities is called, when -qn is passed as an RTS option, or when
the number of gc threads is limited to the number of cores.
See #13830
*/

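/* A minimal sketch of the par_balanced_copied formula from
 * Note [Work Balance]. This is illustrative only, not the RTS
 * implementation; the function and parameter names are hypothetical,
 * and copied[i] is assumed to hold gc_copied_i for each of the n
 * participating threads:
 *
 *     StgWord64 parBalancedCopiedSketch(const StgWord64 *copied,
 *                                       uint32_t n, StgWord64 gc_copied)
 *     {
 *         if (n < 2) return 0;              // balance is undefined for n == 1
 *         StgWord64 limit = gc_copied / n;  // balance_limit
 *         StgWord64 s = 0;                  // the S term
 *         for (uint32_t i = 0; i < n; i++) {
 *             s += copied[i] < limit ? copied[i] : limit;
 *         }
 *         // Remove the Ys (subtract balance_limit), then scale by
 *         // T = n/(n-1). The real implementation orders these
 *         // operations differently to reduce integer-rounding error.
 *         return (s - limit) * n / (n - 1);
 *     }
 */
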
/*
Note [Internal Counter Stats]
-----------------------------
What do the counts at the end of a '+RTS -s --internal-counters' report mean?
They are detailed below. Most of these counters are used by multiple threads
with no attempt at synchronisation. This means that reported values may be
lower than the true value, and this becomes more likely and more severe as
contention increases.

The first counters are for various SpinLock-like constructs in the RTS. See
Spinlock.h for the definition of a SpinLock. We maintain up to two counters per
SpinLock:
* spin: The number of busy-spins over the length of the program.
* yield: The number of times the SpinLock spun SPIN_COUNT times without success
         and called yieldThread().
Not all of these are actual SpinLocks; see the details below.

Actual SpinLocks:
* gc_alloc_block:
    This SpinLock protects the block allocator and free list manager. See
    BlockAlloc.c.
* gc_spin and mut_spin:
    These SpinLocks are used to herd gc worker threads during parallel garbage
    collection. See gcWorkerThread, wakeup_gc_threads and releaseGCThreads.
* gen[g].sync:
    These SpinLocks, one per generation, protect the generations[g] data
    structure during garbage collection.

waitForGcThreads:
  These counters are incremented while we wait for all threads to be ready
  for a parallel garbage collection. We yield more than we spin in this case.

In several places in the runtime we must take a lock on a closure. To do this,
we replace its info table with stg_WHITEHOLE_info, spinning if it is already
a white-hole. Sometimes we yieldThread() if we spin too long, sometimes we
don't. We count these white-hole spins and include them in the SpinLocks table.
If a particular loop does not yield, we put "n/a" in the table. They are named
for the function that has the spinning loop, except that several loops in the
garbage collector accumulate into whitehole_gc. A sketch of such a loop
follows this note.
TODO: Should these counters be more or less granular?

white-hole spin counters:
* whitehole_gc
* whitehole_lockClosure
* whitehole_executeMessage
* whitehole_threadPaused


We count the number of calls of several functions in the parallel garbage
collector.

Parallel garbage collector counters:
* any_work:
    A cheap function called whenever a gc_thread is ready for work. Does
    not do any work.
* no_work:
    Incremented whenever any_work finds no work.
* scav_find_work:
    Called to do work when any_work returns true.

*/

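/* An illustrative sketch (not the actual locking code; tryLockClosure
 * is a hypothetical helper) of how a white-hole spin counter pairs with
 * its yield counter in a loop that yields after SPIN_COUNT unsuccessful
 * busy-spins:
 *
 *     for (;;) {
 *         for (uint32_t i = 0; i < SPIN_COUNT; i++) {
 *             if (tryLockClosure(p)) return;   // got the lock
 *             whitehole_lockClosure_spin++;    // one count per busy-spin
 *         }
 *         whitehole_lockClosure_yield++;       // one count per yieldThread()
 *         yieldThread();
 *     }
 */
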
/* -----------------------------------------------------------------------------
   stat_describe_gens

   Produce some detailed info on the state of the generational GC.
   -------------------------------------------------------------------------- */
void
statDescribeGens(void)
{
  uint32_t g, mut, lge, compacts, i;
  W_ gen_slop;
  W_ tot_live, tot_slop;
  W_ gen_live, gen_blocks;
  bdescr *bd;
  generation *gen;

  debugBelch(
"----------------------------------------------------------------------\n"
"  Gen     Max  Mut-list  Blocks    Large  Compacts      Live      Slop\n"
"       Blocks     Bytes          Objects                              \n"
"----------------------------------------------------------------------\n");

  tot_live = 0;
  tot_slop = 0;

  for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
      gen = &generations[g];

      for (bd = gen->large_objects, lge = 0; bd; bd = bd->link) {
          lge++;
      }

      for (bd = gen->compact_objects, compacts = 0; bd; bd = bd->link) {
          compacts++;
      }

      gen_live   = genLiveWords(gen);
      gen_blocks = genLiveBlocks(gen);

      mut = 0;
      for (i = 0; i < n_capabilities; i++) {
          mut += countOccupied(capabilities[i]->mut_lists[g]);

          // Add the pinned object block.
          bd = capabilities[i]->pinned_object_block;
          if (bd != NULL) {
              gen_live   += bd->free - bd->start;
              gen_blocks += bd->blocks;
          }

          gen_live   += gcThreadLiveWords(i,g);
          gen_blocks += gcThreadLiveBlocks(i,g);
      }

      debugBelch("%5d %7" FMT_Word " %9d", g, (W_)gen->max_blocks, mut);

      gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;

      debugBelch("%8" FMT_Word " %8d  %8d %9" FMT_Word " %9" FMT_Word "\n",
                 gen_blocks, lge, compacts, gen_live*(W_)sizeof(W_),
                 gen_slop*(W_)sizeof(W_));
      tot_live += gen_live;
      tot_slop += gen_slop;
  }
  debugBelch("----------------------------------------------------------------------\n");
  debugBelch("%51s%9" FMT_Word " %9" FMT_Word "\n",
             "",tot_live*(W_)sizeof(W_),tot_slop*(W_)sizeof(W_));
  debugBelch("----------------------------------------------------------------------\n");
  debugBelch("\n");
}

/* -----------------------------------------------------------------------------
   Stats available via a programmatic interface, so e.g. GHCi can time
   each compilation and expression evaluation.
   -------------------------------------------------------------------------- */

uint64_t getAllocations( void )
{
    ACQUIRE_LOCK(&stats_mutex);
    StgWord64 n = stats.allocated_bytes;
    RELEASE_LOCK(&stats_mutex);
    return n;
}

int getRTSStatsEnabled( void )
{
    return RtsFlags.GcFlags.giveStats != NO_GC_STATS;
}

void getRTSStats( RTSStats *s )
{
    Time current_elapsed = 0;
    Time current_cpu = 0;

    ACQUIRE_LOCK(&stats_mutex);
    *s = stats;
    RELEASE_LOCK(&stats_mutex);

    getProcessTimes(&current_cpu, &current_elapsed);
    s->cpu_ns = current_cpu - end_init_cpu;
    s->elapsed_ns = current_elapsed - end_init_elapsed;

    s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns -
        stats.nonmoving_gc_cpu_ns;
    s->mutator_elapsed_ns = current_elapsed - end_init_elapsed -
        stats.gc_elapsed_ns;
}

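/* Example use of this interface (illustrative only) from C code linked
 * against the RTS:
 *
 *     RTSStats s;
 *     if (getRTSStatsEnabled()) {
 *         getRTSStats(&s);
 *         // e.g. report mutator CPU seconds so far
 *         debugBelch("MUT cpu: %f s\n",
 *                    TimeToSecondsDbl(s.mutator_cpu_ns));
 *     }
 */
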
/* -----------------------------------------------------------------------------
   Dumping stuff in the stats file, or via the debug message interface
   -------------------------------------------------------------------------- */

void
statsPrintf( char *s, ... )
{
    FILE *sf = RtsFlags.GcFlags.statsFile;
    va_list ap;

    va_start(ap,s);
    if (sf == NULL) {
        vdebugBelch(s,ap);
    } else {
        vfprintf(sf, s, ap);
    }
    va_end(ap);
}

static void
statsFlush( void )
{
    FILE *sf = RtsFlags.GcFlags.statsFile;
    if (sf != NULL) {
        fflush(sf);
    }
}

static void
statsClose( void )
{
    FILE *sf = RtsFlags.GcFlags.statsFile;
    if (sf != NULL) {
        fclose(sf);
    }
}