1 /* -----------------------------------------------------------------------------
2 *
3 * (c) The GHC Team, 1998-2005
4 *
5 * Statistics and timing-related functions.
6 *
7 * ---------------------------------------------------------------------------*/
8
9 #include "PosixSource.h"
10 #include "Rts.h"
11
12 #include "RtsFlags.h"
13 #include "RtsUtils.h"
14 #include "Schedule.h"
15 #include "Stats.h"
16 #include "Profiling.h"
17 #include "GetTime.h"
18 #include "sm/Storage.h"
19 #include "sm/GCThread.h"
20 #include "sm/BlockAlloc.h"
21
22 // for spin/yield counters
23 #include "sm/GC.h"
24 #include "ThreadPaused.h"
25 #include "Messages.h"
26
27 #include <string.h> // for memset
28
29 #if defined(THREADED_RTS)
30 // Protects all statistics below
31 Mutex stats_mutex;
32 #endif
33
34 static Time
35 start_init_cpu, start_init_elapsed,
36 end_init_cpu, end_init_elapsed,
37 start_exit_cpu, start_exit_elapsed,
38 start_exit_gc_elapsed, start_exit_gc_cpu,
39 end_exit_cpu, end_exit_elapsed,
40 start_nonmoving_gc_cpu, start_nonmoving_gc_elapsed,
41 start_nonmoving_gc_sync_elapsed;
42
43 #if defined(PROFILING)
44 static Time RP_start_time = 0, RP_tot_time = 0; // retainer prof user time
45 static Time RPe_start_time = 0, RPe_tot_time = 0; // retainer prof elap time
46
47 static Time HC_start_time, HC_tot_time = 0; // heap census prof user time
48 static Time HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
49 #endif
50
51 #if defined(PROF_SPIN)
52 volatile StgWord64 whitehole_lockClosure_spin = 0;
53 volatile StgWord64 whitehole_lockClosure_yield = 0;
54 volatile StgWord64 whitehole_threadPaused_spin = 0;
55 volatile StgWord64 whitehole_executeMessage_spin = 0;
56 #endif
57
58 //
59 // All the stats!
60 //
61 // This is where we accumulate all the stats during execution, and it's also
62 // in a convenient form that we can copy over to a caller of getRTSStats().
63 //
64 static RTSStats stats;
65
66 static W_ GC_end_faults = 0;
67
68 static Time *GC_coll_cpu = NULL;
69 static Time *GC_coll_elapsed = NULL;
70 static Time *GC_coll_max_pause = NULL;
71
72 static void statsPrintf( char *s, ... ) GNUC3_ATTRIBUTE(format (PRINTF, 1, 2));
73 static void statsFlush( void );
74 static void statsClose( void );
75
76 /* -----------------------------------------------------------------------------
77 Current elapsed time
78 ------------------------------------------------------------------------- */
79
stat_getElapsedTime(void)80 Time stat_getElapsedTime(void)
81 {
82 return getProcessElapsedTime() - start_init_elapsed;
83 }
84
85 /* ---------------------------------------------------------------------------
86 Measure the current MUT time, for profiling
87 ------------------------------------------------------------------------ */
88
89 static double
mut_user_time_until(Time t)90 mut_user_time_until( Time t )
91 {
92 ACQUIRE_LOCK(&stats_mutex);
93 double ret = TimeToSecondsDbl(t - stats.gc_cpu_ns - stats.nonmoving_gc_cpu_ns);
94 RELEASE_LOCK(&stats_mutex);
95 return ret;
96 // heapCensus() time is included in GC_tot_cpu, so we don't need
97 // to subtract it here.
98
99 // TODO: This seems wrong to me. Surely we should be subtracting
100 // (at least) start_init_cpu?
101 }
102
103 double
mut_user_time(void)104 mut_user_time( void )
105 {
106 Time cpu = getProcessCPUTime();
107 return mut_user_time_until(cpu);
108 }
109
110 #if defined(PROFILING)
111 /*
112 mut_user_time_during_RP() returns the MUT time during retainer profiling.
113 The same is for mut_user_time_during_HC();
114 */
115 static double
mut_user_time_during_RP(void)116 mut_user_time_during_RP( void )
117 {
118 return TimeToSecondsDbl(RP_start_time - stats.gc_cpu_ns - RP_tot_time);
119 }
120
121 #endif /* PROFILING */
122
123 /* ---------------------------------------------------------------------------
124 initStats0() has no dependencies, it can be called right at the beginning
125 ------------------------------------------------------------------------ */
126
127 void
initStats0(void)128 initStats0(void)
129 {
130 #if defined(THREADED_RTS)
131 initMutex(&stats_mutex);
132 #endif
133
134 start_init_cpu = 0;
135 start_init_elapsed = 0;
136 end_init_cpu = 0;
137 end_init_elapsed = 0;
138
139 start_nonmoving_gc_cpu = 0;
140 start_nonmoving_gc_elapsed = 0;
141 start_nonmoving_gc_sync_elapsed = 0;
142
143 start_exit_cpu = 0;
144 start_exit_elapsed = 0;
145 start_exit_gc_cpu = 0;
146 start_exit_gc_elapsed = 0;
147 end_exit_cpu = 0;
148 end_exit_elapsed = 0;
149
150 #if defined(PROFILING)
151 RP_start_time = 0;
152 RP_tot_time = 0;
153 RPe_start_time = 0;
154 RPe_tot_time = 0;
155
156 HC_start_time = 0;
157 HC_tot_time = 0;
158 HCe_start_time = 0;
159 HCe_tot_time = 0;
160 #endif
161
162 GC_end_faults = 0;
163
164 stats = (RTSStats) {
165 .gcs = 0,
166 .major_gcs = 0,
167 .allocated_bytes = 0,
168 .max_live_bytes = 0,
169 .max_large_objects_bytes = 0,
170 .max_compact_bytes = 0,
171 .max_slop_bytes = 0,
172 .max_mem_in_use_bytes = 0,
173 .cumulative_live_bytes = 0,
174 .copied_bytes = 0,
175 .par_copied_bytes = 0,
176 .cumulative_par_max_copied_bytes = 0,
177 .cumulative_par_balanced_copied_bytes = 0,
178 .gc_spin_spin = 0,
179 .gc_spin_yield = 0,
180 .mut_spin_spin = 0,
181 .mut_spin_yield = 0,
182 .any_work = 0,
183 .no_work = 0,
184 .scav_find_work = 0,
185 .init_cpu_ns = 0,
186 .init_elapsed_ns = 0,
187 .mutator_cpu_ns = 0,
188 .mutator_elapsed_ns = 0,
189 .gc_cpu_ns = 0,
190 .gc_elapsed_ns = 0,
191 .cpu_ns = 0,
192 .elapsed_ns = 0,
193 .nonmoving_gc_cpu_ns = 0,
194 .nonmoving_gc_elapsed_ns = 0,
195 .nonmoving_gc_max_elapsed_ns = 0,
196 .nonmoving_gc_sync_elapsed_ns = 0,
197 .nonmoving_gc_sync_max_elapsed_ns = 0,
198 .gc = {
199 .gen = 0,
200 .threads = 0,
201 .allocated_bytes = 0,
202 .live_bytes = 0,
203 .large_objects_bytes = 0,
204 .compact_bytes = 0,
205 .slop_bytes = 0,
206 .mem_in_use_bytes = 0,
207 .copied_bytes = 0,
208 .par_max_copied_bytes = 0,
209 .par_balanced_copied_bytes = 0,
210 .sync_elapsed_ns = 0,
211 .cpu_ns = 0,
212 .elapsed_ns = 0,
213 .nonmoving_gc_cpu_ns = 0,
214 .nonmoving_gc_elapsed_ns = 0,
215 .nonmoving_gc_sync_elapsed_ns = 0,
216 }
217 };
218 }
219
220 /* ---------------------------------------------------------------------------
221 initStats1() can be called after setupRtsFlags()
222 ------------------------------------------------------------------------ */
223
224 void initGenerationStats(void);
225
226 void
initStats1(void)227 initStats1 (void)
228 {
229 if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
230 statsPrintf(" Alloc Copied Live GC GC TOT TOT Page Flts\n");
231 statsPrintf(" bytes bytes bytes user elap user elap\n");
232 }
233 GC_coll_cpu =
234 (Time *)stgMallocBytes(
235 sizeof(Time)*RtsFlags.GcFlags.generations,
236 "initStats");
237 GC_coll_elapsed =
238 (Time *)stgMallocBytes(
239 sizeof(Time)*RtsFlags.GcFlags.generations,
240 "initStats");
241 GC_coll_max_pause =
242 (Time *)stgMallocBytes(
243 sizeof(Time)*RtsFlags.GcFlags.generations,
244 "initStats");
245 initGenerationStats();
246 }
247
248 void
initGenerationStats()249 initGenerationStats()
250 {
251 for (uint32_t i = 0; i < RtsFlags.GcFlags.generations; i++) {
252 GC_coll_cpu[i] = 0;
253 GC_coll_elapsed[i] = 0;
254 GC_coll_max_pause[i] = 0;
255 }
256 }
257
258 /* ---------------------------------------------------------------------------
259 Reset stats of child process after fork()
260 ------------------------------------------------------------------------ */
261
resetChildProcessStats()262 void resetChildProcessStats()
263 {
264 initStats0();
265 initGenerationStats();
266 }
267
268 /* -----------------------------------------------------------------------------
269 Initialisation time...
270 -------------------------------------------------------------------------- */
271
272 void
stat_startInit(void)273 stat_startInit(void)
274 {
275 getProcessTimes(&start_init_cpu, &start_init_elapsed);
276 }
277
278 void
stat_endInit(void)279 stat_endInit(void)
280 {
281 getProcessTimes(&end_init_cpu, &end_init_elapsed);
282 stats.init_cpu_ns = end_init_cpu - start_init_cpu;
283 stats.init_elapsed_ns = end_init_elapsed - start_init_elapsed;
284 }
285
286 /* -----------------------------------------------------------------------------
287 stat_startExit and stat_endExit
288
289 These two measure the time taken in shutdownHaskell().
290 -------------------------------------------------------------------------- */
291
292 void
stat_startExit(void)293 stat_startExit(void)
294 {
295 ACQUIRE_LOCK(&stats_mutex);
296 getProcessTimes(&start_exit_cpu, &start_exit_elapsed);
297 start_exit_gc_elapsed = stats.gc_elapsed_ns;
298 start_exit_gc_cpu = stats.gc_cpu_ns;
299 RELEASE_LOCK(&stats_mutex);
300 }
301
302 /* -----------------------------------------------------------------------------
303 Nonmoving (concurrent) collector statistics
304
305 These two measure the time taken in the concurrent mark & sweep collector.
306 -------------------------------------------------------------------------- */
307 void
stat_endExit(void)308 stat_endExit(void)
309 {
310 ACQUIRE_LOCK(&stats_mutex);
311 getProcessTimes(&end_exit_cpu, &end_exit_elapsed);
312 RELEASE_LOCK(&stats_mutex);
313 }
314
315 void
stat_startGCSync(gc_thread * gct)316 stat_startGCSync (gc_thread *gct)
317 {
318 gct->gc_sync_start_elapsed = getProcessElapsedTime();
319 }
320
321 void
stat_startNonmovingGc()322 stat_startNonmovingGc ()
323 {
324 ACQUIRE_LOCK(&stats_mutex);
325 start_nonmoving_gc_cpu = getCurrentThreadCPUTime();
326 start_nonmoving_gc_elapsed = getProcessCPUTime();
327 RELEASE_LOCK(&stats_mutex);
328 }
329
330 void
stat_endNonmovingGc()331 stat_endNonmovingGc ()
332 {
333 Time cpu = getCurrentThreadCPUTime();
334 Time elapsed = getProcessCPUTime();
335
336 ACQUIRE_LOCK(&stats_mutex);
337 stats.gc.nonmoving_gc_elapsed_ns = elapsed - start_nonmoving_gc_elapsed;
338 stats.nonmoving_gc_elapsed_ns += stats.gc.nonmoving_gc_elapsed_ns;
339
340 stats.gc.nonmoving_gc_cpu_ns = cpu - start_nonmoving_gc_cpu;
341 stats.nonmoving_gc_cpu_ns += stats.gc.nonmoving_gc_cpu_ns;
342
343 stats.nonmoving_gc_max_elapsed_ns =
344 stg_max(stats.gc.nonmoving_gc_elapsed_ns,
345 stats.nonmoving_gc_max_elapsed_ns);
346 RELEASE_LOCK(&stats_mutex);
347 }
348
349 void
stat_startNonmovingGcSync()350 stat_startNonmovingGcSync ()
351 {
352 ACQUIRE_LOCK(&stats_mutex);
353 start_nonmoving_gc_sync_elapsed = getProcessElapsedTime();
354 RELEASE_LOCK(&stats_mutex);
355 traceConcSyncBegin();
356 }
357
358 void
stat_endNonmovingGcSync()359 stat_endNonmovingGcSync ()
360 {
361 Time end_elapsed = getProcessElapsedTime();
362 ACQUIRE_LOCK(&stats_mutex);
363 stats.gc.nonmoving_gc_sync_elapsed_ns = end_elapsed - start_nonmoving_gc_sync_elapsed;
364 stats.nonmoving_gc_sync_elapsed_ns += stats.gc.nonmoving_gc_sync_elapsed_ns;
365 stats.nonmoving_gc_sync_max_elapsed_ns =
366 stg_max(stats.gc.nonmoving_gc_sync_elapsed_ns,
367 stats.nonmoving_gc_sync_max_elapsed_ns);
368 RELEASE_LOCK(&stats_mutex);
369 traceConcSyncEnd();
370 }
371
372 /* -----------------------------------------------------------------------------
373 Called at the beginning of each GC
374 -------------------------------------------------------------------------- */
375
376 /*
377 * Note [Time accounting]
378 * ~~~~~~~~~~~~~~~~~~~~~~
379 * In the "vanilla" configuration (using the standard copying GC) GHC keeps
380 * track of a two different sinks of elapsed and CPU time:
381 *
382 * - time spent synchronising to initiate garbage collection
383 * - garbage collection (per generation)
384 * - mutation
385 *
386 * When using the (concurrent) non-moving garbage collector (see Note
387 * [Non-moving garbage collector]) we also track a few more sinks:
388 *
389 * - minor GC
390 * - major GC (namly time spent in the preparatory phase)
391 * - concurrent mark
392 * - final synchronization (elapsed only)
393 * - mutation
394 *
395 * To keep track of these CPU times we rely on the system's per-thread CPU time
396 * clock (exposed via the runtime's getCurrentThreadCPUTime utility).
397 *
398 * CPU time spent in the copying garbage collector is tracked in each GC
399 * worker's gc_thread struct. At the beginning of scavenging each worker
400 * records its OS thread's CPU time its gc_thread (by stat_startGCWorker). At
401 * the end of scavenging we again record the CPU time (in stat_endGCworker).
402 * The differences of these are then summed over by the thread leading the GC
403 * at the end of collection in stat_endGC. By contrast, the elapsed time is
404 * recorded only by the leader.
405 *
406 * Mutator time is derived from the process's CPU time, subtracting out
407 * contributions from stop-the-world and concurrent GCs.
408 *
409 * Time spent in concurrent marking is recorded by stat_{start,end}NonmovingGc.
410 * Likewise, elapsed time spent in the final synchronization is recorded by
411 * stat_{start,end}NonmovingGcSync.
412 */
413
414 void
stat_startGCWorker(Capability * cap STG_UNUSED,gc_thread * gct)415 stat_startGCWorker (Capability *cap STG_UNUSED, gc_thread *gct)
416 {
417 bool stats_enabled =
418 RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
419 rtsConfig.gcDoneHook != NULL;
420
421 if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
422 gct->gc_start_cpu = getCurrentThreadCPUTime();
423 }
424 }
425
426 void
stat_endGCWorker(Capability * cap STG_UNUSED,gc_thread * gct)427 stat_endGCWorker (Capability *cap STG_UNUSED, gc_thread *gct)
428 {
429 bool stats_enabled =
430 RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
431 rtsConfig.gcDoneHook != NULL;
432
433 if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
434 gct->gc_end_cpu = getCurrentThreadCPUTime();
435 ASSERT(gct->gc_end_cpu >= gct->gc_start_cpu);
436 }
437 }
438
439 void
stat_startGC(Capability * cap,gc_thread * gct)440 stat_startGC (Capability *cap, gc_thread *gct)
441 {
442 if (RtsFlags.GcFlags.ringBell) {
443 debugBelch("\007");
444 }
445
446 bool stats_enabled =
447 RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
448 rtsConfig.gcDoneHook != NULL;
449
450 if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
451 gct->gc_start_cpu = getCurrentThreadCPUTime();
452 }
453
454 gct->gc_start_elapsed = getProcessElapsedTime();
455
456 // Post EVENT_GC_START with the same timestamp as used for stats
457 // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
458 // Here, as opposed to other places, the event is emitted on the cap
459 // that initiates the GC and external tools expect it to have the same
460 // timestamp as used in +RTS -s calculcations.
461 traceEventGcStartAtT(cap,
462 TimeToNS(gct->gc_start_elapsed - start_init_elapsed));
463
464 if (RtsFlags.GcFlags.giveStats != NO_GC_STATS)
465 {
466 gct->gc_start_faults = getPageFaults();
467 }
468
469 updateNurseriesStats();
470 }
471
472 /* -----------------------------------------------------------------------------
473 Called at the end of each GC
474 -------------------------------------------------------------------------- */
475
476 void
stat_endGC(Capability * cap,gc_thread * initiating_gct,W_ live,W_ copied,W_ slop,uint32_t gen,uint32_t par_n_threads,gc_thread ** gc_threads,W_ par_max_copied,W_ par_balanced_copied,W_ gc_spin_spin,W_ gc_spin_yield,W_ mut_spin_spin,W_ mut_spin_yield,W_ any_work,W_ no_work,W_ scav_find_work)477 stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ slop,
478 uint32_t gen, uint32_t par_n_threads, gc_thread **gc_threads,
479 W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
480 W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work,
481 W_ scav_find_work)
482 {
483 ACQUIRE_LOCK(&stats_mutex);
484
485 // -------------------------------------------------
486 // Collect all the stats about this GC in stats.gc. We always do this since
487 // it's relatively cheap and we need allocated_bytes to catch heap
488 // overflows.
489
490 stats.gc.gen = gen;
491 stats.gc.threads = par_n_threads;
492
493 uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
494
495 // allocated since the last GC
496 stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
497
498 stats.gc.live_bytes = live * sizeof(W_);
499 stats.gc.large_objects_bytes = calcTotalLargeObjectsW() * sizeof(W_);
500 stats.gc.compact_bytes = calcTotalCompactW() * sizeof(W_);
501 stats.gc.slop_bytes = slop * sizeof(W_);
502 stats.gc.mem_in_use_bytes = mblocks_allocated * MBLOCK_SIZE;
503 stats.gc.copied_bytes = copied * sizeof(W_);
504 stats.gc.par_max_copied_bytes = par_max_copied * sizeof(W_);
505 stats.gc.par_balanced_copied_bytes = par_balanced_copied * sizeof(W_);
506
507 bool stats_enabled =
508 RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
509 rtsConfig.gcDoneHook != NULL;
510
511 if (stats_enabled
512 || RtsFlags.ProfFlags.doHeapProfile) // heap profiling needs GC_tot_time
513 {
514 // We only update the times when stats are explicitly enabled since
515 // getProcessTimes (e.g. requiring a system call) can be expensive on
516 // some platforms.
517 Time current_cpu, current_elapsed;
518 getProcessTimes(¤t_cpu, ¤t_elapsed);
519 stats.cpu_ns = current_cpu - start_init_cpu;
520 stats.elapsed_ns = current_elapsed - start_init_elapsed;
521
522 stats.gc.sync_elapsed_ns =
523 initiating_gct->gc_start_elapsed - initiating_gct->gc_sync_start_elapsed;
524 stats.gc.elapsed_ns = current_elapsed - initiating_gct->gc_start_elapsed;
525 stats.gc.cpu_ns = 0;
526 // see Note [n_gc_threads]
527 // par_n_threads is set to n_gc_threads at the single callsite of this
528 // function
529 if (par_n_threads == 1) {
530 ASSERT(initiating_gct->gc_end_cpu >= initiating_gct->gc_start_cpu);
531 stats.gc.cpu_ns += initiating_gct->gc_end_cpu - initiating_gct->gc_start_cpu;
532 } else {
533 for (unsigned int i=0; i < par_n_threads; i++) {
534 gc_thread *gct = gc_threads[i];
535 ASSERT(gct->gc_end_cpu >= gct->gc_start_cpu);
536 stats.gc.cpu_ns += gct->gc_end_cpu - gct->gc_start_cpu;
537 }
538 }
539 }
540 // -------------------------------------------------
541 // Update the cumulative stats
542
543 stats.gcs++;
544 stats.allocated_bytes = tot_alloc_bytes;
545 stats.max_mem_in_use_bytes = peak_mblocks_allocated * MBLOCK_SIZE;
546
547 GC_coll_cpu[gen] += stats.gc.cpu_ns;
548 GC_coll_elapsed[gen] += stats.gc.elapsed_ns;
549 if (GC_coll_max_pause[gen] < stats.gc.elapsed_ns) {
550 GC_coll_max_pause[gen] = stats.gc.elapsed_ns;
551 }
552
553 stats.copied_bytes += stats.gc.copied_bytes;
554 if (par_n_threads > 1) {
555 stats.par_copied_bytes += stats.gc.copied_bytes;
556 stats.cumulative_par_max_copied_bytes +=
557 stats.gc.par_max_copied_bytes;
558 stats.cumulative_par_balanced_copied_bytes +=
559 stats.gc.par_balanced_copied_bytes;
560 stats.any_work += any_work;
561 stats.no_work += no_work;
562 stats.scav_find_work += scav_find_work;
563 stats.gc_spin_spin += gc_spin_spin;
564 stats.gc_spin_yield += gc_spin_yield;
565 stats.mut_spin_spin += mut_spin_spin;
566 stats.mut_spin_yield += mut_spin_yield;
567 }
568 stats.gc_cpu_ns += stats.gc.cpu_ns;
569 stats.gc_elapsed_ns += stats.gc.elapsed_ns;
570
571 if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
572 stats.major_gcs++;
573 if (stats.gc.live_bytes > stats.max_live_bytes) {
574 stats.max_live_bytes = stats.gc.live_bytes;
575 }
576 if (stats.gc.large_objects_bytes > stats.max_large_objects_bytes) {
577 stats.max_large_objects_bytes = stats.gc.large_objects_bytes;
578 }
579 if (stats.gc.compact_bytes > stats.max_compact_bytes) {
580 stats.max_compact_bytes = stats.gc.compact_bytes;
581 }
582 if (stats.gc.slop_bytes > stats.max_slop_bytes) {
583 stats.max_slop_bytes = stats.gc.slop_bytes;
584 }
585 stats.cumulative_live_bytes += stats.gc.live_bytes;
586 }
587
588 // -------------------------------------------------
589 // Do the more expensive bits only when stats are enabled.
590
591 if (stats_enabled)
592 {
593 // -------------------------------------------------
594 // Emit events to the event log
595
596 // Has to be emitted while all caps stopped for GC, but before GC_END.
597 // See trac.haskell.org/ThreadScope/wiki/RTSsummaryEvents
598 // for a detailed design rationale of the current setup
599 // of GC eventlog events.
600 traceEventGcGlobalSync(cap);
601
602 // Emitted before GC_END on all caps, which simplifies tools code.
603 traceEventGcStats(cap,
604 CAPSET_HEAP_DEFAULT,
605 stats.gc.gen,
606 stats.gc.copied_bytes,
607 stats.gc.slop_bytes,
608 /* current loss due to fragmentation */
609 (mblocks_allocated * BLOCKS_PER_MBLOCK
610 - n_alloc_blocks) * BLOCK_SIZE,
611 par_n_threads,
612 stats.gc.par_max_copied_bytes,
613 stats.gc.copied_bytes,
614 stats.gc.par_balanced_copied_bytes);
615
616 // Post EVENT_GC_END with the same timestamp as used for stats
617 // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
618 // Here, as opposed to other places, the event is emitted on the cap
619 // that initiates the GC and external tools expect it to have the same
620 // timestamp as used in +RTS -s calculcations.
621 traceEventGcEndAtT(cap, TimeToNS(stats.elapsed_ns));
622
623 if (gen == RtsFlags.GcFlags.generations-1) { // major GC?
624 traceEventHeapLive(cap,
625 CAPSET_HEAP_DEFAULT,
626 stats.gc.live_bytes);
627 }
628
629 // -------------------------------------------------
630 // Print GC stats to stdout or a file (+RTS -S/-s)
631
632 if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) {
633 W_ faults = getPageFaults();
634
635 statsPrintf("%9" FMT_Word64 " %9" FMT_Word64 " %9" FMT_Word64,
636 stats.gc.allocated_bytes, stats.gc.copied_bytes,
637 stats.gc.live_bytes);
638
639 statsPrintf(" %6.3f %6.3f %8.3f %8.3f %4"
640 FMT_Word " %4" FMT_Word " (Gen: %2d)\n",
641 TimeToSecondsDbl(stats.gc.cpu_ns),
642 TimeToSecondsDbl(stats.gc.elapsed_ns),
643 TimeToSecondsDbl(stats.cpu_ns),
644 TimeToSecondsDbl(stats.elapsed_ns),
645 faults - initiating_gct->gc_start_faults,
646 initiating_gct->gc_start_faults - GC_end_faults,
647 gen);
648
649 GC_end_faults = faults;
650 statsFlush();
651 }
652
653
654 if (rtsConfig.gcDoneHook != NULL) {
655 rtsConfig.gcDoneHook(&stats.gc);
656 }
657
658 traceEventHeapSize(cap,
659 CAPSET_HEAP_DEFAULT,
660 mblocks_allocated * MBLOCK_SIZE);
661 }
662 RELEASE_LOCK(&stats_mutex);
663 }
664
665 /* -----------------------------------------------------------------------------
666 Called at the beginning of each Retainer Profiliing
667 -------------------------------------------------------------------------- */
668 #if defined(PROFILING)
669 void
stat_startRP(void)670 stat_startRP(void)
671 {
672 Time user, elapsed;
673 getProcessTimes( &user, &elapsed );
674
675 ACQUIRE_LOCK(&stats_mutex);
676 RP_start_time = user;
677 RPe_start_time = elapsed;
678 RELEASE_LOCK(&stats_mutex);
679 }
680 #endif /* PROFILING */
681
682 /* -----------------------------------------------------------------------------
683 Called at the end of each Retainer Profiliing
684 -------------------------------------------------------------------------- */
685
686 #if defined(PROFILING)
687 void
stat_endRP(uint32_t retainerGeneration,int maxStackSize,double averageNumVisit)688 stat_endRP(
689 uint32_t retainerGeneration,
690 int maxStackSize,
691 double averageNumVisit)
692 {
693 Time user, elapsed;
694 getProcessTimes( &user, &elapsed );
695
696 ACQUIRE_LOCK(&stats_mutex);
697 RP_tot_time += user - RP_start_time;
698 RPe_tot_time += elapsed - RPe_start_time;
699 double mut_time_during_RP = mut_user_time_during_RP();
700 RELEASE_LOCK(&stats_mutex);
701
702 fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
703 retainerGeneration, mut_time_during_RP);
704 fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
705 fprintf(prof_file, "\tAverage number of visits per object = %f\n",
706 averageNumVisit);
707 }
708 #endif /* PROFILING */
709
710 /* -----------------------------------------------------------------------------
711 Called at the beginning of each heap census
712 -------------------------------------------------------------------------- */
713 #if defined(PROFILING)
714 void
stat_startHeapCensus(void)715 stat_startHeapCensus(void)
716 {
717 Time user, elapsed;
718 getProcessTimes( &user, &elapsed );
719
720 ACQUIRE_LOCK(&stats_mutex);
721 HC_start_time = user;
722 HCe_start_time = elapsed;
723 RELEASE_LOCK(&stats_mutex);
724 }
725 #endif /* PROFILING */
726
727 /* -----------------------------------------------------------------------------
728 Called at the end of each heap census
729 -------------------------------------------------------------------------- */
730 #if defined(PROFILING)
731 void
stat_endHeapCensus(void)732 stat_endHeapCensus(void)
733 {
734 Time user, elapsed;
735 getProcessTimes( &user, &elapsed );
736
737 ACQUIRE_LOCK(&stats_mutex);
738 HC_tot_time += user - HC_start_time;
739 HCe_tot_time += elapsed - HCe_start_time;
740 RELEASE_LOCK(&stats_mutex);
741 }
742 #endif /* PROFILING */
743
744 /* -----------------------------------------------------------------------------
745 Called at the end of execution
746
747 NOTE: number of allocations is not entirely accurate: it doesn't
748 take into account the few bytes at the end of the heap that
749 were left unused when the heap-check failed.
750 -------------------------------------------------------------------------- */
751
752 #if defined(DEBUG)
753 #define TICK_VAR_INI(arity) \
754 StgInt SLOW_CALLS_##arity = 1; \
755 StgInt RIGHT_ARITY_##arity = 1; \
756 StgInt TAGGED_PTR_##arity = 0;
757
758 TICK_VAR_INI(1)
759 TICK_VAR_INI(2)
760
761 StgInt TOTAL_CALLS=1;
762 #endif
763
764 /* Report the value of a counter */
765 #define REPORT(counter) \
766 { \
767 showStgWord64(counter,temp,true/*commas*/); \
768 statsPrintf(" (" #counter ") : %s\n",temp); \
769 }
770
771 /* Report the value of a counter as a percentage of another counter */
772 #define REPORT_PCT(counter,countertot) \
773 statsPrintf(" (" #counter ") %% of (" #countertot ") : %.1f%%\n", \
774 counter*100.0/countertot)
775
776 #define TICK_PRINT(arity) \
777 REPORT(SLOW_CALLS_##arity); \
778 REPORT_PCT(RIGHT_ARITY_##arity,SLOW_CALLS_##arity); \
779 REPORT_PCT(TAGGED_PTR_##arity,RIGHT_ARITY_##arity); \
780 REPORT(RIGHT_ARITY_##arity); \
781 REPORT(TAGGED_PTR_##arity)
782
783 #define TICK_PRINT_TOT(arity) \
784 statsPrintf(" (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \
785 SLOW_CALLS_##arity * 100.0/TOTAL_CALLS)
786
787 /*
788 Note [RTS Stats Reporting]
789 ==========================
790
791 There are currently three reporting functions:
792 * report_summary:
793 Responsible for producing '+RTS -s' output.
794 Will report internal counters if the RTS flag --internal-counters is
795 passed. See [Internal Counters Stats]
796 * report_machine_readable:
797 Responsible for producing '+RTS -t --machine-readable' output.
798 * report_one_line:
799 Responsible for producing '+RTS -t' output
800
801 Stats are accumulated into the global variable 'stats' as the program runs, then
802 in 'stat_exit' we do the following:
803 * Finalise 'stats'. This involves setting final running times and allocations
804 that have not yet been accounted for.
805 * Create a RTSSummaryStats. This contains all data for reports that is not
806 included in stats (because they do not make sense before the program has
807 completed) or in a global variable.
808 * call the appropriate report_* function, passing the newly constructed
809 RTSSummaryStats.
810
811 To ensure that the data in the different reports is kept consistent, the
812 report_* functions should not do any calculation, excepting unit changes and
813 formatting. If you need to add a new calculated field, add it to
814 RTSSummaryStats.
815
816 */
817
818
init_RTSSummaryStats(RTSSummaryStats * sum)819 static void init_RTSSummaryStats(RTSSummaryStats* sum)
820 {
821 const size_t sizeof_gc_summary_stats =
822 RtsFlags.GcFlags.generations * sizeof(GenerationSummaryStats);
823
824 memset(sum, 0, sizeof(RTSSummaryStats));
825 sum->gc_summary_stats =
826 stgMallocBytes(sizeof_gc_summary_stats,
827 "alloc_RTSSummaryStats.gc_summary_stats");
828 memset(sum->gc_summary_stats, 0, sizeof_gc_summary_stats);
829 }
830
free_RTSSummaryStats(RTSSummaryStats * sum)831 static void free_RTSSummaryStats(RTSSummaryStats * sum)
832 {
833 stgFree(sum->gc_summary_stats);
834 sum->gc_summary_stats = NULL;
835 }
836
837 // Must hold stats_mutex.
report_summary(const RTSSummaryStats * sum)838 static void report_summary(const RTSSummaryStats* sum)
839 {
840 // We should do no calculation, other than unit changes and formatting, and
841 // we should not use any data from outside of globals, sum and stats
842 // here. See Note [RTS Stats Reporting]
843
844 uint32_t g;
845 char temp[512];
846 showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
847 statsPrintf("%16s bytes allocated in the heap\n", temp);
848
849 showStgWord64(stats.copied_bytes, temp, true/*commas*/);
850 statsPrintf("%16s bytes copied during GC\n", temp);
851
852 if ( stats.major_gcs > 0 ) {
853 showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
854 statsPrintf("%16s bytes maximum residency (%" FMT_Word32
855 " sample(s))\n",
856 temp, stats.major_gcs);
857 }
858
859 showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
860 statsPrintf("%16s bytes maximum slop\n", temp);
861
862 statsPrintf("%16" FMT_Word64 " MiB total memory in use (%"
863 FMT_Word64 " MB lost due to fragmentation)\n\n",
864 stats.max_mem_in_use_bytes / (1024 * 1024),
865 sum->fragmentation_bytes / (1024 * 1024));
866
867 /* Print garbage collections in each gen */
868 statsPrintf(" Tot time (elapsed) Avg pause Max pause\n");
869 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
870 const GenerationSummaryStats * gen_stats =
871 &sum->gc_summary_stats[g];
872 statsPrintf(" Gen %2d %5d colls"
873 ", %5d par %6.3fs %6.3fs %3.4fs %3.4fs\n",
874 g, // REVIEWERS: this used to be gen->no
875 //, this can't ever be different right?
876 gen_stats->collections,
877 gen_stats->par_collections,
878 TimeToSecondsDbl(gen_stats->cpu_ns),
879 TimeToSecondsDbl(gen_stats->elapsed_ns),
880 TimeToSecondsDbl(gen_stats->avg_pause_ns),
881 TimeToSecondsDbl(gen_stats->max_pause_ns));
882 }
883 if (RtsFlags.GcFlags.useNonmoving) {
884 const int n_major_colls = sum->gc_summary_stats[RtsFlags.GcFlags.generations-1].collections;
885 statsPrintf(" Gen 1 %5d syncs"
886 ", %6.3fs %3.4fs %3.4fs\n",
887 n_major_colls,
888 TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns),
889 TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns) / n_major_colls,
890 TimeToSecondsDbl(stats.nonmoving_gc_sync_max_elapsed_ns));
891 statsPrintf(" Gen 1 concurrent"
892 ", %6.3fs %6.3fs %3.4fs %3.4fs\n",
893 TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns),
894 TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns),
895 TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns) / n_major_colls,
896 TimeToSecondsDbl(stats.nonmoving_gc_max_elapsed_ns));
897 }
898
899 statsPrintf("\n");
900
901 #if defined(THREADED_RTS)
902 if (RtsFlags.ParFlags.parGcEnabled && sum->work_balance > 0) {
903 // See Note [Work Balance]
904 statsPrintf(" Parallel GC work balance: "
905 "%.2f%% (serial 0%%, perfect 100%%)\n\n",
906 sum->work_balance * 100);
907 }
908
909 statsPrintf(" TASKS: %d "
910 "(%d bound, %d peak workers (%d total), using -N%d)\n\n",
911 taskCount, sum->bound_task_count,
912 peakWorkerCount, workerCount,
913 n_capabilities);
914
915 statsPrintf(" SPARKS: %" FMT_Word64
916 " (%" FMT_Word " converted, %" FMT_Word " overflowed, %"
917 FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
918 sum->sparks_count,
919 sum->sparks.converted, sum->sparks.overflowed,
920 sum->sparks.dud, sum->sparks.gcd,
921 sum->sparks.fizzled);
922 #endif
923
924 statsPrintf(" INIT time %7.3fs (%7.3fs elapsed)\n",
925 TimeToSecondsDbl(stats.init_cpu_ns),
926 TimeToSecondsDbl(stats.init_elapsed_ns));
927
928 statsPrintf(" MUT time %7.3fs (%7.3fs elapsed)\n",
929 TimeToSecondsDbl(stats.mutator_cpu_ns),
930 TimeToSecondsDbl(stats.mutator_elapsed_ns));
931 statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n",
932 TimeToSecondsDbl(stats.gc_cpu_ns),
933 TimeToSecondsDbl(stats.gc_elapsed_ns));
934 if (RtsFlags.GcFlags.useNonmoving) {
935 statsPrintf(
936 " CONC GC time %7.3fs (%7.3fs elapsed)\n",
937 TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns),
938 TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns));
939 }
940
941 #if defined(PROFILING)
942 statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n",
943 TimeToSecondsDbl(sum->rp_cpu_ns),
944 TimeToSecondsDbl(sum->rp_elapsed_ns));
945 statsPrintf(" PROF time %7.3fs (%7.3fs elapsed)\n",
946 TimeToSecondsDbl(sum->hc_cpu_ns),
947 TimeToSecondsDbl(sum->hc_elapsed_ns));
948 #endif
949 statsPrintf(" EXIT time %7.3fs (%7.3fs elapsed)\n",
950 TimeToSecondsDbl(sum->exit_cpu_ns),
951 TimeToSecondsDbl(sum->exit_elapsed_ns));
952 statsPrintf(" Total time %7.3fs (%7.3fs elapsed)\n\n",
953 TimeToSecondsDbl(stats.cpu_ns),
954 TimeToSecondsDbl(stats.elapsed_ns));
955 #if !defined(THREADED_RTS)
956 statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n",
957 sum->gc_cpu_percent * 100,
958 sum->gc_elapsed_percent * 100);
959 #endif
960
961 showStgWord64(sum->alloc_rate, temp, true/*commas*/);
962
963 statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp);
964
965 statsPrintf(" Productivity %5.1f%% of total user, "
966 "%.1f%% of total elapsed\n\n",
967 sum->productivity_cpu_percent * 100,
968 sum->productivity_elapsed_percent * 100);
969
970 // See Note [Internal Counter Stats] for a description of the
971 // following counters. If you add a counter here, please remember
972 // to update the Note.
973 if (RtsFlags.MiscFlags.internalCounters) {
974 #if defined(THREADED_RTS) && defined(PROF_SPIN)
975 const int32_t col_width[] = {4, -30, 14, 14};
976 statsPrintf("Internal Counters:\n");
977 statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
978 , col_width[0], ""
979 , col_width[1], "SpinLock"
980 , col_width[2], "Spins"
981 , col_width[3], "Yields");
982 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
983 , col_width[0], ""
984 , col_width[1], "gc_alloc_block_sync"
985 , col_width[2], gc_alloc_block_sync.spin
986 , col_width[3], gc_alloc_block_sync.yield);
987 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
988 , col_width[0], ""
989 , col_width[1], "gc_spin"
990 , col_width[2], stats.gc_spin_spin
991 , col_width[3], stats.gc_spin_yield);
992 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
993 , col_width[0], ""
994 , col_width[1], "mut_spin"
995 , col_width[2], stats.mut_spin_spin
996 , col_width[3], stats.mut_spin_yield);
997 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
998 , col_width[0], ""
999 , col_width[1], "whitehole_gc"
1000 , col_width[2], whitehole_gc_spin
1001 , col_width[3], "n/a");
1002 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
1003 , col_width[0], ""
1004 , col_width[1], "whitehole_threadPaused"
1005 , col_width[2], whitehole_threadPaused_spin
1006 , col_width[3], "n/a");
1007 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
1008 , col_width[0], ""
1009 , col_width[1], "whitehole_executeMessage"
1010 , col_width[2], whitehole_executeMessage_spin
1011 , col_width[3], "n/a");
1012 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
1013 , col_width[0], ""
1014 , col_width[1], "whitehole_lockClosure"
1015 , col_width[2], whitehole_lockClosure_spin
1016 , col_width[3], whitehole_lockClosure_yield);
1017 // waitForGcThreads isn't really spin-locking(see the function)
1018 // but these numbers still seem useful.
1019 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
1020 , col_width[0], ""
1021 , col_width[1], "waitForGcThreads"
1022 , col_width[2], waitForGcThreads_spin
1023 , col_width[3], waitForGcThreads_yield);
1024
1025 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1026 int prefix_length = 0;
1027 statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
1028 col_width[0], "", g, &prefix_length);
1029 prefix_length -= col_width[0];
1030 int suffix_length = col_width[1] + prefix_length;
1031 suffix_length =
1032 suffix_length > 0 ? col_width[1] : suffix_length;
1033
1034 statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
1035 , suffix_length, "].sync"
1036 , col_width[2], generations[g].sync.spin
1037 , col_width[3], generations[g].sync.yield);
1038 }
1039 statsPrintf("\n");
1040 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
1041 , col_width[0], ""
1042 , col_width[1], "any_work"
1043 , col_width[2], stats.any_work);
1044 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
1045 , col_width[0], ""
1046 , col_width[1], "no_work"
1047 , col_width[2], stats.no_work);
1048 statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
1049 , col_width[0], ""
1050 , col_width[1], "scav_find_work"
1051 , col_width[2], stats.scav_find_work);
1052 #elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
1053 statsPrintf("Internal Counters require the RTS to be built "
1054 "with PROF_SPIN"); // PROF_SPIN is not #defined here
1055 #else // THREADED_RTS
1056 statsPrintf("Internal Counters require the threaded RTS");
1057 #endif
1058 }
1059 }
1060
report_machine_readable(const RTSSummaryStats * sum)1061 static void report_machine_readable (const RTSSummaryStats * sum)
1062 {
1063 // We should do no calculation, other than unit changes and formatting, and
1064 // we should not use any data from outside of globals, sum and stats
1065 // here. See Note [RTS Stats Reporting]
1066 uint32_t g;
1067
1068 #define MR_STAT(field_name,format,value) \
1069 statsPrintf(" ,(\"" field_name "\", \"%" format "\")\n", value)
1070 #define MR_STAT_GEN(gen,field_name,format,value) \
1071 statsPrintf(" ,(\"gen_%" FMT_Word32 "_" field_name "\", \"%" \
1072 format "\")\n", g, value)
1073
1074 // These first values are for backwards compatibility.
1075 // Some of these first fields are duplicated with more machine-readable
1076 // names, or to match the name in RtsStats.
1077
1078 // we don't use for the first field helper macro here because the prefix is
1079 // different
1080 statsPrintf(" [(\"%s\", \"%" FMT_Word64 "\")\n", "bytes allocated",
1081 stats.allocated_bytes);
1082 MR_STAT("num_GCs", FMT_Word32, stats.gcs);
1083 MR_STAT("average_bytes_used", FMT_Word64, sum->average_bytes_used);
1084 MR_STAT("max_bytes_used", FMT_Word64, stats.max_live_bytes);
1085 MR_STAT("num_byte_usage_samples", FMT_Word32, stats.major_gcs);
1086 MR_STAT("peak_megabytes_allocated", FMT_Word64,
1087 stats.max_mem_in_use_bytes / (1024 * 1024));
1088
1089 MR_STAT("init_cpu_seconds", "f", TimeToSecondsDbl(stats.init_cpu_ns));
1090 MR_STAT("init_wall_seconds", "f", TimeToSecondsDbl(stats.init_elapsed_ns));
1091 MR_STAT("mut_cpu_seconds", "f", TimeToSecondsDbl(stats.mutator_cpu_ns));
1092 MR_STAT("mut_wall_seconds", "f",
1093 TimeToSecondsDbl(stats.mutator_elapsed_ns));
1094 MR_STAT("GC_cpu_seconds", "f", TimeToSecondsDbl(stats.gc_cpu_ns));
1095 MR_STAT("GC_wall_seconds", "f", TimeToSecondsDbl(stats.gc_elapsed_ns));
1096
1097 // end backward compatibility
1098
1099 // First, the rest of the times
1100
1101 MR_STAT("exit_cpu_seconds", "f", TimeToSecondsDbl(sum->exit_cpu_ns));
1102 MR_STAT("exit_wall_seconds", "f", TimeToSecondsDbl(sum->exit_elapsed_ns));
1103 #if defined(PROFILING)
1104 MR_STAT("rp_cpu_seconds", "f", TimeToSecondsDbl(sum->rp_cpu_ns));
1105 MR_STAT("rp_wall_seconds", "f", TimeToSecondsDbl(sum->rp_elapsed_ns));
1106 MR_STAT("hc_cpu_seconds", "f", TimeToSecondsDbl(sum->hc_cpu_ns));
1107 MR_STAT("hc_wall_seconds", "f", TimeToSecondsDbl(sum->hc_elapsed_ns));
1108 #endif
1109 MR_STAT("total_cpu_seconds", "f", TimeToSecondsDbl(stats.cpu_ns));
1110 MR_STAT("total_wall_seconds", "f",
1111 TimeToSecondsDbl(stats.elapsed_ns));
1112
1113 // next, the remainder of the fields of RTSStats, except internal counters
1114
1115 // The first two are duplicates of those above, but have more machine
1116 // readable names that match the field names in RTSStats.
1117
1118
1119 // gcs has been done as num_GCs above
1120 MR_STAT("major_gcs", FMT_Word32, stats.major_gcs);
1121 MR_STAT("allocated_bytes", FMT_Word64, stats.allocated_bytes);
1122 MR_STAT("max_live_bytes", FMT_Word64, stats.max_live_bytes);
1123 MR_STAT("max_large_objects_bytes", FMT_Word64,
1124 stats.max_large_objects_bytes);
1125 MR_STAT("max_compact_bytes", FMT_Word64, stats.max_compact_bytes);
1126 MR_STAT("max_slop_bytes", FMT_Word64, stats.max_slop_bytes);
1127 // This duplicates, except for unit, peak_megabytes_allocated above
1128 MR_STAT("max_mem_in_use_bytes", FMT_Word64, stats.max_mem_in_use_bytes);
1129 MR_STAT("cumulative_live_bytes", FMT_Word64, stats.cumulative_live_bytes);
1130 MR_STAT("copied_bytes", FMT_Word64, stats.copied_bytes);
1131 MR_STAT("par_copied_bytes", FMT_Word64, stats.par_copied_bytes);
1132 MR_STAT("cumulative_par_max_copied_bytes", FMT_Word64,
1133 stats.cumulative_par_max_copied_bytes);
1134 MR_STAT("cumulative_par_balanced_copied_bytes", FMT_Word64,
1135 stats.cumulative_par_balanced_copied_bytes);
1136
1137 // next, the computed fields in RTSSummaryStats
1138 #if !defined(THREADED_RTS) // THREADED_RTS
1139 MR_STAT("gc_cpu_percent", "f", sum->gc_cpu_percent);
1140 MR_STAT("gc_wall_percent", "f", sum->gc_cpu_percent);
1141 #endif
1142 MR_STAT("fragmentation_bytes", FMT_Word64, sum->fragmentation_bytes);
1143 // average_bytes_used is done above
1144 MR_STAT("alloc_rate", FMT_Word64, sum->alloc_rate);
1145 MR_STAT("productivity_cpu_percent", "f", sum->productivity_cpu_percent);
1146 MR_STAT("productivity_wall_percent", "f",
1147 sum->productivity_elapsed_percent);
1148
1149 // next, the THREADED_RTS fields in RTSSummaryStats
1150
1151 #if defined(THREADED_RTS)
1152 MR_STAT("bound_task_count", FMT_Word32, sum->bound_task_count);
1153 MR_STAT("sparks_count", FMT_Word64, sum->sparks_count);
1154 MR_STAT("sparks_converted", FMT_Word, sum->sparks.converted);
1155 MR_STAT("sparks_overflowed", FMT_Word, sum->sparks.overflowed);
1156 MR_STAT("sparks_dud ", FMT_Word, sum->sparks.dud);
1157 MR_STAT("sparks_gcd", FMT_Word, sum->sparks.gcd);
1158 MR_STAT("sparks_fizzled", FMT_Word, sum->sparks.fizzled);
1159 MR_STAT("work_balance", "f", sum->work_balance);
1160
1161 // next, globals (other than internal counters)
1162 MR_STAT("n_capabilities", FMT_Word32, n_capabilities);
1163 MR_STAT("task_count", FMT_Word32, taskCount);
1164 MR_STAT("peak_worker_count", FMT_Word32, peakWorkerCount);
1165 MR_STAT("worker_count", FMT_Word32, workerCount);
1166
1167 // next, internal counters
1168 #if defined(PROF_SPIN)
1169 MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
1170 MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
1171 gc_alloc_block_sync.yield);
1172 MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
1173 MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
1174 MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
1175 MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
1176 MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield);
1177 MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
1178 MR_STAT("waitForGcThreads_yield", FMT_Word64,
1179 waitForGcThreads_yield);
1180 MR_STAT("whitehole_gc_spin", FMT_Word64, whitehole_gc_spin);
1181 MR_STAT("whitehole_lockClosure_spin", FMT_Word64,
1182 whitehole_lockClosure_spin);
1183 MR_STAT("whitehole_lockClosure_yield", FMT_Word64,
1184 whitehole_lockClosure_yield);
1185 MR_STAT("whitehole_executeMessage_spin", FMT_Word64,
1186 whitehole_executeMessage_spin);
1187 MR_STAT("whitehole_threadPaused_spin", FMT_Word64,
1188 whitehole_threadPaused_spin);
1189 MR_STAT("any_work", FMT_Word64,
1190 stats.any_work);
1191 MR_STAT("no_work", FMT_Word64,
1192 stats.no_work);
1193 MR_STAT("scav_find_work", FMT_Word64,
1194 stats.scav_find_work);
1195 #endif // PROF_SPIN
1196 #endif // THREADED_RTS
1197
1198 // finally, per-generation stats. Named as, for example for generation 0,
1199 // gen_0_collections
1200 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1201 const GenerationSummaryStats* gc_sum = &sum->gc_summary_stats[g];
1202 MR_STAT_GEN(g, "collections", FMT_Word32, gc_sum->collections);
1203 MR_STAT_GEN(g, "par_collections", FMT_Word32, gc_sum->par_collections);
1204 MR_STAT_GEN(g, "cpu_seconds", "f", TimeToSecondsDbl(gc_sum->cpu_ns));
1205 MR_STAT_GEN(g, "wall_seconds", "f",
1206 TimeToSecondsDbl(gc_sum->elapsed_ns));
1207 MR_STAT_GEN(g, "max_pause_seconds", "f",
1208 TimeToSecondsDbl(gc_sum->max_pause_ns));
1209 MR_STAT_GEN(g, "avg_pause_seconds", "f",
1210 TimeToSecondsDbl(gc_sum->avg_pause_ns));
1211 #if defined(THREADED_RTS) && defined(PROF_SPIN)
1212 MR_STAT_GEN(g, "sync_spin", FMT_Word64, gc_sum->sync_spin);
1213 MR_STAT_GEN(g, "sync_yield", FMT_Word64, gc_sum->sync_yield);
1214 #endif
1215 }
1216
1217 statsPrintf(" ]\n");
1218 }
1219
1220 // Must hold stats_mutex.
report_one_line(const RTSSummaryStats * sum)1221 static void report_one_line(const RTSSummaryStats * sum)
1222 {
1223 // We should do no calculation, other than unit changes and formatting, and
1224 // we should not use any data from outside of globals, sum and stats
1225 // here. See Note [RTS Stats Reporting]
1226
1227 /* print the long long separately to avoid bugginess on mingwin (2001-07-02,
1228 mingw-0.5) */
1229 statsPrintf("<<ghc: %" FMT_Word64 " bytes, "
1230 "%" FMT_Word32 " GCs, "
1231 "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency "
1232 "(%" FMT_Word32 " samples), "
1233 "%" FMT_Word64 "M in use, "
1234 "%.3f INIT (%.3f elapsed), "
1235 "%.3f MUT (%.3f elapsed), "
1236 "%.3f GC (%.3f elapsed) :ghc>>\n",
1237 stats.allocated_bytes,
1238 stats.gcs,
1239 sum->average_bytes_used,
1240 stats.max_live_bytes,
1241 stats.major_gcs,
1242 stats.max_mem_in_use_bytes / (1024 * 1024),
1243 TimeToSecondsDbl(stats.init_cpu_ns),
1244 TimeToSecondsDbl(stats.init_elapsed_ns),
1245 TimeToSecondsDbl(stats.mutator_cpu_ns),
1246 TimeToSecondsDbl(stats.mutator_elapsed_ns),
1247 TimeToSecondsDbl(stats.gc_cpu_ns),
1248 TimeToSecondsDbl(stats.gc_elapsed_ns));
1249 }
1250
1251 void
stat_exitReport(void)1252 stat_exitReport (void)
1253 {
1254 RTSSummaryStats sum;
1255 init_RTSSummaryStats(&sum);
1256 // We'll need to refer to task counters later
1257 ACQUIRE_LOCK(&all_tasks_mutex);
1258
1259 if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
1260 // First we tidy the times in stats, and populate the times in sum.
1261 // In particular, we adjust the gc_* time in stats to remove
1262 // profiling times.
1263 {
1264 Time now_cpu_ns, now_elapsed_ns;
1265 getProcessTimes(&now_cpu_ns, &now_elapsed_ns);
1266
1267 ACQUIRE_LOCK(&stats_mutex);
1268 stats.cpu_ns = now_cpu_ns - start_init_cpu;
1269 stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
1270 /* avoid divide by zero if stats.total_cpu_ns is measured as 0.00
1271 seconds -- SDM */
1272 if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
1273 if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }
1274
1275 #if defined(PROFILING)
1276 sum.rp_cpu_ns = RP_tot_time;
1277 sum.rp_elapsed_ns = RPe_tot_time;
1278 sum.hc_cpu_ns = HC_tot_time;
1279 sum.hc_elapsed_ns = HCe_tot_time;
1280 #endif // PROFILING
1281
1282 // We do a GC during the EXIT phase. We'll attribute the cost of
1283 // that to GC instead of EXIT, so carefully subtract it from the
1284 // EXIT time.
1285 // Note that exit_gc includes RP and HC for the exit GC too.
1286 Time exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
1287 Time exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
1288
1289 ASSERT(exit_gc_elapsed > 0);
1290
1291 sum.exit_cpu_ns = end_exit_cpu
1292 - start_exit_cpu
1293 - exit_gc_cpu;
1294 sum.exit_elapsed_ns = end_exit_elapsed
1295 - start_exit_elapsed
1296 - exit_gc_elapsed;
1297
1298 ASSERT(sum.exit_elapsed_ns >= 0);
1299
1300 stats.mutator_cpu_ns = start_exit_cpu
1301 - end_init_cpu
1302 - (stats.gc_cpu_ns - exit_gc_cpu)
1303 - stats.nonmoving_gc_cpu_ns;
1304 stats.mutator_elapsed_ns = start_exit_elapsed
1305 - end_init_elapsed
1306 - (stats.gc_elapsed_ns - exit_gc_elapsed);
1307
1308 ASSERT(stats.mutator_elapsed_ns >= 0);
1309
1310 if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }
1311
1312 // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding
1313 // and subtracting, so the parts should add up to the total exactly.
1314 // Note that stats->total_ns is captured a tiny bit later than
1315 // end_exit_elapsed, so we don't use it here.
1316 ASSERT(stats.init_elapsed_ns // INIT
1317 + stats.mutator_elapsed_ns // MUT
1318 + stats.gc_elapsed_ns // GC
1319 + sum.exit_elapsed_ns // EXIT
1320 == end_exit_elapsed - start_init_elapsed);
1321
1322 // heapCensus() is called by the GC, so RP and HC time are
1323 // included in the GC stats. We therefore subtract them to
1324 // obtain the actual GC cpu time.
1325 Time prof_cpu = sum.rp_cpu_ns + sum.hc_cpu_ns;
1326 Time prof_elapsed = sum.rp_elapsed_ns + sum.hc_elapsed_ns;
1327
1328 stats.gc_cpu_ns -= prof_cpu;
1329 stats.gc_elapsed_ns -= prof_elapsed;
1330
1331 // This assertion is probably not necessary; make sure the
1332 // subdivision with PROF also makes sense
1333 ASSERT(stats.init_elapsed_ns // INIT
1334 + stats.mutator_elapsed_ns // MUT
1335 + stats.gc_elapsed_ns // GC
1336 + sum.exit_elapsed_ns // EXIT
1337 + (sum.rp_elapsed_ns + sum.hc_elapsed_ns) // PROF
1338 == end_exit_elapsed - start_init_elapsed);
1339 }
1340
1341 // REVIEWERS: it's not clear to me why the following isn't done in
1342 // stat_endGC of the last garbage collection?
1343
1344 // We account for the last garbage collection
1345 {
1346 uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
1347 stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
1348 stats.allocated_bytes = tot_alloc_bytes;
1349 if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
1350 statsPrintf("%9" FMT_Word " %9.9s %9.9s",
1351 (W_)stats.gc.allocated_bytes, "", "");
1352 statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
1353 }
1354 }
1355
1356 // We populate the remainder (non-time elements) of sum
1357 {
1358 #if defined(THREADED_RTS)
1359 sum.bound_task_count = taskCount - workerCount;
1360
1361 for (uint32_t i = 0; i < n_capabilities; i++) {
1362 sum.sparks.created += capabilities[i]->spark_stats.created;
1363 sum.sparks.dud += capabilities[i]->spark_stats.dud;
1364 sum.sparks.overflowed+=
1365 capabilities[i]->spark_stats.overflowed;
1366 sum.sparks.converted +=
1367 capabilities[i]->spark_stats.converted;
1368 sum.sparks.gcd += capabilities[i]->spark_stats.gcd;
1369 sum.sparks.fizzled += capabilities[i]->spark_stats.fizzled;
1370 }
1371
1372 sum.sparks_count = sum.sparks.created
1373 + sum.sparks.dud
1374 + sum.sparks.overflowed;
1375
1376 if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
1377 // See Note [Work Balance]
1378 sum.work_balance =
1379 (double)stats.cumulative_par_balanced_copied_bytes
1380 / (double)stats.par_copied_bytes;
1381 } else {
1382 sum.work_balance = 0;
1383 }
1384
1385
1386 #else // THREADED_RTS
1387 sum.gc_cpu_percent = stats.gc_cpu_ns
1388 / stats.cpu_ns;
1389 sum.gc_elapsed_percent = stats.gc_elapsed_ns
1390 / stats.elapsed_ns;
1391 #endif // THREADED_RTS
1392
1393 sum.fragmentation_bytes =
1394 (uint64_t)(peak_mblocks_allocated
1395 * BLOCKS_PER_MBLOCK
1396 * BLOCK_SIZE_W
1397 - hw_alloc_blocks * BLOCK_SIZE_W)
1398 * (uint64_t)sizeof(W_);
1399
1400 sum.average_bytes_used = stats.major_gcs == 0 ? 0 :
1401 stats.cumulative_live_bytes/stats.major_gcs,
1402
1403 sum.alloc_rate = stats.mutator_cpu_ns == 0 ? 0 :
1404 (uint64_t)((double)stats.allocated_bytes
1405 / TimeToSecondsDbl(stats.mutator_cpu_ns));
1406
1407 // REVIEWERS: These two values didn't used to include the exit times
1408 sum.productivity_cpu_percent =
1409 TimeToSecondsDbl(stats.cpu_ns
1410 - stats.gc_cpu_ns
1411 - stats.init_cpu_ns
1412 - sum.exit_cpu_ns)
1413 / TimeToSecondsDbl(stats.cpu_ns);
1414
1415 ASSERT(sum.productivity_cpu_percent >= 0);
1416
1417 sum.productivity_elapsed_percent =
1418 TimeToSecondsDbl(stats.elapsed_ns
1419 - stats.gc_elapsed_ns
1420 - stats.init_elapsed_ns
1421 - sum.exit_elapsed_ns)
1422 / TimeToSecondsDbl(stats.elapsed_ns);
1423
1424 ASSERT(sum.productivity_elapsed_percent >= 0);
1425
1426 for(uint32_t g = 0; g < RtsFlags.GcFlags.generations; ++g) {
1427 const generation* gen = &generations[g];
1428 GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
1429 gen_stats->collections = gen->collections;
1430 gen_stats->par_collections = gen->par_collections;
1431 gen_stats->cpu_ns = GC_coll_cpu[g];
1432 gen_stats->elapsed_ns = GC_coll_elapsed[g];
1433 gen_stats->max_pause_ns = GC_coll_max_pause[g];
1434 gen_stats->avg_pause_ns = gen->collections == 0 ?
1435 0 : (GC_coll_elapsed[g] / gen->collections);
1436 #if defined(THREADED_RTS) && defined(PROF_SPIN)
1437 gen_stats->sync_spin = gen->sync.spin;
1438 gen_stats->sync_yield = gen->sync.yield;
1439 #endif // PROF_SPIN
1440 }
1441 }
1442
1443 // Now we generate the report
1444 if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
1445 report_summary(&sum);
1446 }
1447
1448 if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
1449 if (RtsFlags.MiscFlags.machineReadable) {
1450 report_machine_readable(&sum);
1451 }
1452 else {
1453 report_one_line(&sum);
1454 }
1455 }
1456 RELEASE_LOCK(&stats_mutex);
1457
1458 statsFlush();
1459 statsClose();
1460 }
1461
1462 free_RTSSummaryStats(&sum);
1463
1464 if (GC_coll_cpu) {
1465 stgFree(GC_coll_cpu);
1466 GC_coll_cpu = NULL;
1467 }
1468 if (GC_coll_elapsed) {
1469 stgFree(GC_coll_elapsed);
1470 GC_coll_elapsed = NULL;
1471 }
1472 if (GC_coll_max_pause) {
1473 stgFree(GC_coll_max_pause);
1474 GC_coll_max_pause = NULL;
1475 }
1476
1477 RELEASE_LOCK(&all_tasks_mutex);
1478 }
1479
stat_exit()1480 void stat_exit()
1481 {
1482 #if defined(THREADED_RTS)
1483 closeMutex(&stats_mutex);
1484 #endif
1485 }
1486
1487 /* Note [Work Balance]
1488 ----------------------
1489 Work balance is a measure of how evenly the work done during parallel garbage
1490 collection is spread across threads. To compute work balance we must take care
1491 to account for the number of GC threads changing between GCs. The statistics we
1492 track must have the number of GC threads "integrated out".
1493
1494 We accumulate two values from each garbage collection:
1495 * par_copied: is a measure of the total work done during parallel garbage
1496 collection
1497 * par_balanced_copied: is a measure of the balanced work done
1498 during parallel garbage collection.
1499
1500 par_copied is simple to compute, but par_balanced_copied_bytes is somewhat more
1501 complicated:
1502
1503 For a given garbage collection:
1504 Let gc_copied := total copies during the gc
1505 gc_copied_i := copies by the ith thread during the gc
1506 num_gc_threads := the number of threads participating in the gc
1507 balance_limit := (gc_copied / num_gc_threads)
1508
1509 If we were to graph gc_copied_i, sorted from largest to smallest we would see
1510 something like:
1511
1512 |X
1513 ^ |X X
1514 | |X X X X: unbalanced copies
1515 copies |----------- Y: balanced copies by the busiest GC thread
1516 |Y Z Z Z: other balanced copies
1517 |Y Z Z Z
1518 |Y Z Z Z Z
1519 |Y Z Z Z Z Z
1520 |===========
1521 |1 2 3 4 5 6
1522 i ->
1523
1524 where the --- line is at balance_limit. Balanced copies are those under the ---
1525 line, i.e. the area of the Ys and Zs. Note that the area occupied by the Ys will
1526 always equal balance_limit. Completely balanced gc has every thread copying
1527 balance_limit and a completely unbalanced gc has a single thread copying
1528 gc_copied.
1529
1530 One could define par_balance_copied as the areas of the Ys and Zs in the graph
1531 above, however we would like the ratio of (par_balance_copied / gc_copied) to
1532 range from 0 to 1, so that work_balance will be a nice percentage, also ranging
1533 from 0 to 1. We therefore define par_balanced_copied as:
1534
1535 ( num_gc_threads )
1536 {Sum[Min(gc_copied_i,balance_limit)] - balance_limit} * (------------------)
1537 i (num_gc_threads - 1)
1538 vvv vvv
1539 S T
1540
1541 Where the S and T terms serve to remove the area of the Ys, and
1542 to normalize the result to lie between 0 and gc_copied.
1543
1544 Note that the implementation orders these operations differently to minimize
1545 error due to integer rounding.
1546
1547 Then cumulative work balance is computed as
1548 (cumulative_par_balanced_copied_bytes / par_copied_byes)
1549
1550 Previously, cumulative work balance was computed as:
1551
1552 (cumulative_par_max_copied_bytes)
1553 (-------------------------------) - 1
1554 ( par_copied_bytes )
1555 -------------------------------------
1556 (n_capabilities - 1)
1557
1558 This was less accurate than the current method, and invalid whenever a garbage
1559 collection had occurred with num_gc_threads /= n_capabilities; which can happen
1560 when setNumCapabilities is called, when -qn is passed as an RTS option, or when
1561 the number of gc threads is limited to the number of cores.
1562 See #13830
1563 */
1564
1565 /*
1566 Note [Internal Counter Stats]
1567 -----------------------------
1568 What do the counts at the end of a '+RTS -s --internal-counters' report mean?
1569 They are detailed below. Most of these counters are used by multiple threads
1570 with no attempt at synchronisation. This means that reported values may be
1571 lower than the true value and this becomes more likely and more severe as
1572 contention increases.
1573
1574 The first counters are for various SpinLock-like constructs in the RTS. See
1575 Spinlock.h for the definition of a SpinLock. We maintain up two counters per
1576 SpinLock:
1577 * spin: The number of busy-spins over the length of the program.
1578 * yield: The number of times the SpinLock spun SPIN_COUNT times without success
1579 and called yieldThread().
1580 Not all of these are actual SpinLocks, see the details below.
1581
1582 Actual SpinLocks:
1583 * gc_alloc_block:
1584 This SpinLock protects the block allocator and free list manager. See
1585 BlockAlloc.c.
1586 * gc_spin and mut_spin:
1587 These SpinLocks are used to herd gc worker threads during parallel garbage
1588 collection. See gcWorkerThread, wakeup_gc_threads and releaseGCThreads.
1589 * gen[g].sync:
1590 These SpinLocks, one per generation, protect the generations[g] data
1591 structure during garbage collection.
1592
1593 waitForGcThreads:
1594 These counters are incremented while we wait for all threads to be ready
1595 for a parallel garbage collection. We yield more than we spin in this case.
1596
1597 In several places in the runtime we must take a lock on a closure. To do this,
1598 we replace its info table with stg_WHITEHOLE_info, spinning if it is already
1599 a white-hole. Sometimes we yieldThread() if we spin too long, sometimes we
1600 don't. We count these white-hole spins and include them in the SpinLocks table.
1601 If a particular loop does not yield, we put "n/a" in the table. They are named
1602 for the function that has the spinning loop except that several loops in the
1603 garbage collector accumulate into whitehole_gc.
1604 TODO: Should these counters be more or less granular?
1605
1606 white-hole spin counters:
1607 * whitehole_gc
1608 * whitehole_lockClosure
1609 * whitehole_executeMessage
1610 * whitehole_threadPaused
1611
1612
1613 We count the number of calls of several functions in the parallel garbage
1614 collector.
1615
1616 Parallel garbage collector counters:
1617 * any_work:
1618 A cheap function called whenever a gc_thread is ready for work. Does
1619 not do any work.
1620 * no_work:
1621 Incremented whenever any_work finds no work.
1622 * scav_find_work:
1623 Called to do work when any_work return true.
1624
1625 */
1626
1627 /* -----------------------------------------------------------------------------
1628 stat_describe_gens
1629
1630 Produce some detailed info on the state of the generational GC.
1631 -------------------------------------------------------------------------- */
1632 void
statDescribeGens(void)1633 statDescribeGens(void)
1634 {
1635 uint32_t g, mut, lge, compacts, i;
1636 W_ gen_slop;
1637 W_ tot_live, tot_slop;
1638 W_ gen_live, gen_blocks;
1639 bdescr *bd;
1640 generation *gen;
1641
1642 debugBelch(
1643 "----------------------------------------------------------------------\n"
1644 " Gen Max Mut-list Blocks Large Compacts Live Slop\n"
1645 " Blocks Bytes Objects \n"
1646 "----------------------------------------------------------------------\n");
1647
1648 tot_live = 0;
1649 tot_slop = 0;
1650
1651 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1652 gen = &generations[g];
1653
1654 for (bd = gen->large_objects, lge = 0; bd; bd = bd->link) {
1655 lge++;
1656 }
1657
1658 for (bd = gen->compact_objects, compacts = 0; bd; bd = bd->link) {
1659 compacts++;
1660 }
1661
1662 gen_live = genLiveWords(gen);
1663 gen_blocks = genLiveBlocks(gen);
1664
1665 mut = 0;
1666 for (i = 0; i < n_capabilities; i++) {
1667 mut += countOccupied(capabilities[i]->mut_lists[g]);
1668
1669 // Add the pinned object block.
1670 bd = capabilities[i]->pinned_object_block;
1671 if (bd != NULL) {
1672 gen_live += bd->free - bd->start;
1673 gen_blocks += bd->blocks;
1674 }
1675
1676 gen_live += gcThreadLiveWords(i,g);
1677 gen_blocks += gcThreadLiveBlocks(i,g);
1678 }
1679
1680 debugBelch("%5d %7" FMT_Word " %9d", g, (W_)gen->max_blocks, mut);
1681
1682 gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;
1683
1684 debugBelch("%8" FMT_Word " %8d %8d %9" FMT_Word " %9" FMT_Word "\n",
1685 gen_blocks, lge, compacts, gen_live*(W_)sizeof(W_),
1686 gen_slop*(W_)sizeof(W_));
1687 tot_live += gen_live;
1688 tot_slop += gen_slop;
1689 }
1690 debugBelch("----------------------------------------------------------------------\n");
1691 debugBelch("%51s%9" FMT_Word " %9" FMT_Word "\n",
1692 "",tot_live*(W_)sizeof(W_),tot_slop*(W_)sizeof(W_));
1693 debugBelch("----------------------------------------------------------------------\n");
1694 debugBelch("\n");
1695 }
1696
1697 /* -----------------------------------------------------------------------------
1698 Stats available via a programmatic interface, so eg. GHCi can time
1699 each compilation and expression evaluation.
1700 -------------------------------------------------------------------------- */
1701
getAllocations(void)1702 uint64_t getAllocations( void )
1703 {
1704 ACQUIRE_LOCK(&stats_mutex);
1705 StgWord64 n = stats.allocated_bytes;
1706 RELEASE_LOCK(&stats_mutex);
1707 return n;
1708 }
1709
getRTSStatsEnabled(void)1710 int getRTSStatsEnabled( void )
1711 {
1712 return RtsFlags.GcFlags.giveStats != NO_GC_STATS;
1713 }
1714
getRTSStats(RTSStats * s)1715 void getRTSStats( RTSStats *s )
1716 {
1717 Time current_elapsed = 0;
1718 Time current_cpu = 0;
1719
1720 ACQUIRE_LOCK(&stats_mutex);
1721 *s = stats;
1722 RELEASE_LOCK(&stats_mutex);
1723
1724 getProcessTimes(¤t_cpu, ¤t_elapsed);
1725 s->cpu_ns = current_cpu - end_init_cpu;
1726 s->elapsed_ns = current_elapsed - end_init_elapsed;
1727
1728 s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns -
1729 stats.nonmoving_gc_cpu_ns;
1730 s->mutator_elapsed_ns = current_elapsed - end_init_elapsed -
1731 stats.gc_elapsed_ns;
1732 }
1733
1734 /* -----------------------------------------------------------------------------
1735 Dumping stuff in the stats file, or via the debug message interface
1736 -------------------------------------------------------------------------- */
1737
1738 void
statsPrintf(char * s,...)1739 statsPrintf( char *s, ... )
1740 {
1741 FILE *sf = RtsFlags.GcFlags.statsFile;
1742 va_list ap;
1743
1744 va_start(ap,s);
1745 if (sf == NULL) {
1746 vdebugBelch(s,ap);
1747 } else {
1748 vfprintf(sf, s, ap);
1749 }
1750 va_end(ap);
1751 }
1752
1753 static void
statsFlush(void)1754 statsFlush( void )
1755 {
1756 FILE *sf = RtsFlags.GcFlags.statsFile;
1757 if (sf != NULL) {
1758 fflush(sf);
1759 }
1760 }
1761
1762 static void
statsClose(void)1763 statsClose( void )
1764 {
1765 FILE *sf = RtsFlags.GcFlags.statsFile;
1766 if (sf != NULL) {
1767 fclose(sf);
1768 }
1769 }
1770