1 /*
2 Copyright (c) 2005-2020 Intel Corporation
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 */
16
17 #include "perf.h"
18
19 #include <cstdlib>
20 #include <cmath>
21 #include <vector>
22 #include <algorithm>
23 #include <cassert>
24
25 #include "tbb/tick_count.h"
26
27 #define HARNESS_CUSTOM_MAIN 1
28 #include "../src/test/harness.h"
29 #include "../src/test/harness_barrier.h"
30
31 #include "tbb/task_scheduler_init.h"
32 #include "tbb/task.h"
33 #include "tbb/atomic.h"
34
35 #if __linux__ || __APPLE__ || (__FreeBSD__||__DragonFly__) || __NetBSD__
36 #include <sys/resource.h>
37 #endif
38
39 __TBB_PERF_API int NumCpus = tbb::task_scheduler_init::default_num_threads(),
40 NumThreads,
41 MaxConcurrency;
42
43 namespace Perf {
44
45 SessionSettings theSettings;
46
47 namespace internal {
48
49 typedef std::vector<duration_t> durations_t;
50
51 static uintptr_t NumRuns = 7;
52 static duration_t RunDuration = 0.01;
53
54 static const int RateFieldLen = 10;
55 static const int OvhdFieldLen = 12;
56
57 const char* TestNameColumnTitle = "Test name";
58 const char* WorkloadNameColumnTitle = "Workload";
59
60 size_t TitleFieldLen = 0;
61 size_t WorkloadFieldLen = 0;
62
63 int TotalConfigs = 0;
64 int MaxTbbMasters = 1;
65
//! Defines the mapping between threads and cores in the undersubscription mode
/** When adding a new enumerator, insert it before amLast, and do not specify
    its value explicitly. **/
enum AffinitizationMode {
    amFirst = 0,
    amDense = amFirst,
    amSparse,
    //! Used to track the number of supported affinitization modes
    amLast
};

//! Number of enumerators between amFirst (inclusive) and amLast (exclusive)
static const int NumAffinitizationModes = amLast - amFirst;

//! Printable mode names, indexed by the AffinitizationMode value
const char* AffinitizationModeNames[] = { "dense", "sparse" };

//! Number of affinity modes walked by the session; PrepareTests derives it from the options
int NumActiveAffModes = 1;
82
83 //! Settings of a test run configuration
84 struct RunConfig {
85 int my_maxConcurrency;
86 int my_numThreads; // For task scheduler tests this is number of workers + 1
87 int my_numMasters; // Used for task scheduler tests only
88 int my_affinityMode; // Used for task scheduler tests only
89 int my_workloadID;
90
NumMastersPerf::internal::RunConfig91 int NumMasters () const {
92 return theSettings.my_opts & UseTaskScheduler ? my_numMasters : my_numThreads;
93 }
94 };
95
StandardDeviation(double avg,const durations_t & d)96 double StandardDeviation ( double avg, const durations_t& d ) {
97 double std_dev = 0;
98 for ( uintptr_t i = 0; i < d.size(); ++i ) {
99 double dev = fabs(d[i] - avg);
100 std_dev += dev * dev;
101 }
102 std_dev = sqrt(std_dev / d.size());
103 return std_dev / avg * 100;
104 }
105
Statistics(const durations_t & d,duration_t & avgTime,double & stdDev,duration_t & minTime,duration_t & maxTime)106 void Statistics ( const durations_t& d,
107 duration_t& avgTime, double& stdDev,
108 duration_t& minTime, duration_t& maxTime )
109 {
110 minTime = maxTime = avgTime = d[0];
111 for ( size_t i = 1; i < d.size(); ++i ) {
112 avgTime += d[i];
113 if ( minTime > d[i] )
114 minTime = d[i];
115 else if ( maxTime < d[i] )
116 maxTime = d[i];
117 }
118 avgTime = avgTime / d.size();
119 stdDev = StandardDeviation( avgTime, d );
120 }
121
122 //! Timing data for the series of repeated runs and results of their statistical processing
123 struct TimingSeries {
124 //! Statistical timing series
125 durations_t my_durations;
126
127 //! Average time obtained from my_durations data
128 duration_t my_avgTime;
129
130 //! Minimal time obtained from my_durations data
131 duration_t my_minTime;
132
133 //! Minimal time obtained from my_durations data
134 duration_t my_maxTime;
135
136 //! Standard deviation of my_avgTime value (per cent)
137 double my_stdDev;
138
TimingSeriesPerf::internal::TimingSeries139 TimingSeries ( uintptr_t nruns = NumRuns )
140 : my_durations(nruns), my_avgTime(0), my_minTime(0), my_maxTime(0)
141 {}
142
CalculateStatisticsPerf::internal::TimingSeries143 void CalculateStatistics () {
144 Statistics( my_durations, my_avgTime, my_stdDev, my_minTime, my_maxTime );
145 }
146 }; // struct TimingSeries
147
148 //! Settings and timing results for a test run configuration
149 struct RunResults {
150 //! Run configuration settings
151 RunConfig my_config;
152
153 //! Timing results for this run configuration
154 TimingSeries my_timing;
155 };
156
157 typedef std::vector<const char*> names_t;
158 typedef std::vector<TimingSeries> timings_t;
159 typedef std::vector<RunResults> test_results_t;
160
161 enum TestMethods {
162 idRunSerial = 0x01,
163 idOnStart = 0x02,
164 idOnFinish = 0x04,
165 idPrePostProcess = idOnStart | idOnFinish
166 };
167
168 //! Set of flags identifying methods not overridden by the currently active test
169 /** Used as a scratch var. **/
170 uintptr_t g_absentMethods;
171
172 //! Test object and timing results for all of its configurations
173 struct TestResults {
174 //! Pointer to the test object interface
175 Test* my_test;
176
177 //! Set of flags identifying optional methods overridden by my_test
178 /** A set of ORed TestMethods flags **/
179 uintptr_t my_availableMethods;
180
181 //! Vector of serial times for each workload supported by this test
182 /** Element index in the vector serves as a zero based workload ID. **/
183 timings_t my_serialBaselines;
184
185 //! Common baselines for both parallel and serial variants
186 /** Element index in the vector serves as a zero based workload ID. **/
187 timings_t my_baselines;
188
189 //! Strings identifying workloads to be used in output
190 names_t my_workloadNames;
191
192 //! Vector of timings for all run configurations of my_test
193 test_results_t my_results;
194
195 const char* my_testName;
196
197 mutable bool my_hasOwnership;
198
TestResultsPerf::internal::TestResults199 TestResults ( Test* t, const char* className, bool takeOwnership )
200 : my_test(t), my_availableMethods(0), my_testName(className), my_hasOwnership(takeOwnership)
201 {}
202
TestResultsPerf::internal::TestResults203 TestResults ( const TestResults& tr )
204 : my_test(tr.my_test)
205 , my_availableMethods(0)
206 , my_testName(tr.my_testName)
207 , my_hasOwnership(tr.my_hasOwnership)
208 {
209 tr.my_hasOwnership = false;
210 }
211
~TestResultsPerf::internal::TestResults212 ~TestResults () {
213 for ( size_t i = 0; i < my_workloadNames.size(); ++i )
214 delete my_workloadNames[i];
215 if ( my_hasOwnership )
216 delete my_test;
217 }
218 }; // struct TestResults
219
220 typedef std::vector<TestResults> session_t;
221
222 session_t theSession;
223
224 TimingSeries CalibrationTiming;
225
//! Number of elements in the array that WiperBody reads through
const uintptr_t CacheSize = 8*1024*1024;
//! Array traversed by WiperBody; volatile so that the reads cannot be optimized away
volatile intptr_t W[CacheSize];

//! Functor that reads every element of W once
/** WipeCaches() runs one instance per CPU. The integer argument is ignored. **/
struct WiperBody {
    void operator()( int ) const {
        volatile intptr_t sink = 0;
        for ( const volatile intptr_t *p = W, *end = W + CacheSize; p != end; ++p )
            sink += *p;
    }
};
236
TraceHistogram(const durations_t & t,const char * histogramFileName)237 void TraceHistogram ( const durations_t& t, const char* histogramFileName ) {
238 FILE* f = histogramFileName ? fopen(histogramFileName, "wt") : stdout;
239 uintptr_t n = t.size();
240 const uintptr_t num_buckets = 100;
241 double min_val = *std::min_element(t.begin(), t.end()),
242 max_val = *std::max_element(t.begin(), t.end()),
243 bucket_size = (max_val - min_val) / num_buckets;
244 std::vector<uintptr_t> hist(num_buckets + 1, 0);
245 for ( uintptr_t i = 0; i < n; ++i )
246 ++hist[uintptr_t((t[i]-min_val)/bucket_size)];
247 ASSERT (hist[num_buckets] == 1, "");
248 ++hist[num_buckets - 1];
249 hist.resize(num_buckets);
250 fprintf (f, "Histogram: nvals = %u, min = %g, max = %g, nbuckets = %u\n", (unsigned)n, min_val, max_val, (unsigned)num_buckets);
251 double bucket = min_val;
252 for ( uintptr_t i = 0; i < num_buckets; ++i, bucket+=bucket_size )
253 fprintf (f, "%12g\t%u\n", bucket, (unsigned)hist[i]);
254 fclose(f);
255 }
256
257 #if _MSC_VER
258 typedef DWORD_PTR cpu_set_t;
259
260 class AffinityHelper {
261 static const unsigned MaxAffinitySetSize = sizeof(cpu_set_t) * 8;
262 static unsigned AffinitySetSize;
263
264 //! Mapping from a CPU index to a valid affinity cpu_mask
265 /** The first element is not used. **/
266 static cpu_set_t m_affinities[MaxAffinitySetSize + 1];
267
268 static cpu_set_t m_processMask;
269
270 class Initializer {
271 public:
Initializer()272 Initializer () {
273 SYSTEM_INFO si;
274 GetNativeSystemInfo(&si);
275 ASSERT( si.dwNumberOfProcessors <= MaxAffinitySetSize, "Too many CPUs" );
276 AffinitySetSize = min (si.dwNumberOfProcessors, MaxAffinitySetSize);
277 cpu_set_t systemMask = 0;
278 GetProcessAffinityMask( GetCurrentProcess(), &m_processMask, &systemMask );
279 cpu_set_t cpu_mask = 1;
280 for ( DWORD i = 0; i < AffinitySetSize; ++i ) {
281 while ( !(cpu_mask & m_processMask) && cpu_mask )
282 cpu_mask <<= 1;
283 ASSERT( cpu_mask != 0, "Process affinity set is culled?" );
284 m_affinities[i] = cpu_mask;
285 cpu_mask <<= 1;
286 }
287 }
288 }; // class AffinityHelper::Initializer
289
290 static Initializer m_initializer;
291
292 public:
CpuAffinity(int cpuIndex)293 static cpu_set_t CpuAffinity ( int cpuIndex ) {
294 return m_affinities[cpuIndex % AffinitySetSize];
295 }
296
ProcessMask()297 static const cpu_set_t& ProcessMask () { return m_processMask; }
298 }; // class AffinityHelper
299
300 unsigned AffinityHelper::AffinitySetSize = 0;
301 cpu_set_t AffinityHelper::m_affinities[AffinityHelper::MaxAffinitySetSize + 1] = {0};
302 cpu_set_t AffinityHelper::m_processMask = 0;
303 AffinityHelper::Initializer AffinityHelper::m_initializer;
304
305 #define CPU_ZERO(cpu_mask) (*cpu_mask = 0)
306 #define CPU_SET(cpu_idx, cpu_mask) (*cpu_mask |= AffinityHelper::CpuAffinity(cpu_idx))
307 #define CPU_CLR(cpu_idx, cpu_mask) (*cpu_mask &= ~AffinityHelper::CpuAffinity(cpu_idx))
308 #define CPU_ISSET(cpu_idx, cpu_mask) ((*cpu_mask & AffinityHelper::CpuAffinity(cpu_idx)) != 0)
309
310 #elif __linux__ /* end of _MSC_VER */
311
312 #include <unistd.h>
313 #include <sys/types.h>
314 #include <linux/unistd.h>
315
gettid()316 pid_t gettid() { return (pid_t)syscall(__NR_gettid); }
317
318 #define GET_MASK(cpu_set) (*(unsigned*)(void*)&cpu_set)
319 #define RES_STAT(res) (res != 0 ? "failed" : "ok")
320
321 class AffinityHelper {
322 static cpu_set_t m_processMask;
323
324 class Initializer {
325 public:
Initializer()326 Initializer () {
327 CPU_ZERO (&m_processMask);
328 int res = sched_getaffinity( getpid(), sizeof(cpu_set_t), &m_processMask );
329 ASSERT ( res == 0, "sched_getaffinity failed" );
330 }
331 }; // class AffinityHelper::Initializer
332
333 static Initializer m_initializer;
334
335 public:
ProcessMask()336 static const cpu_set_t& ProcessMask () { return m_processMask; }
337 }; // class AffinityHelper
338
339 cpu_set_t AffinityHelper::m_processMask;
340 AffinityHelper::Initializer AffinityHelper::m_initializer;
341 #endif /* __linux__ */
342
PinTheThread(int cpu_idx,tbb::atomic<int> & nThreads)343 bool PinTheThread ( int cpu_idx, tbb::atomic<int>& nThreads ) {
344 #if _MSC_VER || __linux__
345 cpu_set_t orig_mask, target_mask;
346 CPU_ZERO( &target_mask );
347 CPU_SET( cpu_idx, &target_mask );
348 ASSERT ( CPU_ISSET(cpu_idx, &target_mask), "CPU_SET failed" );
349 #endif
350 #if _MSC_VER
351 orig_mask = SetThreadAffinityMask( GetCurrentThread(), target_mask );
352 if ( !orig_mask )
353 return false;
354 #elif __linux__
355 CPU_ZERO( &orig_mask );
356 int res = sched_getaffinity( gettid(), sizeof(cpu_set_t), &orig_mask );
357 ASSERT ( res == 0, "sched_getaffinity failed" );
358 res = sched_setaffinity( gettid(), sizeof(cpu_set_t), &target_mask );
359 ASSERT ( res == 0, "sched_setaffinity failed" );
360 #endif /* _MSC_VER */
361 --nThreads;
362 while ( nThreads )
363 __TBB_Yield();
364 #if _MSC_VER
365 SetThreadPriority (GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
366 #endif
367 return true;
368 }
369
370 class AffinitySetterTask : tbb::task {
371 static bool m_result;
372 static tbb::atomic<int> m_nThreads;
373 int m_idx;
374
execute()375 tbb::task* execute () {
376 //TestAffinityOps();
377 m_result = PinTheThread( m_idx, m_nThreads );
378 return NULL;
379 }
380
381 public:
AffinitySetterTask(int idx)382 AffinitySetterTask ( int idx ) : m_idx(idx) {}
383
384 friend bool AffinitizeTBB ( int, int /*mode*/ );
385 };
386
387 bool AffinitySetterTask::m_result = true;
388 tbb::atomic<int> AffinitySetterTask::m_nThreads;
389
AffinitizeTBB(int p,int affMode)390 bool AffinitizeTBB ( int p, int affMode ) {
391 #if _MSC_VER
392 SetThreadPriority (GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
393 SetPriorityClass (GetCurrentProcess(), HIGH_PRIORITY_CLASS);
394 #endif
395 AffinitySetterTask::m_result = true;
396 AffinitySetterTask::m_nThreads = p;
397 tbb::task_list tl;
398 for ( int i = 0; i < p; ++i ) {
399 tbb::task &t = *new( tbb::task::allocate_root() ) AffinitySetterTask( affMode == amSparse ? i * NumCpus / p : i );
400 t.set_affinity( tbb::task::affinity_id(i + 1) );
401 tl.push_back( t );
402 }
403 tbb::task::spawn_root_and_wait(tl);
404 return AffinitySetterTask::m_result;
405 }
406
407 inline
Affinitize(int p,int affMode)408 void Affinitize ( int p, int affMode ) {
409 if ( !AffinitizeTBB (p, affMode) )
410 REPORT("Warning: Failed to set affinity for %d TBB threads\n", p);
411 }
412
413 class TbbWorkersTrapper {
414 tbb::atomic<int> my_refcount;
415 tbb::task *my_root;
416 tbb::task_group_context my_context;
417 Harness::SpinBarrier my_barrier;
418
419 friend class TrapperTask;
420
421 class TrapperTask : public tbb::task {
422 TbbWorkersTrapper& my_owner;
423
execute()424 tbb::task* execute () {
425 my_owner.my_barrier.wait();
426 my_owner.my_root->wait_for_all();
427 my_owner.my_barrier.wait();
428 return NULL;
429 }
430 public:
TrapperTask(TbbWorkersTrapper & owner)431 TrapperTask ( TbbWorkersTrapper& owner ) : my_owner(owner) {}
432 };
433
434 public:
TbbWorkersTrapper()435 TbbWorkersTrapper ()
436 : my_context(tbb::task_group_context::bound,
437 tbb::task_group_context::default_traits | tbb::task_group_context::concurrent_wait)
438 {
439 my_root = new ( tbb::task::allocate_root(my_context) ) tbb::empty_task;
440 my_root->set_ref_count(2);
441 my_barrier.initialize(NumThreads);
442 for ( int i = 1; i < NumThreads; ++i )
443 tbb::task::spawn( *new(tbb::task::allocate_root()) TrapperTask(*this) );
444 my_barrier.wait(); // Wait util all workers are ready
445 }
446
~TbbWorkersTrapper()447 ~TbbWorkersTrapper () {
448 my_root->decrement_ref_count();
449 my_barrier.wait(); // Make sure no tasks are referencing us
450 tbb::task::destroy(*my_root);
451 }
452 }; // TbbWorkersTrapper
453
454
//! True when the harness is built with __TBB_STATISTICS set to a nonzero value
/** RunTests() neither pins nor traps the workers in this mode. **/
#if __TBB_STATISTICS
static bool StatisticsMode = true;
#else
static bool StatisticsMode = false;
#endif
460
//! Identity function for bool values
/** Wrapping a macro argument into a call suppresses a silly compiler warning. **/
inline bool __TBB_bool( bool b ) {
    return b;
}
463
//! Declares a deferred scheduler object (init) and a trapper pointer in the current scope
/** The scheduler is initialized with p threads only if the session uses the task
    scheduler, needScheduler holds, and at least one of setWorkersAffinity and
    trapWorkers holds. In that case the threads are optionally pinned in the
    affinity mode a and optionally trapped.
    Must be paired with STOP_WORKERS() in the same scope. **/
#define START_WORKERS(needScheduler, p, a, setWorkersAffinity, trapWorkers) \
    tbb::task_scheduler_init init(tbb::task_scheduler_init::deferred); \
    TbbWorkersTrapper *trapper = NULL; \
    if ( (theSettings.my_opts & UseTaskScheduler) \
         && (needScheduler) && ((setWorkersAffinity) || (trapWorkers)) ) \
    { \
        init.initialize( p ); \
        if ( __TBB_bool(setWorkersAffinity) ) \
            Affinitize( p, a ); \
        if ( __TBB_bool(trapWorkers) ) \
            trapper = new TbbWorkersTrapper; \
    }

//! Releases what START_WORKERS() has set up in the same scope
#define STOP_WORKERS() \
    if ( (theSettings.my_opts & UseTaskScheduler) && init.is_active() ) { \
        delete trapper; /* no-op when no trapper was created */ \
        init.terminate(); \
        /* Give asynchronous deinitialization time to complete */ \
        Harness::Sleep(50); \
    }
485
486 typedef void (Test::*RunMemFnPtr)( Test::ThreadInfo& );
487
488 TimingSeries *TlsTimings;
489 Harness::SpinBarrier multipleMastersBarrier;
490
491 class TimingFunctor {
492 Test* my_test;
493 RunConfig *my_cfg;
494 RunMemFnPtr my_fnRun;
495 size_t my_numRuns;
496 size_t my_numRepeats;
497 uintptr_t my_availableMethods;
498
TimeSingleRun(Test::ThreadInfo & ti) const499 duration_t TimeSingleRun ( Test::ThreadInfo& ti ) const {
500 if ( my_availableMethods & idOnStart )
501 my_test->OnStart(ti);
502 // Warming run
503 (my_test->*my_fnRun)(ti);
504 multipleMastersBarrier.wait();
505 tbb::tick_count t0 = tbb::tick_count::now();
506 (my_test->*my_fnRun)(ti);
507 duration_t t = (tbb::tick_count::now() - t0).seconds();
508 if ( my_availableMethods & idOnFinish )
509 my_test->OnFinish(ti);
510 return t;
511 }
512
513 public:
TimingFunctor(Test * test,RunConfig * cfg,RunMemFnPtr fnRun,size_t numRuns,size_t nRepeats,uintptr_t availableMethods)514 TimingFunctor ( Test* test, RunConfig *cfg, RunMemFnPtr fnRun,
515 size_t numRuns, size_t nRepeats, uintptr_t availableMethods )
516 : my_test(test), my_cfg(cfg), my_fnRun(fnRun)
517 , my_numRuns(numRuns), my_numRepeats(nRepeats), my_availableMethods(availableMethods)
518 {}
519
operator ()(int tid) const520 void operator()( int tid ) const {
521 Test::ThreadInfo ti = { tid, NULL };
522 durations_t &d = TlsTimings[tid].my_durations;
523 bool singleMaster = my_cfg->my_numMasters == 1;
524 START_WORKERS( (!singleMaster || (singleMaster && StatisticsMode)) && my_fnRun != &Test::RunSerial,
525 my_cfg->my_numThreads, my_cfg->my_affinityMode, singleMaster, singleMaster );
526 for ( uintptr_t k = 0; k < my_numRuns; ++k ) {
527 if ( my_numRepeats > 1 ) {
528 d[k] = 0;
529 if ( my_availableMethods & idPrePostProcess ) {
530 for ( uintptr_t i = 0; i < my_numRepeats; ++i )
531 d[k] += TimeSingleRun(ti);
532 }
533 else {
534 multipleMastersBarrier.wait();
535 tbb::tick_count t0 = tbb::tick_count::now();
536 for ( uintptr_t i = 0; i < my_numRepeats; ++i )
537 (my_test->*my_fnRun)(ti);
538 d[k] = (tbb::tick_count::now() - t0).seconds();
539 }
540 d[k] /= my_numRepeats;
541 }
542 else
543 d[k] = TimeSingleRun(ti);
544 }
545 STOP_WORKERS();
546 TlsTimings[tid].CalculateStatistics();
547 }
548 }; // class TimingFunctor
549
DoTiming(TestResults & tr,RunConfig & cfg,RunMemFnPtr fnRun,size_t nRepeats,TimingSeries & ts)550 void DoTiming ( TestResults& tr, RunConfig &cfg, RunMemFnPtr fnRun, size_t nRepeats, TimingSeries& ts ) {
551 int numThreads = cfg.NumMasters();
552 size_t numRuns = ts.my_durations.size() / numThreads;
553 TimingFunctor body( tr.my_test, &cfg, fnRun, numRuns, nRepeats, tr.my_availableMethods );
554 multipleMastersBarrier.initialize(numThreads);
555 tr.my_test->SetWorkload(cfg.my_workloadID);
556 if ( numThreads == 1 ) {
557 TimingSeries *t = TlsTimings;
558 TlsTimings = &ts;
559 body(0);
560 TlsTimings = t;
561 }
562 else {
563 ts.my_durations.resize(numThreads * numRuns);
564 NativeParallelFor( numThreads, body );
565 for ( int i = 0, j = 0; i < numThreads; ++i ) {
566 durations_t &d = TlsTimings[i].my_durations;
567 for ( size_t k = 0; k < numRuns; ++k, ++j )
568 ts.my_durations[j] = d[k];
569 }
570 ts.CalculateStatistics();
571 }
572 }
573
574 //! Runs the test function, does statistical processing, and, if title is nonzero, prints results.
575 /** If histogramFileName is a string, the histogram of individual runs is generated and stored
576 in a file with the given name. If it is NULL then the histogram is printed on the console.
577 By default no histogram is generated.
578 The histogram format is: "rate bucket start" "number of tests in this bucket". **/
RunTestImpl(TestResults & tr,RunConfig & cfg,RunMemFnPtr pfnTest,TimingSeries & ts)579 void RunTestImpl ( TestResults& tr, RunConfig &cfg, RunMemFnPtr pfnTest, TimingSeries& ts ) {
580 // nRepeats is a number of repeated calls to the test function made as
581 // part of the same run. It is determined experimentally by the following
582 // calibration process so that the total run time was approx. RunDuration.
583 // This is helpful to increase the measurement precision in case of very
584 // short tests.
585 size_t nRepeats = 1;
586 // A minimal stats is enough when doing calibration
587 CalibrationTiming.my_durations.resize( (NumRuns < 4 ? NumRuns : 3) * cfg.NumMasters() );
588 // There's no need to be too precise when calculating nRepeats. And reasonably
589 // far extrapolation can speed up the process significantly.
590 for (;;) {
591 DoTiming( tr, cfg, pfnTest, nRepeats, CalibrationTiming );
592 if ( CalibrationTiming.my_avgTime * nRepeats > 1e-4 )
593 break;
594 nRepeats *= 2;
595 }
596 nRepeats *= (uintptr_t)ceil( RunDuration / (CalibrationTiming.my_avgTime * nRepeats) );
597
598 DoTiming(tr, cfg, pfnTest, nRepeats, ts);
599
600 // No histogram for baseline measurements
601 if ( pfnTest != &Test::RunSerial && pfnTest != &Test::Baseline ) {
602 const char* histogramName = theSettings.my_histogramName;
603 if ( histogramName != NoHistogram && tr.my_test->HistogramName() != DefaultHistogram )
604 histogramName = tr.my_test->HistogramName();
605 if ( histogramName != NoHistogram )
606 TraceHistogram( ts.my_durations, histogramName );
607 }
608 } // RunTestImpl
609
610 typedef void (*TestActionFn) ( TestResults&, int mastersRange, int w, int p, int m, int a, int& numTests );
611
TestResultIndex(int mastersRange,int w,int p,int m,int a)612 int TestResultIndex ( int mastersRange, int w, int p, int m, int a ) {
613 return ((w * (MaxThread - MinThread + 1) + (p - MinThread)) * mastersRange + m) * NumActiveAffModes + a;
614 }
615
RunTest(TestResults & tr,int mastersRange,int w,int p,int m,int a,int & numTests)616 void RunTest ( TestResults& tr, int mastersRange, int w, int p, int m, int a, int& numTests ) {
617 size_t r = TestResultIndex(mastersRange, w, p, m, a);
618 ASSERT( r < tr.my_results.size(), NULL );
619 RunConfig &rc = tr.my_results[r].my_config;
620 rc.my_maxConcurrency = MaxConcurrency;
621 rc.my_numThreads = p;
622 rc.my_numMasters = m + tr.my_test->MinNumMasters();
623 rc.my_affinityMode = a;
624 rc.my_workloadID = w;
625 RunTestImpl( tr, rc, &Test::Run, tr.my_results[r].my_timing );
626 printf( "Running tests: %04.1f%%\r", ++numTests * 100. / TotalConfigs ); fflush(stdout);
627 }
628
WalkTests(TestActionFn fn,int & numTests,bool setAffinity,bool trapWorkers,bool multipleMasters)629 void WalkTests ( TestActionFn fn, int& numTests, bool setAffinity, bool trapWorkers, bool multipleMasters ) {
630 for ( int p = MinThread; p <= MaxThread; ++p ) {
631 NumThreads = p;
632 MaxConcurrency = p < NumCpus ? p : NumCpus;
633 for ( int a = 0; a < NumActiveAffModes; ++a ) {
634 START_WORKERS( multipleMasters || !StatisticsMode, p, a, setAffinity, trapWorkers );
635 for ( size_t i = 0; i < theSession.size(); ++i ) {
636 TestResults &tr = theSession[i];
637 Test *t = tr.my_test;
638 int mastersRange = t->MaxNumMasters() - t->MinNumMasters() + 1;
639 int numWorkloads = theSettings.my_opts & UseSmallestWorkloadOnly ? 1 : t->NumWorkloads();
640 for ( int w = 0; w < numWorkloads; ++w ) {
641 if ( multipleMasters )
642 for ( int m = 1; m < mastersRange; ++m )
643 fn( tr, mastersRange, w, p, m, a, numTests );
644 else
645 fn( tr, mastersRange, w, p, 0, a, numTests );
646 }
647 }
648 STOP_WORKERS();
649 }
650 }
651 }
652
RunTests()653 void RunTests () {
654 int numTests = 0;
655 WalkTests( &RunTest, numTests, !StatisticsMode, !StatisticsMode, false );
656 if ( MaxTbbMasters > 1 )
657 WalkTests( &RunTest, numTests, true, false, true );
658 }
659
InitTestData(TestResults & tr,int mastersRange,int w,int p,int m,int a,int &)660 void InitTestData ( TestResults& tr, int mastersRange, int w, int p, int m, int a, int& ) {
661 size_t r = TestResultIndex(mastersRange, w, p, m, a);
662 ASSERT( r < tr.my_results.size(), NULL );
663 tr.my_results[r].my_timing.my_durations.resize(
664 (theSettings.my_opts & UseTaskScheduler ? tr.my_test->MinNumMasters() + m : p) * NumRuns );
665 }
666
667 char WorkloadName[MaxWorkloadNameLen + 1];
668
PrepareTests()669 void PrepareTests () {
670 printf( "Initializing...\r" );
671 NumActiveAffModes = theSettings.my_opts & UseAffinityModes ? NumAffinitizationModes : 1;
672 TotalConfigs = 0;
673 TitleFieldLen = strlen( TestNameColumnTitle );
674 WorkloadFieldLen = strlen( WorkloadNameColumnTitle );
675 int numThreads = MaxThread - MinThread + 1;
676 int numConfigsBase = numThreads * NumActiveAffModes;
677 int totalWorkloads = 0;
678 for ( size_t i = 0; i < theSession.size(); ++i ) {
679 TestResults &tr = theSession[i];
680 Test &t = *tr.my_test;
681 int numWorkloads = theSettings.my_opts & UseSmallestWorkloadOnly ? 1 : t.NumWorkloads();
682 int numConfigs = numConfigsBase * numWorkloads;
683 if ( t.MaxNumMasters() > 1 ) {
684 ASSERT( theSettings.my_opts & UseTaskScheduler, "Multiple masters mode is only valid for task scheduler tests" );
685 if ( MaxTbbMasters < t.MaxNumMasters() )
686 MaxTbbMasters = t.MaxNumMasters();
687 numConfigs *= t.MaxNumMasters() - t.MinNumMasters() + 1;
688 }
689 totalWorkloads += numWorkloads;
690 TotalConfigs += numConfigs;
691
692 const char* testName = t.Name();
693 if ( testName )
694 tr.my_testName = testName;
695 ASSERT( tr.my_testName, "Neither Test::Name() is implemented, nor RTTI is enabled" );
696 TitleFieldLen = max( TitleFieldLen, strlen(tr.my_testName) );
697
698 tr.my_results.resize( numConfigs );
699 tr.my_serialBaselines.resize( numWorkloads );
700 tr.my_baselines.resize( numWorkloads );
701 tr.my_workloadNames.resize( numWorkloads );
702 }
703 TimingSeries tmpTiming;
704 TlsTimings = &tmpTiming; // All measurements are serial here
705 int n = 0;
706 for ( size_t i = 0; i < theSession.size(); ++i ) {
707 TestResults &tr = theSession[i];
708 Test &t = *tr.my_test;
709 // Detect which methods are overridden by the test implementation
710 g_absentMethods = 0;
711 Test::ThreadInfo ti = { 0 };
712 t.SetWorkload(0);
713 t.OnStart(ti);
714 t.RunSerial(ti);
715 t.OnFinish(ti);
716 if ( theSettings.my_opts & UseSerialBaseline && !(g_absentMethods & idRunSerial) )
717 tr.my_availableMethods |= idRunSerial;
718 if ( !(g_absentMethods & idOnStart) )
719 tr.my_availableMethods |= idOnStart;
720
721 RunConfig rc = { 1, 1, 1, 0, 0 };
722 int numWorkloads = theSettings.my_opts & UseSmallestWorkloadOnly ? 1 : t.NumWorkloads();
723 for ( int w = 0; w < numWorkloads; ++w ) {
724 WorkloadName[0] = 0;
725 t.SetWorkload(w);
726 if ( !WorkloadName[0] )
727 sprintf( WorkloadName, "%d", w );
728 size_t len = strlen(WorkloadName);
729 tr.my_workloadNames[w] = new char[len + 1];
730 strcpy ( (char*)tr.my_workloadNames[w], WorkloadName );
731 WorkloadFieldLen = max( WorkloadFieldLen, len );
732
733 rc.my_workloadID = w;
734 if ( theSettings.my_opts & UseBaseline )
735 RunTestImpl( tr, rc, &Test::Baseline, tr.my_baselines[w] );
736 if ( tr.my_availableMethods & idRunSerial )
737 RunTestImpl( tr, rc, &Test::RunSerial, tr.my_serialBaselines[w] );
738 printf( "Measuring baselines: %04.1f%%\r", ++n * 100. / totalWorkloads ); fflush(stdout);
739 }
740 }
741 TlsTimings = new TimingSeries[MaxThread + MaxTbbMasters - 1];
742 if ( theSettings.my_opts & UseTaskScheduler ? MaxTbbMasters : MaxThread )
743 WalkTests( &InitTestData, n, false, false, theSettings.my_opts & UseTaskScheduler ? true : false );
744 CalibrationTiming.my_durations.reserve( MaxTbbMasters * 3 );
745 printf( " \r");
746 }
747
//! Results file, or NULL if the results are printed to the console only
FILE* ResFile = NULL;

//! printf-like output into the results file (if it is open) and then to stdout
void Report ( char const* fmt, ... ) {
    FILE* const sinks[2] = { ResFile, stdout };
    for ( int i = 0; i < 2; ++i ) {
        if ( !sinks[i] )
            continue;
        // The argument list must be restarted for each pass over it
        va_list args;
        va_start( args, fmt );
        vfprintf( sinks[i], fmt, args );
        va_end( args );
    }
}
761
PrintResults()762 void PrintResults () {
763 if ( theSettings.my_resFile )
764 ResFile = fopen( theSettings.my_resFile, "w" );
765 Report( "%-*s %-*s %s", TitleFieldLen, "Test-name", WorkloadFieldLen, "Workload",
766 MaxTbbMasters > 1 ? "W M " : "T " );
767 if ( theSettings.my_opts & UseAffinityModes )
768 Report( "Aff " );
769 Report( "%-*s SD,%% %-*s %-*s %-*s ",
770 RateFieldLen, "Avg.time", OvhdFieldLen, "Par.ovhd,%",
771 RateFieldLen, "Min.time", RateFieldLen, "Max.time" );
772 Report( " | Repeats = %lu, CPUs %d\n", (unsigned long)NumRuns, NumCpus );
773 for ( size_t i = 0; i < theSession.size(); ++i ) {
774 TestResults &tr = theSession[i];
775 for ( size_t j = 0; j < tr.my_results.size(); ++j ) {
776 RunResults &rr = tr.my_results[j];
777 RunConfig &rc = rr.my_config;
778 int w = rc.my_workloadID;
779 TimingSeries &ts = rr.my_timing;
780 duration_t baselineTime = tr.my_baselines[w].my_avgTime,
781 cleanTime = ts.my_avgTime - baselineTime;
782 Report( "%-*s %-*s ", TitleFieldLen, tr.my_testName, WorkloadFieldLen, tr.my_workloadNames[w] );
783 if ( MaxTbbMasters > 1 )
784 Report( "%-4d %-4d ", rc.my_numThreads - 1, rc.my_numMasters );
785 else
786 Report( "%-4d ", rc.my_numThreads );
787 if ( theSettings.my_opts & UseAffinityModes )
788 Report( "%%-8s ", AffinitizationModeNames[rc.my_affinityMode] );
789 Report( "%-*.2e %-6.1f ", RateFieldLen, cleanTime, ts.my_stdDev);
790 if ( tr.my_availableMethods & idRunSerial ) {
791 duration_t serialTime = (tr.my_serialBaselines[w].my_avgTime - baselineTime) / rc.my_maxConcurrency;
792 Report( "%-*.1f ", OvhdFieldLen, 100*(cleanTime - serialTime)/serialTime );
793 }
794 else
795 Report( "%*s%*s ", OvhdFieldLen/2, "-", OvhdFieldLen - OvhdFieldLen/2, "" );
796 Report( "%-*.2e %-*.2e ", RateFieldLen, ts.my_minTime - baselineTime, RateFieldLen, ts.my_maxTime - baselineTime);
797 Report( "\n" );
798 }
799 }
800 delete [] TlsTimings;
801 if ( ResFile )
802 fclose(ResFile);
803 }
804
RegisterTest(Test * t,const char * className,bool takeOwnership)805 __TBB_PERF_API void RegisterTest ( Test* t, const char* className, bool takeOwnership ) {
806 // Just collect test objects at this stage
807 theSession.push_back( TestResults(t, className, takeOwnership) );
808 }
809
810 } // namespace internal
811
Baseline(ThreadInfo &)812 __TBB_PERF_API void Test::Baseline ( ThreadInfo& ) {}
813
RunSerial(ThreadInfo &)814 __TBB_PERF_API void Test::RunSerial ( ThreadInfo& ) { internal::g_absentMethods |= internal::idRunSerial; }
815
OnStart(ThreadInfo &)816 __TBB_PERF_API void Test::OnStart ( ThreadInfo& ) { internal::g_absentMethods |= internal::idOnStart; }
817
OnFinish(ThreadInfo &)818 __TBB_PERF_API void Test::OnFinish ( ThreadInfo& ) { internal::g_absentMethods |= internal::idOnFinish; }
819
WipeCaches()820 __TBB_PERF_API void WipeCaches () { NativeParallelFor( NumCpus, internal::WiperBody() ); }
821
EmptyFunc()822 __TBB_PERF_API void EmptyFunc () {}
AnchorFunc(void *)823 __TBB_PERF_API void AnchorFunc ( void* ) {}
AnchorFunc2(void *,void *)824 __TBB_PERF_API void AnchorFunc2 ( void*, void* ) {}
825
SetWorkloadName(const char * format,...)826 __TBB_PERF_API void SetWorkloadName( const char* format, ... ) {
827 internal::WorkloadName[MaxWorkloadNameLen] = 0;
828 va_list args;
829 va_start(args, format);
830 vsnprintf( internal::WorkloadName, MaxWorkloadNameLen, format, args );
831 va_end(args);
832 }
833
834
TestMain(int argc,char * argv[],const SessionSettings * defaultSettings)835 __TBB_PERF_API int TestMain( int argc, char* argv[], const SessionSettings* defaultSettings ) {
836 #if _MSC_VER
837 HANDLE hMutex = CreateMutex( NULL, FALSE, "Global\\TBB_OMP_PerfSession" );
838 WaitForSingleObject( hMutex, INFINITE );
839 #endif
840 MinThread = MaxThread = NumCpus;
841 if ( defaultSettings )
842 theSettings = *defaultSettings;
843 ParseCommandLine( argc, argv ); // May override data in theSettings
844
845 internal::PrepareTests ();
846 internal::RunTests ();
847 internal::PrintResults();
848 REPORT("\n");
849 #if _MSC_VER
850 ReleaseMutex( hMutex );
851 CloseHandle( hMutex );
852 #endif
853 return 0;
854 }
855
856 } // namespace Perf
857