1 /*
2     Copyright (c) 2005-2020 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include "perf.h"
18 
19 #include <cstdlib>
20 #include <cmath>
21 #include <vector>
22 #include <algorithm>
23 #include <cassert>
24 
25 #include "tbb/tick_count.h"
26 
27 #define HARNESS_CUSTOM_MAIN 1
28 #include "../src/test/harness.h"
29 #include "../src/test/harness_barrier.h"
30 
31 #include "tbb/task_scheduler_init.h"
32 #include "tbb/task.h"
33 #include "tbb/atomic.h"
34 
35 #if  __linux__ || __APPLE__ || (__FreeBSD__||__DragonFly__) || __NetBSD__
36     #include <sys/resource.h>
37 #endif
38 
//! Thread-count globals shared with the tests.
/** NumCpus is initialized from tbb::task_scheduler_init::default_num_threads().
    NumThreads and MaxConcurrency are assigned by WalkTests for the configuration
    being processed: NumThreads is the current thread count and MaxConcurrency is
    the smaller of NumThreads and NumCpus. **/
__TBB_PERF_API int NumCpus = tbb::task_scheduler_init::default_num_threads(),
                   NumThreads,
                   MaxConcurrency;
42 
43 namespace Perf {
44 
//! Session-wide settings; its my_opts flags and my_histogramName are consulted throughout this file
SessionSettings theSettings;
46 
47 namespace internal {
48 
    //! Sequence of timing samples, one element per timed run
    typedef std::vector<duration_t> durations_t;

    //! Default number of timing samples collected per master thread in a run configuration
    static uintptr_t NumRuns = 7;
    //! Target duration of one timed run; RunTestImpl scales the repeat count to approximate it
    static duration_t RunDuration = 0.01;

    // NOTE(review): not referenced in this part of the file; presumably output column widths
    static const int RateFieldLen = 10;
    static const int OvhdFieldLen = 12;

    //! Column titles; their lengths seed TitleFieldLen and WorkloadFieldLen in PrepareTests
    const char* TestNameColumnTitle = "Test name";
    const char* WorkloadNameColumnTitle = "Workload";

    //! Widths of the test name and workload columns (longest string seen, computed in PrepareTests)
    size_t TitleFieldLen = 0;
    size_t WorkloadFieldLen = 0;

    //! Total number of run configurations over all tests (computed in PrepareTests)
    int TotalConfigs = 0;
    //! Largest MaxNumMasters() value among the session's tests, never less than 1
    int MaxTbbMasters = 1;

    //! Defines the mapping between threads and cores in the undersubscription mode
    /** When adding new enumerator, insert it before amLast, and do not specify
        its value explicitly. **/
    enum AffinitizationMode {
        amFirst = 0,
        amDense = amFirst,
        amSparse,
        //! Used to track the number of supported affinitization modes
        amLast
    };

    //! Number of enumerators between amFirst (inclusive) and amLast (exclusive)
    static const int NumAffinitizationModes = amLast - amFirst;

    //! Printable mode names, listed in the same order as the AffinitizationMode enumerators
    const char* AffinitizationModeNames[] = { "dense", "sparse" };

    //! Number of affinity modes actually exercised; PrepareTests sets it to 1 or NumAffinitizationModes
    int NumActiveAffModes = 1;
82 
83     //! Settings of a test run configuration
84     struct RunConfig {
85         int my_maxConcurrency;
86         int my_numThreads;      // For task scheduler tests this is number of workers + 1
87         int my_numMasters;      // Used for task scheduler tests only
88         int my_affinityMode;    // Used for task scheduler tests only
89         int my_workloadID;
90 
NumMastersPerf::internal::RunConfig91         int NumMasters () const {
92             return theSettings.my_opts & UseTaskScheduler ? my_numMasters : my_numThreads;
93         }
94     };
95 
StandardDeviation(double avg,const durations_t & d)96     double StandardDeviation ( double avg, const durations_t& d ) {
97         double  std_dev = 0;
98         for ( uintptr_t i = 0; i < d.size(); ++i ) {
99             double  dev = fabs(d[i] - avg);
100             std_dev += dev * dev;
101         }
102         std_dev = sqrt(std_dev / d.size());
103         return std_dev / avg * 100;
104     }
105 
Statistics(const durations_t & d,duration_t & avgTime,double & stdDev,duration_t & minTime,duration_t & maxTime)106     void Statistics ( const durations_t& d,
107                       duration_t& avgTime, double& stdDev,
108                       duration_t& minTime, duration_t& maxTime )
109     {
110         minTime = maxTime = avgTime = d[0];
111         for ( size_t i = 1; i < d.size(); ++i ) {
112             avgTime += d[i];
113             if ( minTime > d[i] )
114                 minTime = d[i];
115             else if ( maxTime < d[i] )
116                 maxTime = d[i];
117         }
118         avgTime = avgTime / d.size();
119         stdDev = StandardDeviation( avgTime, d );
120     }
121 
122     //! Timing data for the series of repeated runs and results of their statistical processing
123     struct TimingSeries {
124         //! Statistical timing series
125         durations_t my_durations;
126 
127         //! Average time obtained from my_durations data
128         duration_t  my_avgTime;
129 
130         //! Minimal time obtained from my_durations data
131         duration_t  my_minTime;
132 
133         //! Minimal time obtained from my_durations data
134         duration_t  my_maxTime;
135 
136         //! Standard deviation of my_avgTime value (per cent)
137         double  my_stdDev;
138 
TimingSeriesPerf::internal::TimingSeries139         TimingSeries ( uintptr_t nruns = NumRuns )
140             : my_durations(nruns), my_avgTime(0), my_minTime(0), my_maxTime(0)
141         {}
142 
CalculateStatisticsPerf::internal::TimingSeries143         void CalculateStatistics () {
144             Statistics( my_durations, my_avgTime, my_stdDev, my_minTime, my_maxTime );
145         }
146     }; // struct TimingSeries
147 
148     //! Settings and timing results for a test run configuration
    struct RunResults {
        //! Run configuration settings
        /** Filled in by RunTest right before the configuration is measured. **/
        RunConfig   my_config;

        //! Timing results for this run configuration
        /** Its sample vector is sized by InitTestData. **/
        TimingSeries my_timing;
    };
156 
    //! Workload names of one test, indexed by workload ID
    typedef std::vector<const char*>    names_t;
    //! Baseline timings of one test, indexed by workload ID
    typedef std::vector<TimingSeries>   timings_t;
    //! Results of all run configurations of one test, indexed by TestResultIndex()
    typedef std::vector<RunResults>     test_results_t;

    //! Bit flags naming the optional Test methods
    enum TestMethods {
        idRunSerial = 0x01,
        idOnStart = 0x02,
        idOnFinish = 0x04,
        //! Set if the test needs per-run pre- and/or post-processing
        idPrePostProcess = idOnStart | idOnFinish
    };

    //! Set of flags identifying methods not overridden by the currently active test
    /** Used as a scratch var. **/
    uintptr_t g_absentMethods;
171 
172     //! Test object and timing results for all of its configurations
173     struct TestResults {
174         //! Pointer to the test object interface
175         Test*           my_test;
176 
177         //! Set of flags identifying optional methods overridden by my_test
178         /** A set of ORed TestMethods flags **/
179         uintptr_t       my_availableMethods;
180 
181         //! Vector of serial times for each workload supported by this test
182         /** Element index in the vector serves as a zero based workload ID. **/
183         timings_t       my_serialBaselines;
184 
185         //! Common baselines for both parallel and serial variants
186         /** Element index in the vector serves as a zero based workload ID. **/
187         timings_t       my_baselines;
188 
189         //! Strings identifying workloads to be used in output
190         names_t         my_workloadNames;
191 
192         //! Vector of timings for all run configurations of my_test
193         test_results_t  my_results;
194 
195         const char*     my_testName;
196 
197         mutable bool    my_hasOwnership;
198 
TestResultsPerf::internal::TestResults199         TestResults ( Test* t, const char* className, bool takeOwnership )
200             : my_test(t), my_availableMethods(0), my_testName(className), my_hasOwnership(takeOwnership)
201         {}
202 
TestResultsPerf::internal::TestResults203         TestResults ( const TestResults& tr )
204             : my_test(tr.my_test)
205             , my_availableMethods(0)
206             , my_testName(tr.my_testName)
207             , my_hasOwnership(tr.my_hasOwnership)
208         {
209             tr.my_hasOwnership = false;
210         }
211 
~TestResultsPerf::internal::TestResults212         ~TestResults () {
213             for ( size_t i = 0; i < my_workloadNames.size(); ++i )
214                 delete my_workloadNames[i];
215             if ( my_hasOwnership )
216                 delete my_test;
217         }
218     }; // struct TestResults
219 
    //! Collection of per-test result records
    typedef std::vector<TestResults> session_t;

    //! All tests of the current session together with their results
    session_t theSession;

    //! Scratch timing series used by RunTestImpl while calibrating the repeat count
    TimingSeries CalibrationTiming;

    //! Number of elements (not bytes) in the W array
    const uintptr_t CacheSize = 8*1024*1024;
    //! Large array that WiperBody reads from start to end
    volatile intptr_t W[CacheSize];
228 
229     struct WiperBody {
operator ()Perf::internal::WiperBody230         void operator()( int ) const {
231             volatile intptr_t sink = 0;
232             for ( uintptr_t i = 0; i < CacheSize; ++i )
233                 sink += W[i];
234         }
235     };
236 
TraceHistogram(const durations_t & t,const char * histogramFileName)237     void TraceHistogram ( const durations_t& t, const char* histogramFileName ) {
238         FILE* f = histogramFileName ? fopen(histogramFileName, "wt") : stdout;
239         uintptr_t  n = t.size();
240         const uintptr_t num_buckets = 100;
241         double  min_val = *std::min_element(t.begin(), t.end()),
242                 max_val = *std::max_element(t.begin(), t.end()),
243                 bucket_size = (max_val - min_val) / num_buckets;
244         std::vector<uintptr_t> hist(num_buckets + 1, 0);
245         for ( uintptr_t i = 0; i < n; ++i )
246             ++hist[uintptr_t((t[i]-min_val)/bucket_size)];
247         ASSERT (hist[num_buckets] == 1, "");
248         ++hist[num_buckets - 1];
249         hist.resize(num_buckets);
250         fprintf (f, "Histogram: nvals = %u, min = %g, max = %g, nbuckets = %u\n", (unsigned)n, min_val, max_val, (unsigned)num_buckets);
251         double bucket = min_val;
252         for ( uintptr_t i = 0; i < num_buckets; ++i, bucket+=bucket_size )
253             fprintf (f, "%12g\t%u\n", bucket, (unsigned)hist[i]);
254         fclose(f);
255     }
256 
257 #if _MSC_VER
258     typedef DWORD_PTR cpu_set_t;
259 
    //! Windows helper translating zero based CPU indices into single-bit affinity masks.
    /** The static m_initializer object fills in the tables during static initialization. **/
    class AffinityHelper {
        //! Number of bits in an affinity mask
        static const unsigned MaxAffinitySetSize = sizeof(cpu_set_t) * 8;
        //! Number of valid entries in m_affinities (set by Initializer)
        static unsigned AffinitySetSize;

        //! Mapping from a CPU index to a valid affinity cpu_mask
        /** Entries [0, AffinitySetSize) are filled by Initializer; the remaining
            entries stay zero. **/
        static cpu_set_t m_affinities[MaxAffinitySetSize + 1];

        //! Affinity mask of the process as reported by GetProcessAffinityMask
        static cpu_set_t m_processMask;

        class Initializer {
        public:
            //! Builds m_affinities from the successive set bits of the process affinity mask.
            Initializer () {
                SYSTEM_INFO si;
                GetNativeSystemInfo(&si);
                ASSERT( si.dwNumberOfProcessors <= MaxAffinitySetSize, "Too many CPUs" );
                AffinitySetSize = min (si.dwNumberOfProcessors, MaxAffinitySetSize);
                cpu_set_t systemMask = 0;
                GetProcessAffinityMask( GetCurrentProcess(), &m_processMask, &systemMask );
                cpu_set_t cpu_mask = 1;
                for ( DWORD i = 0; i < AffinitySetSize; ++i ) {
                    // Skip bits that are not part of the process mask
                    while ( !(cpu_mask & m_processMask) && cpu_mask )
                        cpu_mask <<= 1;
                    ASSERT( cpu_mask != 0, "Process affinity set is culled?" );
                    m_affinities[i] = cpu_mask;
                    cpu_mask <<= 1;
                }
            }
        }; // class AffinityHelper::Initializer

        static Initializer m_initializer;

    public:
        //! Returns the affinity mask for cpuIndex; indices wrap around modulo AffinitySetSize.
        static cpu_set_t CpuAffinity ( int cpuIndex ) {
            return m_affinities[cpuIndex % AffinitySetSize];
        }

        //! Returns the process affinity mask captured during initialization.
        static const cpu_set_t& ProcessMask () { return m_processMask; }
    }; // class AffinityHelper

    // The tables below are zero-initialized before m_initializer's constructor populates them
    unsigned AffinityHelper::AffinitySetSize = 0;
    cpu_set_t AffinityHelper::m_affinities[AffinityHelper::MaxAffinitySetSize + 1] = {0};
    cpu_set_t AffinityHelper::m_processMask = 0;
    AffinityHelper::Initializer AffinityHelper::m_initializer;
304 
    // Windows stand-ins for the CPU_* macro names used by PinTheThread.
    // cpu_mask is a pointer to cpu_set_t; cpu_idx is mapped through AffinityHelper::CpuAffinity.
    #define CPU_ZERO(cpu_mask)              (*cpu_mask = 0)
    #define CPU_SET(cpu_idx, cpu_mask)      (*cpu_mask |= AffinityHelper::CpuAffinity(cpu_idx))
    #define CPU_CLR(cpu_idx, cpu_mask)      (*cpu_mask &= ~AffinityHelper::CpuAffinity(cpu_idx))
    #define CPU_ISSET(cpu_idx, cpu_mask)    ((*cpu_mask & AffinityHelper::CpuAffinity(cpu_idx)) != 0)
309 
310 #elif __linux__ /* end of _MSC_VER */
311 
312     #include <unistd.h>
313     #include <sys/types.h>
314     #include <linux/unistd.h>
315 
    //! Returns the result of the gettid system call for the calling thread.
    pid_t gettid() { return (pid_t)syscall(__NR_gettid); }

    //! Reinterprets the leading bytes of a cpu_set_t object as an unsigned value
    #define GET_MASK(cpu_set) (*(unsigned*)(void*)&cpu_set)
    //! Maps a result code to "ok" (zero) or "failed" (nonzero)
    #define RES_STAT(res) (res != 0 ? "failed" : "ok")
320 
    //! Linux helper that captures the process affinity mask during static initialization.
    class AffinityHelper {
        //! Affinity mask obtained by sched_getaffinity for getpid()
        static cpu_set_t m_processMask;

        class Initializer {
        public:
            //! Queries the affinity mask; asserts if the query fails.
            Initializer () {
                CPU_ZERO (&m_processMask);
                int res = sched_getaffinity( getpid(), sizeof(cpu_set_t), &m_processMask );
                ASSERT ( res == 0, "sched_getaffinity failed" );
            }
        }; // class AffinityHelper::Initializer

        static Initializer m_initializer;

    public:
        //! Returns the affinity mask captured during initialization.
        static const cpu_set_t& ProcessMask () { return m_processMask; }
    }; // class AffinityHelper

    // m_processMask is defined first so that it exists when m_initializer's constructor runs
    cpu_set_t AffinityHelper::m_processMask;
    AffinityHelper::Initializer AffinityHelper::m_initializer;
341 #endif /* __linux__ */
342 
PinTheThread(int cpu_idx,tbb::atomic<int> & nThreads)343     bool PinTheThread ( int cpu_idx, tbb::atomic<int>& nThreads ) {
344     #if _MSC_VER || __linux__
345         cpu_set_t orig_mask, target_mask;
346         CPU_ZERO( &target_mask );
347         CPU_SET( cpu_idx, &target_mask );
348         ASSERT ( CPU_ISSET(cpu_idx, &target_mask), "CPU_SET failed" );
349     #endif
350     #if _MSC_VER
351         orig_mask = SetThreadAffinityMask( GetCurrentThread(), target_mask );
352         if ( !orig_mask )
353             return false;
354     #elif __linux__
355         CPU_ZERO( &orig_mask );
356         int res = sched_getaffinity( gettid(), sizeof(cpu_set_t), &orig_mask );
357         ASSERT ( res == 0, "sched_getaffinity failed" );
358         res = sched_setaffinity( gettid(), sizeof(cpu_set_t), &target_mask );
359         ASSERT ( res == 0, "sched_setaffinity failed" );
360     #endif /* _MSC_VER */
361         --nThreads;
362         while ( nThreads )
363             __TBB_Yield();
364     #if _MSC_VER
365         SetThreadPriority (GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
366     #endif
367         return true;
368     }
369 
    //! Task that pins the thread executing it to the CPU with the given index.
    /** Instances are created and run by AffinitizeTBB, which is a friend because
        both the base class and the static members are private. **/
    class AffinitySetterTask : tbb::task {
        //! Result of the most recently completed PinTheThread call
        static bool m_result;
        //! Countdown of threads that still have to be pinned (see PinTheThread)
        static tbb::atomic<int> m_nThreads;
        //! CPU index passed to PinTheThread
        int m_idx;

        tbb::task* execute () {
            //TestAffinityOps();
            m_result = PinTheThread( m_idx, m_nThreads );
            return NULL;
        }

    public:
        AffinitySetterTask ( int idx ) : m_idx(idx) {}

        friend bool AffinitizeTBB ( int, int /*mode*/ );
    };

    bool AffinitySetterTask::m_result = true;
    tbb::atomic<int> AffinitySetterTask::m_nThreads;
389 
AffinitizeTBB(int p,int affMode)390     bool AffinitizeTBB ( int p, int affMode ) {
391     #if _MSC_VER
392         SetThreadPriority (GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
393         SetPriorityClass (GetCurrentProcess(), HIGH_PRIORITY_CLASS);
394     #endif
395         AffinitySetterTask::m_result = true;
396         AffinitySetterTask::m_nThreads = p;
397         tbb::task_list  tl;
398         for ( int i = 0; i < p; ++i ) {
399             tbb::task &t = *new( tbb::task::allocate_root() ) AffinitySetterTask( affMode == amSparse ? i * NumCpus / p : i );
400             t.set_affinity( tbb::task::affinity_id(i + 1) );
401             tl.push_back( t );
402         }
403         tbb::task::spawn_root_and_wait(tl);
404         return AffinitySetterTask::m_result;
405     }
406 
407     inline
Affinitize(int p,int affMode)408     void Affinitize ( int p, int affMode ) {
409         if ( !AffinitizeTBB (p, affMode) )
410             REPORT("Warning: Failed to set affinity for %d TBB threads\n", p);
411     }
412 
    //! Keeps the scheduler's threads blocked inside a wait for the lifetime of this object.
    /** The constructor spawns NumThreads - 1 TrapperTask instances and returns once
        all of them have started executing. Each task then waits on my_root, whose
        reference count is held at 2 until the destructor decrements it. **/
    class TbbWorkersTrapper {
        // NOTE(review): my_refcount is not referenced anywhere in this class
        tbb::atomic<int> my_refcount;
        //! Dummy task the trapper tasks wait on
        tbb::task *my_root;
        //! Context of my_root, created with the concurrent_wait trait
        tbb::task_group_context my_context;
        //! Rendezvous between the owner thread and the trapper tasks
        Harness::SpinBarrier my_barrier;

        friend class TrapperTask;

        class TrapperTask : public tbb::task {
            TbbWorkersTrapper& my_owner;

            tbb::task* execute () {
                // First rendezvous: tells the constructor this thread has arrived
                my_owner.my_barrier.wait();
                my_owner.my_root->wait_for_all();
                // Second rendezvous: tells the destructor this task is done with the owner
                my_owner.my_barrier.wait();
                return NULL;
            }
        public:
            TrapperTask ( TbbWorkersTrapper& owner ) : my_owner(owner) {}
        };

    public:
        TbbWorkersTrapper ()
            : my_context(tbb::task_group_context::bound,
                         tbb::task_group_context::default_traits | tbb::task_group_context::concurrent_wait)
        {
            my_root = new ( tbb::task::allocate_root(my_context) ) tbb::empty_task;
            my_root->set_ref_count(2);
            // The barrier counts the calling thread plus NumThreads - 1 trapper tasks
            my_barrier.initialize(NumThreads);
            for ( int i = 1; i < NumThreads; ++i )
                tbb::task::spawn( *new(tbb::task::allocate_root()) TrapperTask(*this) );
            my_barrier.wait(); // Wait until all workers are ready
        }

        ~TbbWorkersTrapper () {
            // Dropping the reference count lets the trapper tasks leave wait_for_all()
            my_root->decrement_ref_count();
            my_barrier.wait(); // Make sure no tasks are referencing us
            tbb::task::destroy(*my_root);
        }
    }; // TbbWorkersTrapper
453 
454 
//! True when the build defines __TBB_STATISTICS to a nonzero value.
/** RunTests and TimingFunctor consult it when deciding where the scheduler is
    started and whether workers are pinned and trapped. **/
#if __TBB_STATISTICS
    static bool StatisticsMode = true;
#else
    static bool StatisticsMode = false;
#endif

//! Suppresses silly warning
/** Identity function wrapped around the boolean macro arguments of START_WORKERS. **/
inline bool __TBB_bool( bool b ) { return b; }
463 
// START_WORKERS declares two locals in the enclosing scope: `init` (a deferred
// task_scheduler_init) and `trapper`. The scheduler is initialized with p threads only
// when the session uses the task scheduler, needScheduler is true, and at least one of
// setWorkersAffinity / trapWorkers is true. In that case the workers are optionally
// pinned using affinity mode a and optionally trapped by a TbbWorkersTrapper.
// Every use must be paired with STOP_WORKERS() in the same scope.
#define START_WORKERS(needScheduler, p, a, setWorkersAffinity, trapWorkers) \
    tbb::task_scheduler_init init(tbb::task_scheduler_init::deferred);      \
    TbbWorkersTrapper *trapper = NULL;                                      \
    if ( theSettings.my_opts & UseTaskScheduler                   \
         && (needScheduler) && ((setWorkersAffinity) || (trapWorkers)) )    \
    {                                                                       \
        init.initialize( p );                                               \
        if ( __TBB_bool(setWorkersAffinity) )                               \
            Affinitize( p, a );                                             \
        if ( __TBB_bool(trapWorkers) )                                      \
            trapper = new TbbWorkersTrapper;                                \
    }

// STOP_WORKERS releases what START_WORKERS set up in the same scope: it deletes the
// trapper (if any), terminates the scheduler and sleeps briefly. It does nothing if
// the scheduler was not activated.
#define STOP_WORKERS()  \
    if ( theSettings.my_opts & UseTaskScheduler && init.is_active() ) {     \
        if ( trapper )                                                      \
            delete trapper;                                                 \
        init.terminate();                                                   \
        /* Give asynchronous deinitialization time to complete */           \
        Harness::Sleep(50);                                                 \
    }
485 
    //! Pointer to a Test member function that can be timed (Run, RunSerial, Baseline)
    typedef void (Test::*RunMemFnPtr)( Test::ThreadInfo& );

    //! Per-thread timing series, indexed by the thread ID passed to TimingFunctor
    /** DoTiming temporarily redirects it to a single series for single threaded measurements. **/
    TimingSeries *TlsTimings;
    //! Barrier that lines up all master threads right before a timed section starts
    Harness::SpinBarrier  multipleMastersBarrier;
490 
    //! Thread body that times repeated invocations of a test method.
    /** Invoked with a thread ID; the samples are stored into TlsTimings[tid] and
        its statistics are computed before returning. **/
    class TimingFunctor {
        Test* my_test;
        RunConfig *my_cfg;
        //! Test method being timed
        RunMemFnPtr my_fnRun;
        //! Number of samples to collect
        size_t my_numRuns;
        //! Number of test invocations that make up one sample
        size_t my_numRepeats;
        //! Set of ORed TestMethods flags supported by my_test
        uintptr_t my_availableMethods;

        //! Performs one warm-up call and one timed call, bracketed by OnStart/OnFinish if available.
        /** Returns the duration of the timed call in seconds. Only the second call
            is timed; all masters meet at the barrier right before it. **/
        duration_t TimeSingleRun ( Test::ThreadInfo& ti ) const {
            if ( my_availableMethods & idOnStart )
                my_test->OnStart(ti);
            // Warming run
            (my_test->*my_fnRun)(ti);
            multipleMastersBarrier.wait();
            tbb::tick_count t0 = tbb::tick_count::now();
            (my_test->*my_fnRun)(ti);
            duration_t t = (tbb::tick_count::now() - t0).seconds();
            if ( my_availableMethods & idOnFinish )
                my_test->OnFinish(ti);
            return t;
        }

    public:
        TimingFunctor ( Test* test, RunConfig *cfg, RunMemFnPtr fnRun,
                        size_t numRuns, size_t nRepeats, uintptr_t availableMethods )
            : my_test(test), my_cfg(cfg), my_fnRun(fnRun)
            , my_numRuns(numRuns), my_numRepeats(nRepeats), my_availableMethods(availableMethods)
        {}

        //! Collects my_numRuns samples for thread tid.
        /** Each sample is the average duration of one test invocation. If the test
            has OnStart/OnFinish hooks, every invocation is timed separately via
            TimeSingleRun; otherwise my_numRepeats invocations are timed as a whole. **/
        void operator()( int tid ) const {
            Test::ThreadInfo ti = { tid, NULL };
            durations_t &d = TlsTimings[tid].my_durations;
            bool singleMaster = my_cfg->my_numMasters == 1;
            // Combined with the conditions inside START_WORKERS, the scheduler is started here
            // only for a single master in StatisticsMode when timing a non-serial method
            START_WORKERS( (!singleMaster || (singleMaster && StatisticsMode)) && my_fnRun != &Test::RunSerial,
                            my_cfg->my_numThreads, my_cfg->my_affinityMode, singleMaster, singleMaster );
            for ( uintptr_t k = 0; k < my_numRuns; ++k )  {
                if ( my_numRepeats > 1 ) {
                    d[k] = 0;
                    if ( my_availableMethods & idPrePostProcess ) {
                        for ( uintptr_t i = 0; i < my_numRepeats; ++i )
                            d[k] += TimeSingleRun(ti);
                    }
                    else {
                        multipleMastersBarrier.wait();
                        tbb::tick_count t0 = tbb::tick_count::now();
                        for ( uintptr_t i = 0; i < my_numRepeats; ++i )
                            (my_test->*my_fnRun)(ti);
                        d[k] = (tbb::tick_count::now() - t0).seconds();
                    }
                    // Convert the accumulated time into the time of a single invocation
                    d[k] /= my_numRepeats;
                }
                else
                    d[k] = TimeSingleRun(ti);
            }
            STOP_WORKERS();
            TlsTimings[tid].CalculateStatistics();
        }
    }; // class TimingFunctor
549 
DoTiming(TestResults & tr,RunConfig & cfg,RunMemFnPtr fnRun,size_t nRepeats,TimingSeries & ts)550     void DoTiming ( TestResults& tr, RunConfig &cfg, RunMemFnPtr fnRun, size_t nRepeats, TimingSeries& ts ) {
551         int numThreads = cfg.NumMasters();
552         size_t numRuns = ts.my_durations.size() / numThreads;
553         TimingFunctor body( tr.my_test, &cfg, fnRun, numRuns, nRepeats, tr.my_availableMethods );
554         multipleMastersBarrier.initialize(numThreads);
555         tr.my_test->SetWorkload(cfg.my_workloadID);
556         if ( numThreads == 1 ) {
557             TimingSeries *t = TlsTimings;
558             TlsTimings = &ts;
559             body(0);
560             TlsTimings = t;
561         }
562         else {
563             ts.my_durations.resize(numThreads * numRuns);
564             NativeParallelFor( numThreads, body );
565             for ( int i = 0, j = 0; i < numThreads; ++i ) {
566                 durations_t &d = TlsTimings[i].my_durations;
567                 for ( size_t k = 0; k < numRuns; ++k, ++j )
568                     ts.my_durations[j] = d[k];
569             }
570             ts.CalculateStatistics();
571         }
572     }
573 
    //! Calibrates the repeat count for the test method, then times it and stores the results in ts.
    /** Unless pfnTest is one of the baseline methods (RunSerial, Baseline), a histogram
        of the individual samples may be emitted afterwards. The histogram name comes
        from the session settings, or from the test's HistogramName() when that differs
        from DefaultHistogram; NoHistogram disables the output.
        The histogram format is: "bucket start" "number of samples in this bucket". **/
    void RunTestImpl ( TestResults& tr, RunConfig &cfg, RunMemFnPtr pfnTest, TimingSeries& ts ) {
        // nRepeats is a number of repeated calls to the test function made as
        // part of the same run. It is determined experimentally by the following
        // calibration process so that the total run time was approx. RunDuration.
        // This is helpful to increase the measurement precision in case of very
        // short tests.
        size_t nRepeats = 1;
        // A minimal stats is enough when doing calibration
        CalibrationTiming.my_durations.resize( (NumRuns < 4 ? NumRuns : 3) * cfg.NumMasters() );
        // There's no need to be too precise when calculating nRepeats. And reasonably
        // far extrapolation can speed up the process significantly.
        for (;;) {
            DoTiming( tr, cfg, pfnTest, nRepeats, CalibrationTiming );
            // Stop doubling once a whole run takes longer than 0.1 ms
            if ( CalibrationTiming.my_avgTime * nRepeats > 1e-4 )
                break;
            nRepeats *= 2;
        }
        // Extrapolate so that nRepeats calls take approximately RunDuration
        nRepeats *= (uintptr_t)ceil( RunDuration / (CalibrationTiming.my_avgTime * nRepeats) );

        DoTiming(tr, cfg, pfnTest, nRepeats, ts);

        // No histogram for baseline measurements
        if ( pfnTest != &Test::RunSerial && pfnTest != &Test::Baseline ) {
            const char* histogramName = theSettings.my_histogramName;
            if ( histogramName != NoHistogram && tr.my_test->HistogramName() != DefaultHistogram )
                histogramName = tr.my_test->HistogramName();
            if ( histogramName != NoHistogram )
                TraceHistogram( ts.my_durations, histogramName );
        }
    } // RunTestImpl
609 
    //! Action applied by WalkTests to each (test, workload, thread count, masters, affinity mode) combination
    typedef void (*TestActionFn) ( TestResults&, int mastersRange, int w, int p, int m, int a, int& numTests );
611 
TestResultIndex(int mastersRange,int w,int p,int m,int a)612     int TestResultIndex ( int mastersRange, int w, int p, int m, int a ) {
613         return ((w * (MaxThread - MinThread + 1) + (p - MinThread)) * mastersRange + m) * NumActiveAffModes + a;
614     }
615 
RunTest(TestResults & tr,int mastersRange,int w,int p,int m,int a,int & numTests)616     void RunTest ( TestResults& tr, int mastersRange, int w, int p, int m, int a, int& numTests ) {
617         size_t r = TestResultIndex(mastersRange, w, p, m, a);
618         ASSERT( r < tr.my_results.size(), NULL );
619         RunConfig &rc = tr.my_results[r].my_config;
620         rc.my_maxConcurrency = MaxConcurrency;
621         rc.my_numThreads = p;
622         rc.my_numMasters = m + tr.my_test->MinNumMasters();
623         rc.my_affinityMode = a;
624         rc.my_workloadID = w;
625         RunTestImpl( tr, rc, &Test::Run, tr.my_results[r].my_timing );
626         printf( "Running tests: %04.1f%%\r",  ++numTests * 100. / TotalConfigs ); fflush(stdout);
627     }
628 
    //! Applies fn to the run configurations of all tests in the session.
    /** Iterates over thread counts MinThread..MaxThread and the active affinity
        modes; for each pair the workers are optionally started (see START_WORKERS),
        and fn is called for every test and workload. When multipleMasters is true,
        fn is called for master indices 1..mastersRange-1; otherwise only for index 0.
        NumThreads and MaxConcurrency are updated for each thread count. **/
    void WalkTests ( TestActionFn fn, int& numTests, bool setAffinity, bool trapWorkers, bool multipleMasters ) {
        for ( int p = MinThread; p <= MaxThread; ++p ) {
            NumThreads = p;
            MaxConcurrency = p < NumCpus ? p : NumCpus;
            for ( int a = 0; a < NumActiveAffModes; ++a ) {
                // Declares `init` and `trapper` in this scope; paired with STOP_WORKERS() below
                START_WORKERS( multipleMasters || !StatisticsMode, p, a, setAffinity, trapWorkers );
                for ( size_t i = 0; i < theSession.size(); ++i ) {
                    TestResults &tr = theSession[i];
                    Test *t = tr.my_test;
                    int mastersRange = t->MaxNumMasters() - t->MinNumMasters() + 1;
                    int numWorkloads = theSettings.my_opts & UseSmallestWorkloadOnly ? 1 : t->NumWorkloads();
                    for ( int w = 0; w < numWorkloads; ++w ) {
                        if ( multipleMasters )
                            for ( int m = 1; m < mastersRange; ++m )
                                fn( tr, mastersRange, w, p, m, a, numTests );
                        else
                            fn( tr, mastersRange, w, p, 0, a, numTests );
                    }
                }
                STOP_WORKERS();
            }
        }
    }
652 
RunTests()653     void RunTests () {
654         int numTests = 0;
655         WalkTests( &RunTest, numTests, !StatisticsMode, !StatisticsMode, false );
656         if ( MaxTbbMasters > 1 )
657             WalkTests( &RunTest, numTests, true, false, true );
658     }
659 
InitTestData(TestResults & tr,int mastersRange,int w,int p,int m,int a,int &)660     void InitTestData ( TestResults& tr, int mastersRange, int w, int p, int m, int a, int& ) {
661         size_t r = TestResultIndex(mastersRange, w, p, m, a);
662         ASSERT( r < tr.my_results.size(), NULL );
663         tr.my_results[r].my_timing.my_durations.resize(
664             (theSettings.my_opts & UseTaskScheduler ? tr.my_test->MinNumMasters() + m : p) * NumRuns );
665     }
666 
    //! Scratch buffer holding the name of the workload most recently selected via SetWorkload()
    /** PrepareTests clears it before calling Test::SetWorkload and falls back to the
        workload index if it is still empty afterwards.
        NOTE(review): presumably filled in by the test through an API defined outside this part of the file. **/
    char WorkloadName[MaxWorkloadNameLen + 1];
668 
PrepareTests()669     void PrepareTests () {
670         printf( "Initializing...\r" );
671         NumActiveAffModes = theSettings.my_opts & UseAffinityModes ? NumAffinitizationModes : 1;
672         TotalConfigs = 0;
673         TitleFieldLen = strlen( TestNameColumnTitle );
674         WorkloadFieldLen = strlen( WorkloadNameColumnTitle );
675         int numThreads = MaxThread - MinThread + 1;
676         int numConfigsBase = numThreads * NumActiveAffModes;
677         int totalWorkloads = 0;
678         for ( size_t i = 0; i < theSession.size(); ++i ) {
679             TestResults &tr = theSession[i];
680             Test &t = *tr.my_test;
681             int numWorkloads = theSettings.my_opts & UseSmallestWorkloadOnly ? 1 : t.NumWorkloads();
682             int numConfigs = numConfigsBase * numWorkloads;
683             if ( t.MaxNumMasters() > 1 ) {
684                 ASSERT( theSettings.my_opts & UseTaskScheduler, "Multiple masters mode is only valid for task scheduler tests" );
685                 if ( MaxTbbMasters < t.MaxNumMasters() )
686                     MaxTbbMasters = t.MaxNumMasters();
687                 numConfigs *= t.MaxNumMasters() - t.MinNumMasters() + 1;
688             }
689             totalWorkloads += numWorkloads;
690             TotalConfigs += numConfigs;
691 
692             const char* testName = t.Name();
693             if ( testName )
694                 tr.my_testName = testName;
695             ASSERT( tr.my_testName, "Neither Test::Name() is implemented, nor RTTI is enabled" );
696             TitleFieldLen = max( TitleFieldLen, strlen(tr.my_testName) );
697 
698             tr.my_results.resize( numConfigs );
699             tr.my_serialBaselines.resize( numWorkloads );
700             tr.my_baselines.resize( numWorkloads );
701             tr.my_workloadNames.resize( numWorkloads );
702         }
703         TimingSeries tmpTiming;
704         TlsTimings = &tmpTiming; // All measurements are serial here
705         int n = 0;
706         for ( size_t i = 0; i < theSession.size(); ++i ) {
707             TestResults &tr = theSession[i];
708             Test &t = *tr.my_test;
709             // Detect which methods are overridden by the test implementation
710             g_absentMethods = 0;
711             Test::ThreadInfo ti = { 0 };
712             t.SetWorkload(0);
713             t.OnStart(ti);
714             t.RunSerial(ti);
715             t.OnFinish(ti);
716             if ( theSettings.my_opts & UseSerialBaseline && !(g_absentMethods & idRunSerial) )
717                 tr.my_availableMethods |= idRunSerial;
718             if ( !(g_absentMethods & idOnStart) )
719                 tr.my_availableMethods |= idOnStart;
720 
721             RunConfig rc = { 1, 1, 1, 0, 0 };
722             int numWorkloads = theSettings.my_opts & UseSmallestWorkloadOnly ? 1 : t.NumWorkloads();
723             for ( int w = 0; w < numWorkloads; ++w ) {
724                 WorkloadName[0] = 0;
725                 t.SetWorkload(w);
726                 if ( !WorkloadName[0] )
727                     sprintf( WorkloadName, "%d", w );
728                 size_t len = strlen(WorkloadName);
729                 tr.my_workloadNames[w] = new char[len + 1];
730                 strcpy ( (char*)tr.my_workloadNames[w], WorkloadName );
731                 WorkloadFieldLen = max( WorkloadFieldLen, len );
732 
733                 rc.my_workloadID = w;
734                 if ( theSettings.my_opts & UseBaseline )
735                     RunTestImpl( tr, rc, &Test::Baseline, tr.my_baselines[w] );
736                 if ( tr.my_availableMethods & idRunSerial )
737                     RunTestImpl( tr, rc, &Test::RunSerial, tr.my_serialBaselines[w] );
738                 printf( "Measuring baselines: %04.1f%%\r",  ++n * 100. / totalWorkloads ); fflush(stdout);
739             }
740         }
741         TlsTimings = new TimingSeries[MaxThread + MaxTbbMasters - 1];
742         if ( theSettings.my_opts & UseTaskScheduler ? MaxTbbMasters : MaxThread )
743             WalkTests( &InitTestData, n, false, false, theSettings.my_opts & UseTaskScheduler ? true : false );
744         CalibrationTiming.my_durations.reserve( MaxTbbMasters * 3 );
745         printf( "                                                          \r");
746     }
747 
748     FILE* ResFile = NULL;
749 
Report(char const * fmt,...)750     void Report ( char const* fmt, ... ) {
751         va_list args;
752         if ( ResFile ) {
753             va_start( args, fmt );
754             vfprintf( ResFile, fmt, args );
755             va_end( args );
756         }
757         va_start( args, fmt );
758         vprintf( fmt, args );
759         va_end( args );
760     }
761 
PrintResults()762     void PrintResults () {
763         if ( theSettings.my_resFile )
764             ResFile = fopen( theSettings.my_resFile, "w" );
765         Report( "%-*s %-*s %s", TitleFieldLen, "Test-name", WorkloadFieldLen, "Workload",
766                                 MaxTbbMasters > 1 ? "W    M    " : "T    " );
767         if ( theSettings.my_opts & UseAffinityModes )
768             Report( "Aff  " );
769         Report( "%-*s SD,%%  %-*s %-*s %-*s ",
770                 RateFieldLen, "Avg.time", OvhdFieldLen, "Par.ovhd,%",
771                 RateFieldLen, "Min.time", RateFieldLen, "Max.time" );
772         Report( " | Repeats = %lu, CPUs %d\n", (unsigned long)NumRuns, NumCpus );
773         for ( size_t i = 0; i < theSession.size(); ++i ) {
774             TestResults &tr = theSession[i];
775             for ( size_t j = 0; j < tr.my_results.size(); ++j ) {
776                 RunResults &rr = tr.my_results[j];
777                 RunConfig &rc = rr.my_config;
778                 int w = rc.my_workloadID;
779                 TimingSeries &ts = rr.my_timing;
780                 duration_t baselineTime = tr.my_baselines[w].my_avgTime,
781                            cleanTime = ts.my_avgTime - baselineTime;
782                 Report( "%-*s %-*s ", TitleFieldLen, tr.my_testName, WorkloadFieldLen, tr.my_workloadNames[w] );
783                 if ( MaxTbbMasters > 1 )
784                     Report( "%-4d %-4d ", rc.my_numThreads - 1, rc.my_numMasters );
785                 else
786                     Report( "%-4d ", rc.my_numThreads );
787                 if ( theSettings.my_opts & UseAffinityModes )
788                     Report( "%%-8s ", AffinitizationModeNames[rc.my_affinityMode] );
789                 Report( "%-*.2e %-6.1f ", RateFieldLen, cleanTime, ts.my_stdDev);
790                 if ( tr.my_availableMethods & idRunSerial  ) {
791                     duration_t serialTime = (tr.my_serialBaselines[w].my_avgTime - baselineTime) / rc.my_maxConcurrency;
792                     Report( "%-*.1f ", OvhdFieldLen, 100*(cleanTime - serialTime)/serialTime );
793                 }
794                 else
795                     Report( "%*s%*s ", OvhdFieldLen/2, "-", OvhdFieldLen - OvhdFieldLen/2, "" );
796                 Report( "%-*.2e %-*.2e ", RateFieldLen, ts.my_minTime - baselineTime, RateFieldLen, ts.my_maxTime - baselineTime);
797                 Report( "\n" );
798             }
799         }
800         delete [] TlsTimings;
801         if ( ResFile )
802             fclose(ResFile);
803     }
804 
RegisterTest(Test * t,const char * className,bool takeOwnership)805     __TBB_PERF_API void RegisterTest ( Test* t, const char* className, bool takeOwnership ) {
806         // Just collect test objects at this stage
807         theSession.push_back( TestResults(t, className, takeOwnership) );
808     }
809 
810 } // namespace internal
811 
Baseline(ThreadInfo &)812 __TBB_PERF_API void Test::Baseline ( ThreadInfo& ) {}
813 
RunSerial(ThreadInfo &)814 __TBB_PERF_API void Test::RunSerial ( ThreadInfo& ) { internal::g_absentMethods |= internal::idRunSerial; }
815 
OnStart(ThreadInfo &)816 __TBB_PERF_API void Test::OnStart ( ThreadInfo& ) { internal::g_absentMethods |= internal::idOnStart; }
817 
OnFinish(ThreadInfo &)818 __TBB_PERF_API void Test::OnFinish ( ThreadInfo& ) { internal::g_absentMethods |= internal::idOnFinish; }
819 
WipeCaches()820 __TBB_PERF_API void WipeCaches () { NativeParallelFor( NumCpus, internal::WiperBody() ); }
821 
EmptyFunc()822 __TBB_PERF_API void EmptyFunc () {}
AnchorFunc(void *)823 __TBB_PERF_API void AnchorFunc ( void* ) {}
AnchorFunc2(void *,void *)824 __TBB_PERF_API void AnchorFunc2 ( void*, void* ) {}
825 
SetWorkloadName(const char * format,...)826 __TBB_PERF_API void SetWorkloadName( const char* format, ... ) {
827     internal::WorkloadName[MaxWorkloadNameLen] = 0;
828     va_list args;
829     va_start(args, format);
830     vsnprintf( internal::WorkloadName, MaxWorkloadNameLen, format, args );
831     va_end(args);
832 }
833 
834 
TestMain(int argc,char * argv[],const SessionSettings * defaultSettings)835 __TBB_PERF_API int TestMain( int argc, char* argv[], const SessionSettings* defaultSettings ) {
836 #if _MSC_VER
837     HANDLE hMutex = CreateMutex( NULL, FALSE, "Global\\TBB_OMP_PerfSession" );
838     WaitForSingleObject( hMutex, INFINITE );
839 #endif
840     MinThread = MaxThread = NumCpus;
841     if ( defaultSettings )
842         theSettings = *defaultSettings;
843     ParseCommandLine( argc, argv );  // May override data in theSettings
844 
845     internal::PrepareTests ();
846     internal::RunTests ();
847     internal::PrintResults();
848     REPORT("\n");
849 #if _MSC_VER
850     ReleaseMutex( hMutex );
851     CloseHandle( hMutex );
852 #endif
853     return 0;
854 }
855 
856 } // namespace Perf
857