/*
Copyright (c) 2003-2014 Erwin Coumans  http://bullet.googlecode.com

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/

#include "btThreads.h"
#include "btQuickprof.h"
#include <algorithm>  // for min and max

#if BT_USE_OPENMP && BT_THREADSAFE

#include <omp.h>

#endif  // #if BT_USE_OPENMP && BT_THREADSAFE

#if BT_USE_PPL && BT_THREADSAFE

// use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later)
#include <ppl.h>  // if you get a compile error here, check whether your version of Visual Studio includes PPL
// Visual Studio 2010 and later should come with it
#include <concrtrm.h>  // for GetProcessorCount()

#endif  // #if BT_USE_PPL && BT_THREADSAFE

#if BT_USE_TBB && BT_THREADSAFE

// use Intel Threading Building Blocks for thread management
#define __TBB_NO_IMPLICIT_LINKAGE 1
#include <tbb/tbb.h>
#include <tbb/task_scheduler_init.h>
#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>

#endif  // #if BT_USE_TBB && BT_THREADSAFE

#if BT_THREADSAFE
//
// Lightweight spin-mutex based on atomics
// Ordinary system-provided mutexes such as Windows critical sections proved noticeably slower,
// presumably because, when the initial lock attempt fails, they put the thread to sleep and
// trigger costly context switches.
//
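//
// A minimal usage sketch (illustrative only, not called by the library itself):
//
//   static btSpinMutex sMutex;
//   static int sSharedCounter = 0;
//
//   void incrementShared()
//   {
//       sMutex.lock();     // spins (does not sleep) until the lock is acquired
//       ++sSharedCounter;  // keep the critical section short
//       sMutex.unlock();
//   }
//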

#if __cplusplus >= 201103L

// for anything claiming full C++11 compliance, use C++11 atomics
// on GCC or Clang you need to compile with -std=c++11
#define USE_CPP11_ATOMICS 1

#elif defined(_MSC_VER)

// on MSVC, use intrinsics instead
#define USE_MSVC_INTRINSICS 1

#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))

// available since GCC 4.7 and some versions of clang
// todo: check for clang
#define USE_GCC_BUILTIN_ATOMICS 1

#elif defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)

// available since GCC 4.1
#define USE_GCC_BUILTIN_ATOMICS_OLD 1

#endif

#if USE_CPP11_ATOMICS

#include <atomic>
#include <thread>

#define THREAD_LOCAL_STATIC thread_local static

bool btSpinMutex::tryLock()
{
	std::atomic<int>* aDest = reinterpret_cast<std::atomic<int>*>(&mLock);
	int expected = 0;
	return std::atomic_compare_exchange_weak_explicit(aDest, &expected, int(1), std::memory_order_acq_rel, std::memory_order_acquire);
}

void btSpinMutex::lock()
{
	// note: this lock does not sleep the thread.
	while (!tryLock())
	{
		// spin
	}
}

void btSpinMutex::unlock()
{
	std::atomic<int>* aDest = reinterpret_cast<std::atomic<int>*>(&mLock);
	std::atomic_store_explicit(aDest, int(0), std::memory_order_release);
}

#elif USE_MSVC_INTRINSICS

#define WIN32_LEAN_AND_MEAN

#include <windows.h>
#include <intrin.h>

#define THREAD_LOCAL_STATIC __declspec(thread) static

bool btSpinMutex::tryLock()
{
	volatile long* aDest = reinterpret_cast<long*>(&mLock);
	return (0 == _InterlockedCompareExchange(aDest, 1, 0));
}

void btSpinMutex::lock()
{
	// note: this lock does not sleep the thread
	while (!tryLock())
	{
		// spin
	}
}

void btSpinMutex::unlock()
{
	volatile long* aDest = reinterpret_cast<long*>(&mLock);
	_InterlockedExchange(aDest, 0);
}

#elif USE_GCC_BUILTIN_ATOMICS

#define THREAD_LOCAL_STATIC static __thread

bool btSpinMutex::tryLock()
{
	int expected = 0;
	bool weak = false;
	const int memOrderSuccess = __ATOMIC_ACQ_REL;
	const int memOrderFail = __ATOMIC_ACQUIRE;
	return __atomic_compare_exchange_n(&mLock, &expected, int(1), weak, memOrderSuccess, memOrderFail);
}

void btSpinMutex::lock()
{
	// note: this lock does not sleep the thread
	while (!tryLock())
	{
		// spin
	}
}

void btSpinMutex::unlock()
{
	__atomic_store_n(&mLock, int(0), __ATOMIC_RELEASE);
}

#elif USE_GCC_BUILTIN_ATOMICS_OLD

#define THREAD_LOCAL_STATIC static __thread

bool btSpinMutex::tryLock()
{
	return __sync_bool_compare_and_swap(&mLock, int(0), int(1));
}

void btSpinMutex::lock()
{
	// note: this lock does not sleep the thread
	while (!tryLock())
	{
		// spin
	}
}

void btSpinMutex::unlock()
{
	// write 0
	__sync_fetch_and_and(&mLock, int(0));
}

#else  //#elif USE_MSVC_INTRINSICS

#error "no threading primitives defined -- unknown platform"

#endif  //#else //#elif USE_MSVC_INTRINSICS

#else  //#if BT_THREADSAFE

// These should never be called
void btSpinMutex::lock()
{
	btAssert(!"unimplemented btSpinMutex::lock() called");
}

void btSpinMutex::unlock()
{
	btAssert(!"unimplemented btSpinMutex::unlock() called");
}

bool btSpinMutex::tryLock()
{
	btAssert(!"unimplemented btSpinMutex::tryLock() called");
	return true;
}

#define THREAD_LOCAL_STATIC static

#endif  // #else //#if BT_THREADSAFE

struct ThreadsafeCounter
{
	unsigned int mCounter;
	btSpinMutex mMutex;

	ThreadsafeCounter()
	{
		mCounter = 0;
		--mCounter;  // first count should come back 0
	}

	unsigned int getNext()
	{
		// no need to optimize this with atomics, it is only called ONCE per thread!
		mMutex.lock();
		mCounter++;
		if (mCounter >= BT_MAX_THREAD_COUNT)
		{
			btAssert(!"thread counter exceeded");
			// wrap back to the first worker index
			mCounter = 1;
		}
		unsigned int val = mCounter;
		mMutex.unlock();
		return val;
	}
};

static btITaskScheduler* gBtTaskScheduler = 0;
static int gThreadsRunningCounter = 0;  // useful for detecting if we are trying to do nested parallel-for calls
static btSpinMutex gThreadsRunningCounterMutex;
static ThreadsafeCounter gThreadCounter;

//
// BT_DETECT_BAD_THREAD_INDEX tries to detect when multiple threads are assigned the same thread index.
//
// BT_DETECT_BAD_THREAD_INDEX is a developer option to test whether
// certain assumptions about how the task scheduler manages its threads
// hold true.
// The main assumption is:
//   - when the threadpool is resized, the task scheduler either
//      1. destroys all worker threads and creates all new ones in the correct number, OR
//      2. never destroys a worker thread
//
// We make that assumption because we can't easily enumerate the worker threads of a task scheduler
// to assign nice sequential thread-indexes. We also do not get notified if a worker thread is destroyed,
// so we can't tell when a thread-index is no longer being used.
// We allocate thread-indexes as needed with a sequential global thread counter.
//
// Our simple thread-counting scheme falls apart if the task scheduler destroys some threads but
// continues to re-use other threads and the application repeatedly resizes the thread pool of the
// task scheduler.
// In order to prevent the thread-counter from exceeding the global max (BT_MAX_THREAD_COUNT), we
// wrap the thread counter back to 1. This should only happen if the worker threads have all been
// destroyed and re-created.
//
// BT_DETECT_BAD_THREAD_INDEX only works for Win32 right now,
// but could be adapted to work with pthreads.
#define BT_DETECT_BAD_THREAD_INDEX 0

#if BT_DETECT_BAD_THREAD_INDEX

typedef DWORD ThreadId_t;
const static ThreadId_t kInvalidThreadId = 0;
ThreadId_t gDebugThreadIds[BT_MAX_THREAD_COUNT];

static ThreadId_t getDebugThreadId()
{
	return GetCurrentThreadId();
}

#endif  // #if BT_DETECT_BAD_THREAD_INDEX

// return a unique index per thread, main thread is 0, worker threads are in [1, BT_MAX_THREAD_COUNT)
unsigned int btGetCurrentThreadIndex()
{
	const unsigned int kNullIndex = ~0U;
	THREAD_LOCAL_STATIC unsigned int sThreadIndex = kNullIndex;
	if (sThreadIndex == kNullIndex)
	{
		sThreadIndex = gThreadCounter.getNext();
		btAssert(sThreadIndex < BT_MAX_THREAD_COUNT);
	}
#if BT_DETECT_BAD_THREAD_INDEX
	if (gBtTaskScheduler && sThreadIndex > 0)
	{
		ThreadId_t tid = getDebugThreadId();
		// if not set
		if (gDebugThreadIds[sThreadIndex] == kInvalidThreadId)
		{
			// set it
			gDebugThreadIds[sThreadIndex] = tid;
		}
		else
		{
			if (gDebugThreadIds[sThreadIndex] != tid)
			{
				// this could indicate the task scheduler is breaking our assumptions about
				// how threads are managed when threadpool is resized
				btAssert(!"there are 2 or more threads with the same thread-index!");
				__debugbreak();
			}
		}
	}
#endif  // #if BT_DETECT_BAD_THREAD_INDEX
	return sThreadIndex;
}
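
// Illustrative usage sketch (hypothetical caller code, not used by the library itself):
// per-thread scratch storage is typically indexed by btGetCurrentThreadIndex().
//
//   static MyScratchData sScratch[BT_MAX_THREAD_COUNT];              // MyScratchData is a hypothetical type
//   MyScratchData& scratch = sScratch[btGetCurrentThreadIndex()];    // each running thread gets a unique slot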

bool btIsMainThread()
{
	return btGetCurrentThreadIndex() == 0;
}

void btResetThreadIndexCounter()
{
	// for when all current worker threads are destroyed
	btAssert(btIsMainThread());
	gThreadCounter.mCounter = 0;
}

btITaskScheduler::btITaskScheduler(const char* name)
{
	m_name = name;
	m_savedThreadCounter = 0;
	m_isActive = false;
}

void btITaskScheduler::activate()
{
	// gThreadCounter is used to assign a thread-index to each worker thread in a task scheduler.
	// The main thread is always thread-index 0, and worker threads are numbered from 1 to 63 (BT_MAX_THREAD_COUNT-1)
	// The thread-indexes need to be unique amongst the threads that can be running simultaneously.
	// Since only one task scheduler can be used at a time, it is OK for a pair of threads that belong to different
	// task schedulers to share the same thread index because they can't be running at the same time.
	// So each task scheduler needs to keep its own thread counter value
	if (!m_isActive)
	{
		gThreadCounter.mCounter = m_savedThreadCounter;  // restore saved thread counter
		m_isActive = true;
	}
}

void btITaskScheduler::deactivate()
{
	if (m_isActive)
	{
		m_savedThreadCounter = gThreadCounter.mCounter;  // save thread counter
		m_isActive = false;
	}
}

void btPushThreadsAreRunning()
{
	gThreadsRunningCounterMutex.lock();
	gThreadsRunningCounter++;
	gThreadsRunningCounterMutex.unlock();
}

void btPopThreadsAreRunning()
{
	gThreadsRunningCounterMutex.lock();
	gThreadsRunningCounter--;
	gThreadsRunningCounterMutex.unlock();
}

bool btThreadsAreRunning()
{
	return gThreadsRunningCounter != 0;
}

void btSetTaskScheduler(btITaskScheduler* ts)
{
	int threadId = btGetCurrentThreadIndex();  // make sure we call this on main thread at least once before any workers run
	if (threadId != 0)
	{
		btAssert(!"btSetTaskScheduler must be called from the main thread!");
		return;
	}
	if (gBtTaskScheduler)
	{
		// deactivate old task scheduler
		gBtTaskScheduler->deactivate();
	}
	gBtTaskScheduler = ts;
	if (ts)
	{
		// activate new task scheduler
		ts->activate();
	}
}

btITaskScheduler* btGetTaskScheduler()
{
	return gBtTaskScheduler;
}

void btParallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body)
{
#if BT_THREADSAFE

#if BT_DETECT_BAD_THREAD_INDEX
	if (!btThreadsAreRunning())
	{
		// clear out thread ids
		for (int i = 0; i < BT_MAX_THREAD_COUNT; ++i)
		{
			gDebugThreadIds[i] = kInvalidThreadId;
		}
	}
#endif  // #if BT_DETECT_BAD_THREAD_INDEX

	btAssert(gBtTaskScheduler != NULL);  // call btSetTaskScheduler() with a valid task scheduler first!
	gBtTaskScheduler->parallelFor(iBegin, iEnd, grainSize, body);

#else  // #if BT_THREADSAFE

	// non-parallel version of btParallelFor
	btAssert(!"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE");
	body.forLoop(iBegin, iEnd);

#endif  // #if BT_THREADSAFE
}

btScalar btParallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body)
{
#if BT_THREADSAFE

#if BT_DETECT_BAD_THREAD_INDEX
	if (!btThreadsAreRunning())
	{
		// clear out thread ids
		for (int i = 0; i < BT_MAX_THREAD_COUNT; ++i)
		{
			gDebugThreadIds[i] = kInvalidThreadId;
		}
	}
#endif  // #if BT_DETECT_BAD_THREAD_INDEX

	btAssert(gBtTaskScheduler != NULL);  // call btSetTaskScheduler() with a valid task scheduler first!
	return gBtTaskScheduler->parallelSum(iBegin, iEnd, grainSize, body);

#else  // #if BT_THREADSAFE

	// non-parallel version of btParallelSum
	btAssert(!"called btParallelSum in non-threadsafe build. enable BT_THREADSAFE");
	return body.sumLoop(iBegin, iEnd);

#endif  //#else // #if BT_THREADSAFE
}
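
// Illustrative usage sketch (hypothetical caller code): work is dispatched by deriving
// from btIParallelForBody and calling btParallelFor once a task scheduler has been set.
// The body type, data, and grain size below are assumptions for illustration only.
//
//   struct ScaleBody : public btIParallelForBody
//   {
//       btScalar* mData;
//       virtual void forLoop(int iBegin, int iEnd) const BT_OVERRIDE
//       {
//           for (int i = iBegin; i < iEnd; ++i)
//           {
//               mData[i] *= btScalar(2);  // each job handles [iBegin, iEnd) independently
//           }
//       }
//   };
//
//   ScaleBody body;
//   body.mData = myArray;               // myArray is hypothetical
//   btParallelFor(0, count, 64, body);  // grainSize of 64 elements per job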

///
/// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
///                              (really just useful for testing performance of single threaded vs multi)
///
class btTaskSchedulerSequential : public btITaskScheduler
{
public:
	btTaskSchedulerSequential() : btITaskScheduler("Sequential") {}
	virtual int getMaxNumThreads() const BT_OVERRIDE { return 1; }
	virtual int getNumThreads() const BT_OVERRIDE { return 1; }
	virtual void setNumThreads(int numThreads) BT_OVERRIDE {}
	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_sequential");
		body.forLoop(iBegin, iEnd);
	}
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelSum_sequential");
		return body.sumLoop(iBegin, iEnd);
	}
};

#if BT_USE_OPENMP && BT_THREADSAFE
///
/// btTaskSchedulerOpenMP -- wrapper around OpenMP task scheduler
///
class btTaskSchedulerOpenMP : public btITaskScheduler
{
	int m_numThreads;

public:
	btTaskSchedulerOpenMP() : btITaskScheduler("OpenMP")
	{
		m_numThreads = 0;
	}
	virtual int getMaxNumThreads() const BT_OVERRIDE
	{
		return omp_get_max_threads();
	}
	virtual int getNumThreads() const BT_OVERRIDE
	{
		return m_numThreads;
	}
	virtual void setNumThreads(int numThreads) BT_OVERRIDE
	{
		// Because OpenMP is a standard with multiple implementations, we can't be sure that
		// every implementation destroys all of its previous threads when the threadpool is resized.
		m_numThreads = (std::max)(1, (std::min)(int(BT_MAX_THREAD_COUNT), numThreads));
		omp_set_num_threads(1);  // hopefully, all previous threads get destroyed here
		omp_set_num_threads(m_numThreads);
		m_savedThreadCounter = 0;
		if (m_isActive)
		{
			btResetThreadIndexCounter();
		}
	}
	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_OpenMP");
		btPushThreadsAreRunning();
#pragma omp parallel for schedule(static, 1)
		for (int i = iBegin; i < iEnd; i += grainSize)
		{
			BT_PROFILE("OpenMP_forJob");
			body.forLoop(i, (std::min)(i + grainSize, iEnd));
		}
		btPopThreadsAreRunning();
	}
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelSum_OpenMP");
		btPushThreadsAreRunning();
		btScalar sum = btScalar(0);
#pragma omp parallel for schedule(static, 1) reduction(+ : sum)
		for (int i = iBegin; i < iEnd; i += grainSize)
		{
			BT_PROFILE("OpenMP_sumJob");
			sum += body.sumLoop(i, (std::min)(i + grainSize, iEnd));
		}
		btPopThreadsAreRunning();
		return sum;
	}
};
#endif  // #if BT_USE_OPENMP && BT_THREADSAFE

#if BT_USE_TBB && BT_THREADSAFE
///
/// btTaskSchedulerTBB -- wrapper around Intel Threaded Building Blocks task scheduler
///
class btTaskSchedulerTBB : public btITaskScheduler
{
	int m_numThreads;
	tbb::task_scheduler_init* m_tbbSchedulerInit;

public:
	btTaskSchedulerTBB() : btITaskScheduler("IntelTBB")
	{
		m_numThreads = 0;
		m_tbbSchedulerInit = NULL;
	}
	~btTaskSchedulerTBB()
	{
		if (m_tbbSchedulerInit)
		{
			delete m_tbbSchedulerInit;
			m_tbbSchedulerInit = NULL;
		}
	}

	virtual int getMaxNumThreads() const BT_OVERRIDE
	{
		return tbb::task_scheduler_init::default_num_threads();
	}
	virtual int getNumThreads() const BT_OVERRIDE
	{
		return m_numThreads;
	}
	virtual void setNumThreads(int numThreads) BT_OVERRIDE
	{
		m_numThreads = (std::max)(1, (std::min)(int(BT_MAX_THREAD_COUNT), numThreads));
		if (m_tbbSchedulerInit)
		{
			// destroys all previous threads
			delete m_tbbSchedulerInit;
			m_tbbSchedulerInit = NULL;
		}
		m_tbbSchedulerInit = new tbb::task_scheduler_init(m_numThreads);
		m_savedThreadCounter = 0;
		if (m_isActive)
		{
			btResetThreadIndexCounter();
		}
	}
	struct ForBodyAdapter
	{
		const btIParallelForBody* mBody;

		ForBodyAdapter(const btIParallelForBody* body) : mBody(body) {}
		void operator()(const tbb::blocked_range<int>& range) const
		{
			BT_PROFILE("TBB_forJob");
			mBody->forLoop(range.begin(), range.end());
		}
	};
	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_TBB");
		ForBodyAdapter tbbBody(&body);
		btPushThreadsAreRunning();
		tbb::parallel_for(tbb::blocked_range<int>(iBegin, iEnd, grainSize),
						  tbbBody,
						  tbb::simple_partitioner());
		btPopThreadsAreRunning();
	}
	struct SumBodyAdapter
	{
		const btIParallelSumBody* mBody;
		btScalar mSum;

		SumBodyAdapter(const btIParallelSumBody* body) : mBody(body), mSum(btScalar(0)) {}
		SumBodyAdapter(const SumBodyAdapter& src, tbb::split) : mBody(src.mBody), mSum(btScalar(0)) {}
		void join(const SumBodyAdapter& src) { mSum += src.mSum; }
		void operator()(const tbb::blocked_range<int>& range)
		{
			BT_PROFILE("TBB_sumJob");
			mSum += mBody->sumLoop(range.begin(), range.end());
		}
	};
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelSum_TBB");
		SumBodyAdapter tbbBody(&body);
		btPushThreadsAreRunning();
		tbb::parallel_deterministic_reduce(tbb::blocked_range<int>(iBegin, iEnd, grainSize), tbbBody);
		btPopThreadsAreRunning();
		return tbbBody.mSum;
	}
};
#endif  // #if BT_USE_TBB && BT_THREADSAFE

#if BT_USE_PPL && BT_THREADSAFE
///
/// btTaskSchedulerPPL -- wrapper around Microsoft Parallel Patterns Lib task scheduler
///
class btTaskSchedulerPPL : public btITaskScheduler
{
	int m_numThreads;
	concurrency::combinable<btScalar> m_sum;  // for parallelSum
public:
	btTaskSchedulerPPL() : btITaskScheduler("PPL")
	{
		m_numThreads = 0;
	}
	virtual int getMaxNumThreads() const BT_OVERRIDE
	{
		return concurrency::GetProcessorCount();
	}
	virtual int getNumThreads() const BT_OVERRIDE
	{
		return m_numThreads;
	}
	virtual void setNumThreads(int numThreads) BT_OVERRIDE
	{
		// capping the thread count for PPL due to a thread-index issue
		const int maxThreadCount = (std::min)(int(BT_MAX_THREAD_COUNT), 31);
		m_numThreads = (std::max)(1, (std::min)(maxThreadCount, numThreads));
		using namespace concurrency;
		if (CurrentScheduler::Id() != -1)
		{
			CurrentScheduler::Detach();
		}
		SchedulerPolicy policy;
		{
			// PPL seems to destroy threads when threadpool is shrunk, but keeps reusing old threads
			// force it to destroy old threads
			policy.SetConcurrencyLimits(1, 1);
			CurrentScheduler::Create(policy);
			CurrentScheduler::Detach();
		}
		policy.SetConcurrencyLimits(m_numThreads, m_numThreads);
		CurrentScheduler::Create(policy);
		m_savedThreadCounter = 0;
		if (m_isActive)
		{
			btResetThreadIndexCounter();
		}
	}
	struct ForBodyAdapter
	{
		const btIParallelForBody* mBody;
		int mGrainSize;
		int mIndexEnd;

		ForBodyAdapter(const btIParallelForBody* body, int grainSize, int end) : mBody(body), mGrainSize(grainSize), mIndexEnd(end) {}
		void operator()(int i) const
		{
			BT_PROFILE("PPL_forJob");
			mBody->forLoop(i, (std::min)(i + mGrainSize, mIndexEnd));
		}
	};
	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_PPL");
		// PPL dispatch
		ForBodyAdapter pplBody(&body, grainSize, iEnd);
		btPushThreadsAreRunning();
		// note: MSVC 2010 doesn't support partitioner args, so avoid them
		concurrency::parallel_for(iBegin,
								  iEnd,
								  grainSize,
								  pplBody);
		btPopThreadsAreRunning();
	}
	struct SumBodyAdapter
	{
		const btIParallelSumBody* mBody;
		concurrency::combinable<btScalar>* mSum;
		int mGrainSize;
		int mIndexEnd;

		SumBodyAdapter(const btIParallelSumBody* body, concurrency::combinable<btScalar>* sum, int grainSize, int end) : mBody(body), mSum(sum), mGrainSize(grainSize), mIndexEnd(end) {}
		void operator()(int i) const
		{
			BT_PROFILE("PPL_sumJob");
			mSum->local() += mBody->sumLoop(i, (std::min)(i + mGrainSize, mIndexEnd));
		}
	};
	static btScalar sumFunc(btScalar a, btScalar b) { return a + b; }
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelSum_PPL");
		m_sum.clear();
		SumBodyAdapter pplBody(&body, &m_sum, grainSize, iEnd);
		btPushThreadsAreRunning();
		// note: MSVC 2010 doesn't support partitioner args, so avoid them
		concurrency::parallel_for(iBegin,
								  iEnd,
								  grainSize,
								  pplBody);
		btPopThreadsAreRunning();
		return m_sum.combine(sumFunc);
	}
};
#endif  // #if BT_USE_PPL && BT_THREADSAFE

// create a non-threaded task scheduler (always available)
btITaskScheduler* btGetSequentialTaskScheduler()
{
	static btTaskSchedulerSequential sTaskScheduler;
	return &sTaskScheduler;
}

// create an OpenMP task scheduler (if available, otherwise returns null)
btITaskScheduler* btGetOpenMPTaskScheduler()
{
#if BT_USE_OPENMP && BT_THREADSAFE
	static btTaskSchedulerOpenMP sTaskScheduler;
	return &sTaskScheduler;
#else
	return NULL;
#endif
}

// create an Intel TBB task scheduler (if available, otherwise returns null)
btITaskScheduler* btGetTBBTaskScheduler()
{
#if BT_USE_TBB && BT_THREADSAFE
	static btTaskSchedulerTBB sTaskScheduler;
	return &sTaskScheduler;
#else
	return NULL;
#endif
}

// create a PPL task scheduler (if available, otherwise returns null)
btITaskScheduler* btGetPPLTaskScheduler()
{
#if BT_USE_PPL && BT_THREADSAFE
	static btTaskSchedulerPPL sTaskScheduler;
	return &sTaskScheduler;
#else
	return NULL;
#endif
}
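
// Illustrative setup sketch (hypothetical application code, not executed by the library):
// pick the first available multithreaded scheduler, fall back to the sequential one,
// and install it before issuing any btParallelFor / btParallelSum calls.
//
//   btITaskScheduler* scheduler = btGetOpenMPTaskScheduler();
//   if (!scheduler) scheduler = btGetTBBTaskScheduler();
//   if (!scheduler) scheduler = btGetPPLTaskScheduler();
//   if (!scheduler) scheduler = btGetSequentialTaskScheduler();  // always available
//   scheduler->setNumThreads(scheduler->getMaxNumThreads());
//   btSetTaskScheduler(scheduler);  // must be called from the main thread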