1/*
2Copyright (c) 2012 Advanced Micro Devices, Inc.
3
4This software is provided 'as-is', without any express or implied warranty.
5In no event will the authors be held liable for any damages arising from the use of this software.
6Permission is granted to anyone to use this software for any purpose,
7including commercial applications, and to alter it and redistribute it freely,
8subject to the following restrictions:
9
101. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
112. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
123. This notice may not be removed or altered from any source distribution.
13*/
14//Originally written by Takahiro Harada
15
16#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
17
18#pragma OPENCL EXTENSION cl_amd_printf : enable
19#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
20#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
21#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
22#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
23
24#ifdef cl_ext_atomic_counters_32
25#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
26#else
27#define counter32_t volatile __global int*
28#endif
29
30
31typedef unsigned int u32;
32typedef unsigned short u16;
33typedef unsigned char u8;
34
35#define GET_GROUP_IDX get_group_id(0)
36#define GET_LOCAL_IDX get_local_id(0)
37#define GET_GLOBAL_IDX get_global_id(0)
38#define GET_GROUP_SIZE get_local_size(0)
39#define GET_NUM_GROUPS get_num_groups(0)
40#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
41#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
42#define AtomInc(x) atom_inc(&(x))
43#define AtomInc1(x, out) out = atom_inc(&(x))
44#define AppendInc(x, out) out = atomic_inc(x)
45#define AtomAdd(x, value) atom_add(&(x), value)
46#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
47#define AtomXhg(x, value) atom_xchg ( &(x), value )
48
49
50#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
51
52#define make_float4 (float4)
53#define make_float2 (float2)
54#define make_uint4 (uint4)
55#define make_int4 (int4)
56#define make_uint2 (uint2)
57#define make_int2 (int2)
58
59
60#define max2 max
61#define min2 min
62
63
64#define WG_SIZE 64
65
66
67
68
69
70typedef struct
71{
72	int m_n;
73	int m_start;
74	int m_staticIdx;
75	int m_paddings[1];
76} ConstBuffer;
77
78typedef struct
79{
80	int m_a;
81	int m_b;
82	u32 m_idx;
83}Elem;
84
85#define STACK_SIZE (WG_SIZE*10)
86//#define STACK_SIZE (WG_SIZE)
87#define RING_SIZE 1024
88#define RING_SIZE_MASK (RING_SIZE-1)
89#define CHECK_SIZE (WG_SIZE)
90
91
92#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)
93#define RING_END ldsTmp
94
95u32 readBuf(__local u32* buff, int idx)
96{
97	idx = idx % (32*CHECK_SIZE);
98	int bitIdx = idx%32;
99	int bufIdx = idx/32;
100	return buff[bufIdx] & (1<<bitIdx);
101}
102
103void writeBuf(__local u32* buff, int idx)
104{
105	idx = idx % (32*CHECK_SIZE);
106	int bitIdx = idx%32;
107	int bufIdx = idx/32;
108//	buff[bufIdx] |= (1<<bitIdx);
109	atom_or( &buff[bufIdx], (1<<bitIdx) );
110}
111
112u32 tryWrite(__local u32* buff, int idx)
113{
114	idx = idx % (32*CHECK_SIZE);
115	int bitIdx = idx%32;
116	int bufIdx = idx/32;
117	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );
118	return ((ans >> bitIdx)&1) == 0;
119}
120
121//	batching on the GPU
122__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,
123		__global const u32* gN, __global const u32* gStart, __global int* batchSizes,
124		int m_staticIdx )
125{
126	__local u32 ldsStackIdx[STACK_SIZE];
127	__local u32 ldsStackEnd;
128	__local Elem ldsRingElem[RING_SIZE];
129	__local u32 ldsRingEnd;
130	__local u32 ldsTmp;
131	__local u32 ldsCheckBuffer[CHECK_SIZE];
132	__local u32 ldsFixedBuffer[CHECK_SIZE];
133	__local u32 ldsGEnd;
134	__local u32 ldsDstEnd;
135
136	int wgIdx = GET_GROUP_IDX;
137	int lIdx = GET_LOCAL_IDX;
138
139	const int m_n = gN[wgIdx];
140	const int m_start = gStart[wgIdx];
141
142	if( lIdx == 0 )
143	{
144		ldsRingEnd = 0;
145		ldsGEnd = 0;
146		ldsStackEnd = 0;
147		ldsDstEnd = m_start;
148	}
149
150
151
152//	while(1)
153//was 250
154	int ie=0;
155	int maxBatch = 0;
156	for(ie=0; ie<50; ie++)
157	{
158		ldsFixedBuffer[lIdx] = 0;
159
160		for(int giter=0; giter<4; giter++)
161		{
162			int ringCap = GET_RING_CAPACITY;
163
164			//	1. fill ring
165			if( ldsGEnd < m_n )
166			{
167				while( ringCap > WG_SIZE )
168				{
169					if( ldsGEnd >= m_n ) break;
170					if( lIdx < ringCap - WG_SIZE )
171					{
172						int srcIdx;
173						AtomInc1( ldsGEnd, srcIdx );
174						if( srcIdx < m_n )
175						{
176							int dstIdx;
177							AtomInc1( ldsRingEnd, dstIdx );
178
179							int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;
180							int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;
181							ldsRingElem[dstIdx].m_a = (a>b)? b:a;
182							ldsRingElem[dstIdx].m_b = (a>b)? a:b;
183							ldsRingElem[dstIdx].m_idx = srcIdx;
184						}
185					}
186					ringCap = GET_RING_CAPACITY;
187				}
188			}
189
190			GROUP_LDS_BARRIER;
191
192			//	2. fill stack
193			__local Elem* dst = ldsRingElem;
194			if( lIdx == 0 ) RING_END = 0;
195
196			int srcIdx=lIdx;
197			int end = ldsRingEnd;
198
199			{
200				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)
201				{
202					Elem e;
203					if(srcIdx<end) e = ldsRingElem[srcIdx];
204					bool done = (srcIdx<end)?false:true;
205
206					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;
207
208					if( !done )
209					{
210						int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));
211						int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));
212
213						if( aUsed==0 && bUsed==0 )
214						{
215							int aAvailable=1;
216							int bAvailable=1;
217							int ea = abs(e.m_a);
218							int eb = abs(e.m_b);
219
220							bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);
221							bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);
222
223							if (!aStatic)
224								aAvailable = tryWrite( ldsCheckBuffer, ea );
225							if (!bStatic)
226								bAvailable = tryWrite( ldsCheckBuffer, eb );
227
228							//aAvailable = aStatic? 1: aAvailable;
229							//bAvailable = bStatic? 1: bAvailable;
230
231							bool success = (aAvailable && bAvailable);
232							if(success)
233							{
234
235								if (!aStatic)
236									writeBuf( ldsFixedBuffer, ea );
237								if (!bStatic)
238									writeBuf( ldsFixedBuffer, eb );
239							}
240							done = success;
241						}
242					}
243
244					//	put it aside
245					if(srcIdx<end)
246					{
247						if( done )
248						{
249							int dstIdx; AtomInc1( ldsStackEnd, dstIdx );
250							if( dstIdx < STACK_SIZE )
251								ldsStackIdx[dstIdx] = e.m_idx;
252							else{
253								done = false;
254								AtomAdd( ldsStackEnd, -1 );
255							}
256						}
257						if( !done )
258						{
259							int dstIdx; AtomInc1( RING_END, dstIdx );
260							dst[dstIdx] = e;
261						}
262					}
263
264					//	if filled, flush
265					if( ldsStackEnd == STACK_SIZE )
266					{
267						for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)
268						{
269							int idx = m_start + ldsStackIdx[i];
270							int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
271							gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
272							gConstraintsOut[ dstIdx ].m_batchIdx = ie;
273						}
274						if( lIdx == 0 ) ldsStackEnd = 0;
275
276						//for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE)
277						ldsFixedBuffer[lIdx] = 0;
278					}
279				}
280			}
281
282			if( lIdx == 0 ) ldsRingEnd = RING_END;
283		}
284
285		GROUP_LDS_BARRIER;
286
287		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)
288		{
289			int idx = m_start + ldsStackIdx[i];
290			int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
291			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
292			gConstraintsOut[ dstIdx ].m_batchIdx = ie;
293		}
294
295		//	in case it couldn't consume any pair. Flush them
296		//	todo. Serial batch worth while?
297		if( ldsStackEnd == 0 )
298		{
299			for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)
300			{
301				int idx = m_start + ldsRingElem[i].m_idx;
302				int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
303				gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
304				int curBatch = 100+i;
305				if (maxBatch < curBatch)
306					maxBatch = curBatch;
307
308				gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;
309
310			}
311			GROUP_LDS_BARRIER;
312			if( lIdx == 0 ) ldsRingEnd = 0;
313		}
314
315		if( lIdx == 0 ) ldsStackEnd = 0;
316
317		GROUP_LDS_BARRIER;
318
319		//	termination
320		if( ldsGEnd == m_n && ldsRingEnd == 0 )
321			break;
322	}
323
324	if( lIdx == 0 )
325	{
326		if (maxBatch < ie)
327			maxBatch=ie;
328		batchSizes[wgIdx]=maxBatch;
329	}
330
331}
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354