1/*
2 * Copyright (C) 2018-2021 Intel Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 */
7
8#ifndef SCHEDULER_EMULATION
9#include "device_enqueue.h"
10#endif
11
12// float passed as int
13extern float __intel__getProfilingTimerResolution();
14
15#ifndef EMULATION_ENTER_FUNCTION
16#define EMULATION_ENTER_FUNCTION( )
17#endif
18
19#ifndef NULL
20#define NULL                                    0
21#endif
22
23#define SIMD8                                   0
24#define SIMD16                                  1
25#define SIMD32                                  2
26
27#define SHORT_SIZE_IN_BYTES                     2
28#define DWORD_SIZE_IN_BYTES                     4
29#define QWORD_SIZE_IN_BYTES                     8
30
31#define MAX_GLOBAL_ARGS                         255
32
33#define MASK_LOW16_BITS                         0xFFFF
34//Currently setting to 8
35#define MAX_WALKERS_IN_PARALELL                 PARALLEL_SCHEDULER_HW_GROUPS
//Need 4 uints per walker ( command packet offset, slb offset, dsh offset, idt offset ), + 1 to store the total
37#define PARALLEL_SCHEDULER_OFFSETS_NUMBER       4
38#define PARALLEL_SCHEDULER_LOCAL_MEM_SIZE       ( MAX_WALKERS_IN_PARALELL * PARALLEL_SCHEDULER_OFFSETS_NUMBER + 1 )
39//Last index
40#define TOTAL_ENQUEUES_FOUND                    ( PARALLEL_SCHEDULER_LOCAL_MEM_SIZE - 1 )
41
42//CURBE STUFF, only entries that really needs to be patched
43#define SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT                                    1
44#define SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE                                    2
45#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE                                   3
46#define SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS                                    4
47#define SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS                                    5
48#define SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES          8
49#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET                                 16
50#define SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS                               17
51#define SCHEDULER_DATA_PARAMETER_PARENT_EVENT                                       22
52#define SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE                           28
53
54#define SCHEDULER_DATA_PARAMETER_IMAGE_WIDTH                                        ( 9 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
55#define SCHEDULER_DATA_PARAMETER_IMAGE_HEIGHT                                       ( 10 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
56#define SCHEDULER_DATA_PARAMETER_IMAGE_DEPTH                                        ( 11 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
57#define SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE                            ( 12 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
58#define SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_ORDER                                ( 13 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
59#define SCHEDULER_DATA_PARAMETER_IMAGE_ARRAY_SIZE                                   ( 18 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
60#define SCHEDULER_DATA_PARAMETER_IMAGE_NUM_SAMPLES                                  ( 20 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
61#define SCHEDULER_DATA_PARAMETER_IMAGE_NUM_MIP_LEVELS                               ( 27 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
62#define SCHEDULER_DATA_PARAMETER_IMAGE_OBJECT_ID                                    ( 35 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
63#define SCHEDULER_DATA_PARAMETER_IMAGE_SRGB_CHANNEL_ORDER                           ( 39 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
64
65#define DATA_PARAMETER_SAMPLER_ADDRESS_MODE                                         ( 14 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )
66#define DATA_PARAMETER_SAMPLER_NORMALIZED_COORDS                                    ( 15 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )
67#define DATA_PARAMETER_SAMPLER_COORDINATE_SNAP_WA_REQUIRED                          ( 21 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )
68#define SCHEDULER_DATA_PARAMETER_SAMPLER_OBJECT_ID                                  ( 35 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )
69
70//CURBE STUFF, only entries that really needs to be patched
71#define SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT_MASK                               ( 1 << 1 )
72#define SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE_MASK                               ( 1 << 2 )
73#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE_MASK                              ( 1 << 3 )
74#define SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS_MASK                               ( 1 << 4 )
75#define SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS_MASK                               ( 1 << 5 )
76#define SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES_MASK     ( 1 << 8 )
77#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET_MASK                            ( 1 << 16 )
78#define SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS_MASK                          ( 1 << 17 )
79#define SCHEDULER_DATA_PARAMETER_PARENT_EVENT_MASK                                  ( 1 << 22 )
80#define SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE_MASK                      ( 1 << 28 )
81#define SCHEDULER_DATA_PARAMETER_IMAGE_CURBE_ENTRIES                                ( ( ulong ) 1 << SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
82#define SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER                                     ( ( ( ulong ) 1 ) << SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER_SHIFT )
83
84
85#define SCHEDULER_DATA_PARAMETER_SAMPLER_MASK                                       ( ( ( ulong ) 1 ) << SCHEDULER_DATA_PARAMETER_SAMPLER_SHIFT )
86//Error codes
87#define SCHEDULER_CURBE_TOKEN_MISSED                                                10
88#define SCHEDULER_CURBE_ARGUMENTS_SIZE_MISMATCH                                     11
89
90#define CAN_BE_RECLAIMED                                                            123456
91
92#define SCHEDULER_MSF_INITIAL                                                       1
93#define SCHEDULER_MSF_SECOND                                                        2
94
95//Uncomment to enable logging debug data
96//#define    ENABLE_DEBUG_BUFFER    1
97
98#ifdef ENABLE_DEBUG_BUFFER
99//Update DebugDataInfo types in device_enqueue.h and PrintDebugDataBuffer() in cmd_queue_device.cpp
100
101//Flags
102#define    DDB_HAS_DATA_INFO        ( 1 << 0 )
103#define    DDB_SCHEDULER_PROFILING  ( 1 << 1 )
104
105#define    DDB_ALL                  ( 0xffffffff )
106
107#endif
108
109//Turn this to 1 to turn on debug calls, notice that it will cause up to 10 x longer time to build scheduler
110//#define    SCHEDULER_DEBUG_MODE 1
111
112//#define    DISABLE_RESOURCE_RECLAMATION                                                 1
/*
Resource reclamation procedure
1. Move all new command packets from queue_t to qstorage
2. If there is room in storage for the whole queue, reclaim space on the queue
3. Construct the stack based on the new commands added to qstorage
4. Browse the stack to find the next item for execution
5. When an item is taken from the stack and scheduled, reclaim its place in the qstorage buffer
*/
121
122typedef struct
123{
124    uint3           ActualLocalSize;
125    uint3           WalkerDimSize;
126    uint3           WalkerStartPoint;
127} IGIL_WalkerData;
128
129typedef struct
130{
131    uint3           LocalWorkSize;
132    uint3           TotalDimSize;
133    IGIL_WalkerData WalkerArray[ 8 ];
134} IGIL_WalkerEnumeration;
135
136inline void patchDword( __global uint* patchedDword, uint startOffset, uint endOffset, uint value )
137{
138    uint LeftMask  = ALL_BITS_SET_DWORD_MASK >> ( DWORD_SIZE_IN_BITS - endOffset - 1 );
139    uint RightMask = ALL_BITS_SET_DWORD_MASK << ( startOffset );
140    uint CleanMask = ~( RightMask & LeftMask );
141    *patchedDword &= CleanMask;
142    *patchedDword |= ( value << startOffset );
143}
144
145inline __global IGIL_KernelAddressData* IGIL_GetKernelAddressData( __global IGIL_KernelDataHeader* pKernelReflection, uint blockId )
146{
147    return ( __global IGIL_KernelAddressData* ) ( &pKernelReflection->m_data[ blockId ] );
148}
149__global IGIL_KernelData* IGIL_GetKernelData( __global IGIL_KernelDataHeader* pKernelReflection, uint blockId )
150{
151    __global IGIL_KernelAddressData* pKernelAddressData = IGIL_GetKernelAddressData( pKernelReflection, blockId );
152    uint Offset                                         = pKernelAddressData->m_KernelDataOffset;
153    __global char* pKernelReflectionRaw                 = ( __global char * ) pKernelReflection;
154    return ( __global IGIL_KernelData* ) ( &pKernelReflectionRaw[ Offset ] );
155}
156
157
158inline __global IGIL_CommandHeader* TEMP_IGIL_GetCommandHeader( __global IGIL_CommandQueue* q, uint offset )
159{
160    __global uchar *pQueueRaw = (__global uchar *) q;
161
162    __global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + offset );
163
164    return pCommand;
165}
166//Make sure enough command packets are in command queue before calling this function.
167__global IGIL_CommandHeader* TEMP_IGIL_GetNthCommandHeader( __global IGIL_CommandQueue* q, uint initialOffset, uint number )
168{
169    __global uchar *pQueueRaw = (__global uchar *) q;
170
171    __global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + initialOffset );
172    uint Offset = initialOffset;
173    //Traverse queue_t unless nth command packet is found
174    while( number > 0 )
175    {
176        Offset  += pCommand->m_commandSize;
177        pCommand = TEMP_IGIL_GetCommandHeader( q, Offset );
178        number--;
179    }
180
181    return pCommand;
182}
183
184//Make sure enough command packets are in command queue before calling this function.
185uint TEMP_IGIL_GetNthCommandHeaderOffset( __global IGIL_CommandQueue* q, uint initialOffset, uint number )
186{
187    __global uchar *pQueueRaw = (__global uchar *) q;
188
189    __global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + initialOffset );
190    uint Offset = initialOffset;
191    //Traverse queue_t unless nth command packet is found
192    while( number > 0 )
193    {
194        Offset  += pCommand->m_commandSize;
195        pCommand = TEMP_IGIL_GetCommandHeader( q, Offset );
196        number--;
197    }
198
199    return Offset;
200}
201
202__global IGIL_CommandHeader* GetCommandHeaderFromStorage( __global uint* queueStorage, uint offset )
203{
204    __global uchar *pQueueRaw = ( __global uchar * ) queueStorage;
205
206    __global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + offset );
207
208    return pCommand;
209}
210
211inline queue_t TEMP_IGIL_GetQueueT( IGIL_CommandHeader * queuePtr )
212{
213    return __builtin_astype( queuePtr, queue_t );
214}
215
216inline __global IGIL_DeviceEvent* TEMP_IGIL_GetDeviceEvents( __global IGIL_EventPool *pool )
217{
218    return ( __global IGIL_DeviceEvent * )( pool + 1 );
219}
220
221inline __global IGIL_DeviceEvent* TEMP_IGIL_GetDeviceEvent( __global IGIL_EventPool *pool, uint eventId )
222{
223    __global IGIL_DeviceEvent * pEvent = ( __global IGIL_DeviceEvent * )( pool + 1 );
224
225    return ( __global IGIL_DeviceEvent * )( pEvent + eventId );
226}
227bool SetEventState( __global IGIL_EventPool *pool, uint eventId, int state )
228{
229    __global IGIL_DeviceEvent* pDeviceEvent = TEMP_IGIL_GetDeviceEvent( pool, eventId );
230    pDeviceEvent->m_state = state;
231    return true;
232}
233
234void TEMP_IGIL_FreeEvent( clk_event_t event, __global IGIL_EventPool *pool )
235{
236    //Offset into the event data in the pool
237    __global IGIL_DeviceEvent *events = TEMP_IGIL_GetDeviceEvents( pool );
238
239    atomic_xchg( &events[ ( uint )(size_t)__builtin_astype( event, void* ) ].m_state, IGIL_EVENT_UNUSED );
240}
241
242void  IGILLOCAL_MEMCPY_GTOG( __global void* pDst, __global void* pSrc, int numBytes )
243{
244    numBytes = numBytes >> 2;
245    for( int i = 0; i < numBytes; i++ )
246    {
247        ( ( __global uint* )pDst ) [ i ] = ( ( __global int* )pSrc )[ i ];
248    }
249}
250
251//Global memcpy running on all witems possible, make sure it's run from all hw threads.
252void GLOBAL_MEMCPYUINT( __global void* pDst, __global void* pSrc, int numBytes )
253{
254    uint total_local_size   = get_local_size( 0 );
255    uint LoopCtr            = numBytes / ( total_local_size * DWORD_SIZE_IN_BYTES );
256    uint LeftOver           = numBytes % ( total_local_size * DWORD_SIZE_IN_BYTES );
257    uint Lid                = get_local_id( 0 );
258    uint i                  = 0;
259
260    //Main copy
261    for( i = 0; i < LoopCtr; i++ )
262    {
263         ( ( __global uint* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint* )pSrc )[ Lid + total_local_size * i ];
264    }
265    //Copy what's left
266    if( LeftOver != 0 )
267    {
268        if( Lid * DWORD_SIZE_IN_BYTES < LeftOver )
269        {
270            ( ( __global uint* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint* )pSrc )[ Lid + total_local_size * i ];
271        }
272    }
273}
274
275//In SIMD8 to fully use cachelines copy in portions of uint2 , 8 bytes x 8 witems = 64 bytes cacheline size.
276//Global memcpy running on all witems possible, make sure it's run from all hw threads.
277void GLOBAL_MEMCPY( __global void* pDst, __global void* pSrc, int numBytes )
278{
279    //In case I need dword copy use uint version of this function.
280    if( ( numBytes % ( DWORD_SIZE_IN_BYTES * 2 ) ) != 0 )
281    {
282       GLOBAL_MEMCPYUINT( pDst, pSrc, numBytes );
283    }
284    else
285    {
286        uint total_local_size   = get_local_size( 0 );
287        uint LoopCtr            = numBytes / ( total_local_size * DWORD_SIZE_IN_BYTES * 2 );
288        uint LeftOver           = numBytes % ( total_local_size * DWORD_SIZE_IN_BYTES * 2 );
289        uint Lid                = get_local_id( 0 );
290        uint i                  = 0;
291
292        //Main copy
293        for( i = 0; i < LoopCtr; i++ )
294        {
295             ( ( __global uint2* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint2* )pSrc )[ Lid + total_local_size * i ];
296        }
297        //Copy what's left
298        if( LeftOver != 0 )
299        {
300            if( Lid * DWORD_SIZE_IN_BYTES * 2 < LeftOver )
301            {
302                ( ( __global uint2* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint2* )pSrc )[ Lid + total_local_size * i ];
303            }
304        }
305    }
306}
307
308//This works only for 32 bit types
309uint GetNextPowerof2( uint number )
310{
311    number --;
312    number |= number >> 1;
313    number |= number >> 2;
314    number |= number >> 4;
315    number |= number >> 8;
316    number |= number >> 16;
317    number ++;
318    return number;
319}
320
321#define OCLRT_ALIGN( a, b )                 ( ( ( ( a ) % ( b ) ) != 0 ) ?  ( ( a ) - ( ( a ) % ( b ) ) + ( b ) ) : ( a ) )
322#define OCLRT_MAX( a, b )                   ( ( ( a ) > ( b ) ) ?  ( a ) : ( b ) )
323
324#ifndef SCHEDULER_EMULATION
325#include "scheduler_definitions.h"
326#endif
327
328#ifdef ENABLE_DEBUG_BUFFER
329//Adds private uint* data to debug buffer
330//  ddb - global buffer to keep data
331//  src - source data
332//  numberOfElements
333//  localID - WI ID , when 0xffffffff all WI copy data
334//  returns 0 when data was copied, else -1
335int AddToDebugBufferParallel( __global DebugDataBuffer* ddb, uint* pSrc, uint numberOfElements, uint localID )
336{
337    if( ddb->m_flags == 0 )
338    {
339        //All work items in local group copies data
340        if( localID == 0xffffffff )
341        {
342            //Check if there is enough place for new data in ddb for every workitem in local group
343            if( ( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) * get_local_size(0) ) >= numberOfElements )
344            {
345                uint startIndex = atomic_add( &ddb->m_offset, numberOfElements );
346                uint i;
347                int srcPos;
348                for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ )
349                {
350                    ddb->m_data[ i ] = pSrc[ srcPos ];
351                }
352                return 0;
353            }
354        }
355        else
356        {
357            //Check if there is enough place for new data in ddb for one workitem in local group
358            if( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) >= numberOfElements )
359            {
360                if( get_local_id(0) == localID )
361                {
362                    uint startIndex = atomic_add( &ddb->m_offset, numberOfElements );
363                    uint i;
364                    int srcPos;
365                    for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ )
366                    {
367                        ddb->m_data[ i ] = pSrc[ srcPos ];
368                    }
369                    return 0;
370                }
371            }
372        }
373    }
374    return -1;
375}
376
377//Adds private uint to debug buffer
378int AddToDBParallel(__global DebugDataBuffer* ddb, uint pSrc, uint localID)
379{
380    return AddToDebugBufferParallel(ddb, &pSrc, 1, localID);
381}
382
383//Adds global uint* data to debug buffer
384//  ddb - global buffer to keep data
385//  src - source data
386//  numberOfElements
387//  localID - WI ID , when 0xffffffff all WI copy data
388//  returns 0 when data was copied, else -1
389int AddGlobalToDebugBufferParallel( __global DebugDataBuffer* ddb, __global uint* pSrc, uint numberOfElements, uint localID )
390{
391    if( ddb->m_flags == 0 )
392    {
393        //All work items in local group copies data
394        if( localID == 0xffffffff )
395        {
396            //Check if there is enough place for new data in ddb for every workitem in local group
397            if( ( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) * get_local_size(0) ) >= numberOfElements )
398            {
399                uint startIndex = atomic_add( &ddb->m_offset, numberOfElements );
400                uint i;
401                int srcPos;
402                for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ )
403                {
404                    ddb->m_data[ i ] = pSrc[ srcPos ];
405                }
406                return 0;
407            }
408        }
409        else
410        {
411            //Check if there is enough place for new data in ddb for one workitem in local group
412            if( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) >= numberOfElements )
413            {
414                if( get_local_id(0) == localID )
415                {
416                    uint startIndex = atomic_add( &ddb->m_offset, numberOfElements );
417                    uint i;
418                    int srcPos;
419                    for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ )
420                    {
421                        ddb->m_data[ i ] = pSrc[ srcPos ];
422                    }
423                    return 0;
424                }
425            }
426        }
427    }
428    return -1;
429}
430
431
432
433//Adds private uint data to debug buffer
434//  ddb - global buffer to keep data
435//  src - source data
436//  dataType - enum defining added data type
437//  returns 0 when data was copied, else -1
438int AddToDebugBuffer( __global DebugDataBuffer* ddb, __private ulong src, uint dataType, uint localID )
439{
440    if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO )
441    {
442        //All work items in local group copies data
443        if( localID == 0xffffffff )
444        {
445            //Check if there is enough place for new data in ddb
446            if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= 4 * get_local_size( 0 ) ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) )
447            {
448                //Check flags
449                if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO )
450                {
451                    uint startIndex                 = atomic_add( &ddb->m_offset, 1 );
452                    uint dataIndex                  = atomic_sub( &ddb->m_dataInfoTop, ddb->m_dataInfoSize );
453                    uint stackTop                   = atomic_add( &ddb->m_stackTop, 8 );
454
455                    __global uchar* pCharDebugQueue = ( __global uchar * )ddb;
456                    __global uint* dest             = ( __global uint* )&pCharDebugQueue[ stackTop ];
457                    __global DebugDataInfo* debugInfo;
458
459                    dest[ 0 ]                 = ( uint )src & 0xffffffff;
460                    dest[ 1 ]               = ( src & 0xffffffff00000000 ) >> 32;
461                    debugInfo               = ( __global DebugDataInfo* )( &pCharDebugQueue[ dataIndex ] );
462                    debugInfo->m_dataType   = ( DebugDataTypes )dataType;
463                    debugInfo->m_dataSize   = 8;
464                    ddb->m_flags            |= DDB_HAS_DATA_INFO;
465                    return 0;
466                }
467            }
468        }
469        else
470        {
471            //Check if there is enough place for new data in ddb
472            if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= 8 ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) )
473            {
474                if( get_local_id( 0 ) == localID )
475                {
476                    uint startIndex                 = atomic_add( &ddb->m_offset, 1 );
477                    uint dataIndex                  = atomic_sub( &ddb->m_dataInfoTop, ddb->m_dataInfoSize );
478                    uint stackTop                   = atomic_add( &ddb->m_stackTop, 8 );
479
480                    __global uchar* pCharDebugQueue = ( __global uchar * )ddb;
481                    __global uint* dest             = ( __global uint* )&pCharDebugQueue[ stackTop ];
482                    __global DebugDataInfo* debugInfo;
483
484                    dest[ 0 ]                 = ( uint )src & 0xffffffff;
485                    dest[ 1 ]               = ( src & 0xffffffff00000000 ) >> 32;
486                    debugInfo               = ( __global DebugDataInfo* )( &pCharDebugQueue[ dataIndex ] );
487                    debugInfo->m_dataType   = ( DebugDataTypes )dataType;
488                    debugInfo->m_dataSize   = 8;
489                    ddb->m_flags            |= DDB_HAS_DATA_INFO;
490                    return 0;
491                }
492
493            }
494        }
495    }
496    return -1;
497}
498
499//Adds data to debug buffer
500//  ddb - global buffer to keep data
501//  src - source
502//  bytes - number of bytes from src to put into ddb
503//  dataType - enum defining added data type
504//  returns 0 when data was copied, else -1
505int AddGlobalToDebugBuffer( __global DebugDataBuffer* ddb, __global uchar* src, uint bytes, uint dataType )
506{
507    if( get_local_id( 0 ) )
508    {
509        //Check if there is enough place for new data in ddb
510        if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= bytes ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) )
511        {
512            //Check flags
513            if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO )
514            {
515                __global uchar* pCharDebugQueue = ( __global uchar * )ddb;
516                __global DebugDataInfo* debugInfo;
517                IGILLOCAL_MEMCPY_GTOG( ( &pCharDebugQueue[ ddb->m_stackTop ] ), ( src ), ( int )bytes );
518                debugInfo = ( __global DebugDataInfo* )( &pCharDebugQueue[ ddb->m_dataInfoTop ] );
519                debugInfo->m_dataType = ( DebugDataTypes )dataType;
520                debugInfo->m_dataSize = bytes;
521
522                ddb->m_dataInfoTop  = ddb->m_dataInfoTop - ddb->m_dataInfoSize;
523                ddb->m_stackTop     = ddb->m_stackTop + bytes;
524                ddb->m_offset       = ddb->m_offset + ( bytes / 4 );
525                ddb->m_flags        |= DDB_HAS_DATA_INFO;
526                return 0;
527            }
528        }
529    }
530    return -1;
531}
532
533int AddGlobalToDebugBufferAllIds( __global DebugDataBuffer* ddb, __global uchar* src, uint bytes, uint dataType, uint localID )
534{
535    if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO )
536    {
537        //Check if there is enough place for new data in ddb
538        if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= bytes ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) )
539        {
540            //Check flags
541            if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO )
542            {
543                __global uchar* pCharDebugQueue = ( __global uchar * )ddb;
544                __global DebugDataInfo* debugInfo;
545                IGILLOCAL_MEMCPY_GTOG( ( &pCharDebugQueue[ ddb->m_stackTop ] ), ( src ), ( int )bytes );
546                debugInfo = ( __global DebugDataInfo* )( &pCharDebugQueue[ ddb->m_dataInfoTop ] );
547                debugInfo->m_dataType = ( DebugDataTypes )dataType;
548                debugInfo->m_dataSize = bytes;
549
550                ddb->m_dataInfoTop  = ddb->m_dataInfoTop - ddb->m_dataInfoSize;
551                ddb->m_stackTop     = ddb->m_stackTop + bytes;
552                ddb->m_offset       = ddb->m_offset + ( bytes / 4 );
553                ddb->m_flags        |= DDB_HAS_DATA_INFO;
554                return 0;
555            }
556        }
557    }
558    return -1;
559}
560
561#endif
562
563#define MAX_SLB_OFFSET ( SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE * SECOND_LEVEL_BUFFER_NUMBER_OF_ENQUEUES )
564
565#ifndef SCHEDULER_EMULATION
566#include "scheduler_builtin_kernel.inl"
567#endif
568
569//SOME COMMON CODE FUNCTIONS
570//COMMON CODE STARTS HERE
571//Not thread safe - make sure it's called in thread safe fashion.
572
573void patchIDData( __global char* dsh,
574                  uint blockId,
575                  uint numberOfHwThreads,
576                  uint slmSize )
577{
578    __global char* DSHIData                      = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( blockId + 1 ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) );
579    __global uint* DSHIntData                    = ( __global uint* )( DSHIData );
580    //Barrier enable is pre-patched on the host.
581    patchDword( ( &DSHIntData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 0, INTERFACE_DESCRIPTOR_HWTHREADS_UPPER_BIT, numberOfHwThreads );
582
583    //Patch SLM.
584    uint SLMPatchValue = GetPatchValueForSLMSize( slmSize );
585    patchDword( ( &DSHIntData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 16, 20, SLMPatchValue );
586}
/*
This is how it works:
When the primary batch is constructed, the first IDT table is also constructed for all blocks, laid out as follows:
[0] - parent id
[1 .. x] - block ids
[last aligned] - scheduler

When we enter the SLB we forget about the first IDT and point all interface descriptor loads at the scheduler entry, which was last in the first IDT, so that it becomes first in the new IDT.

This way we can copy interface descriptors for blocks from the first IDT and assign interface descriptors dynamically in the scheduler.
*/
598
599void CopyAndPatchIDData( __global char* dsh,
600                         uint blockId,
601                         uint numberOfHwThreads,
602                         uint slmSize,
603                         uint interfaceDescriptorOffset,
604                         uint blockStartId )
605{
606    __global char* DSHIData                      = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( blockId + blockStartId ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) );
607    __global uint* DSHIntData                    = ( __global uint* )( DSHIData );
608
609    //Copy to ID InterfaceDescriptorOffset
610    __global char* DSHDestIData                  = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( IDT_BREAKDOWN + interfaceDescriptorOffset ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) );
611    __global uint* DSHDestIntData                = ( __global uint* )( DSHDestIData );
612    __global uint* DSHDestIntStartData           = DSHDestIntData;
613
614    for( int i = 0; i < ( SIZEOF_INTERFACE_DESCRIPTOR_DATA / 4 ); i++ )
615    {
616        DSHDestIntData[ i ] = DSHIntData[ i ];
617    }
618
619    //Barrier enable is pre-patched on the host.
620    patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 0, INTERFACE_DESCRIPTOR_HWTHREADS_UPPER_BIT, numberOfHwThreads );
621
622    //Patch SLM.
623    uint SLMPatchValue = GetPatchValueForSLMSize( slmSize );
624    patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 16, 20, SLMPatchValue );
625}
626
627void CopyAndPatchIDData20( __global char* dsh,
628                           uint blockId,
629                           uint numberOfHwThreads,
630                           uint slmSize,
631                           uint interfaceDescriptorOffset,
632                           uint blockStartId,
633                           uint bToffset,
634                           uint dshOffset,
635                           uint numOfSamplers
636#ifdef ENABLE_DEBUG_BUFFER
637                           , __global DebugDataBuffer* DebugQueue
638#endif
639                           )
640{
641    EMULATION_ENTER_FUNCTION( );
642
643    __global char* DSHIData                      = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( blockId + blockStartId ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) );
644    __global uint* DSHIntData                    = ( __global uint* )( DSHIData );
645
646    //Copy to ID InterfaceDescriptorOffset
647    __global char* DSHDestIData                  = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( IDT_BREAKDOWN + interfaceDescriptorOffset ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) );
648    __global uint* DSHDestIntData                = ( __global uint* )( DSHDestIData );
649    __global uint* DSHDestIntStartData           = DSHDestIntData;
650
651    for( int i = 0; i < ( SIZEOF_INTERFACE_DESCRIPTOR_DATA / 4 ); i++ )
652    {
653        DSHDestIntData[ i ] = DSHIntData[ i ];
654    }
655
656    //Barrier enable is pre-patched on the host.
657    patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 0, INTERFACE_DESCRIPTOR_HWTHREADS_UPPER_BIT, numberOfHwThreads );
658
659    //Patch BT offset
660    patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_BINDING_TABLE_POINTER_DWORD ] ), 5, 15, ( bToffset >> 5 ) );
661
662    //Patch SLM.
663    uint PatchValue = GetPatchValueForSLMSize( slmSize );
664    patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 16, 20, PatchValue );
665
666    PatchValue = ( DSHDestIntStartData[ INTERFACE_DESCRIPTOR_SAMPLER_STATE_TABLE_DWORD ] & 0xffffffe0 ) + ( dshOffset );
667    patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_SAMPLER_STATE_TABLE_DWORD ] ), 5, 31, ( ( PatchValue ) >> 5 ) );
668
669    //Samplers in multiple of 4
670    numOfSamplers = ( numOfSamplers + 3 ) / 4;
671    patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_SAMPLER_STATE_TABLE_DWORD ] ), 2, 4, numOfSamplers );
672}
673
674
void patchGpGpuWalker(
    uint secondLevelBatchOffset,
    __global uint* secondaryBatchBuffer,
    uint interfaceDescriptorOffset,
    uint simdSize,
    uint totalLocalWorkSize,
    uint3 dimSize,
    uint3 startPoint,
    uint numberOfHwThreadsPerWg,
    uint indirectPayloadSize,
    uint ioHoffset )
{
    EMULATION_ENTER_FUNCTION( );

    //SlbOffset is expressed in bytes; command patching works on dword indices
    uint CmdStart = secondLevelBatchOffset / DWORD_SIZE_IN_BYTES;

    //INTERFACE DESCRIPTOR id for this GPGPU_WALKER
    patchDword( &( secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_INTERFACE_DESCRIPTOR_ID_OFFSET ] ),
                0, 5, interfaceDescriptorOffset );

    //Thread width for GPGPU_WALKER, encoded as ( hw threads per wg - 1 )
    patchDword( &( secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_THREAD_WIDTH_DWORD ] ),
                0, 5, numberOfHwThreadsPerWg - 1 );

    //SIMD size encoding, anything other than 8 / 16 falls back to SIMD32
    uint SimdEncoding;
    switch( simdSize )
    {
        case 8:
            SimdEncoding = SIMD8;
            break;
        case 16:
            SimdEncoding = SIMD16;
            break;
        default:
            SimdEncoding = SIMD32;
            break;
    }
    patchDword( &( secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_SIMDSIZE_DWORD ] ),
                30, 31, SimdEncoding );

    //Group counts and group-id start points for all three dimensions
    secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_XDIM_DWORD ]       = dimSize.x;
    secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_GROUP_ID_START_X ] = startPoint.x;
    secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_YDIM_DWORD ]       = dimSize.y;
    secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_GROUP_ID_START_Y ] = startPoint.y;
    secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_ZDIM_DWORD ]       = dimSize.z;
    secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_GROUP_ID_START_Z ] = startPoint.z;

    //XMASK: execution mask for the rightmost (partial) row; all-ones when the
    //local work size is a multiple of the simd size
    uint XMask = ( 1 << ( totalLocalWorkSize % simdSize ) ) - 1;
    if( XMask == 0 )
    {
        XMask = ~0;
    }
    secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_XMASK_DWORD ] = XMask;

    //YMASK: always fully enabled
    secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_YMASK_DWORD ] = ~0;

    //Indirect payload size and its offset in the indirect object heap
    patchDword( &( secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_INDIRECT_DATA_LENGTH_OFFSET ] ),
                0, 16, indirectPayloadSize );
    patchDword( &( secondaryBatchBuffer[ CmdStart + GPGPU_WALKER_INDIRECT_START_ADDRESS_OFFSET ] ),
                0, 31, ioHoffset );
}
751
int PatchMediaStateFlush(
    uint secondLevelBatchOffset,
    __global uint* secondaryBatchBuffer,
    uint interfaceDescriptorOffset,
    uint msfNumber )
{
    //SlbOffset is expressed in bytes; convert to a dword index for patching
    uint DwordBase = secondLevelBatchOffset / DWORD_SIZE_IN_BYTES;
    uint DescriptorDword;

    //Select which MEDIA_STATE_FLUSH instance to patch
    if( msfNumber == SCHEDULER_MSF_INITIAL )
    {
        DescriptorDword = MEDIA_STATE_FLUSH_INITIAL_INTERFACE_DESCRIPTOR_OFFSET;
    }
    else if( msfNumber == SCHEDULER_MSF_SECOND )
    {
        DescriptorDword = MEDIA_STATE_FLUSH_INTERFACE_DESCRIPTOR_OFFSET;
    }
    else
    {
        //Unknown MSF slot - nothing to patch
        return -1;
    }

    patchDword( &( secondaryBatchBuffer[ DwordBase + DescriptorDword ] ), 0, 5, interfaceDescriptorOffset );

    return 0;
}
778
779#if defined WA_LRI_COMMANDS_EXIST
void PatchMiLoadRegisterImm(
    uint secondLevelBatchOffset,
    __global uint* secondaryBatchBuffer,
    uint enqueueOffset,
    uint registerAddress,
    uint value )
{
    //SlbOffset is expressed in bytes; combine with the enqueue offset to get
    //the dword index of this LRI command
    uint LriStart = ( secondLevelBatchOffset / DWORD_SIZE_IN_BYTES ) + enqueueOffset;

    secondaryBatchBuffer[ LriStart ] = OCLRT_LOAD_REGISTER_IMM_CMD;
    //Register address occupies bits 2:22 in dword granularity
    patchDword( &( secondaryBatchBuffer[ LriStart + IMM_LOAD_REGISTER_ADDRESS_DWORD_OFFSET ] ), 2, 22, registerAddress >> 2 );
    secondaryBatchBuffer[ LriStart + IMM_LOAD_REGISTER_VALUE_DWORD_OFFSET ] = value;
}
794
void AddMiLoadRegisterImm(
    __global uint* secondaryBatchBuffer,
    __private uint* dwordOffset,
    uint value )
{
    //Work on a local cursor, write the advanced position back once at the end
    uint Cursor = *dwordOffset;

    //MI_LOAD_REGISTER_IMM header
    secondaryBatchBuffer[ Cursor ] = OCLRT_LOAD_REGISTER_IMM_CMD;
    //Register address dword: clear it, then patch the address into bits 2:22
    secondaryBatchBuffer[ Cursor + 1 ] = 0;
    patchDword( &( secondaryBatchBuffer[ Cursor + 1 ] ), 2, 22, CTXT_PREMP_DBG_ADDRESS_VALUE >> 2 );
    //Payload: CTXT_PREMP_ON_MI_ARB_CHECK_ONLY or CTXT_PREMP_DEFAULT_VALUE
    secondaryBatchBuffer[ Cursor + 2 ] = value;

    *dwordOffset = Cursor + 3;
}
808
void SetDisablePreemptionRegister(
    uint secondLevelBatchOffset,
    __global uint* secondaryBatchBuffer )
{
    //Program the LRI that restricts preemption to MI_ARB_CHECK boundaries only
    PatchMiLoadRegisterImm( secondLevelBatchOffset, secondaryBatchBuffer,
                            IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET,
                            CTXT_PREMP_DBG_ADDRESS_VALUE, CTXT_PREMP_ON_MI_ARB_CHECK_ONLY );
}
819
void SetEnablePreemptionRegister(
    uint secondLevelBatchOffset,
    __global uint* secondaryBatchBuffer )
{
    //Program the LRI that restores the default preemption behavior
    PatchMiLoadRegisterImm( secondLevelBatchOffset, secondaryBatchBuffer,
                            IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET,
                            CTXT_PREMP_DBG_ADDRESS_VALUE, CTXT_PREMP_DEFAULT_VALUE );
}
830
void NoopPreemptionCommand(
    uint secondLevelBatchOffset,
    uint cmdOffset,
    __global uint* secondaryBatchBuffer )
{
    //Overwrite the whole LRI command space with zero dwords (MI_NOOP)
    uint Start = cmdOffset + secondLevelBatchOffset / DWORD_SIZE_IN_BYTES;
    uint End   = Start + OCLRT_IMM_LOAD_REGISTER_CMD_DEVICE_CMD_DWORD_OFFSET;

    for( uint Slot = Start; Slot < End; Slot++ )
    {
        secondaryBatchBuffer[ Slot ] = 0;
    }
}
842#endif //WA_LRI_COMMANDS_EXIST
843
844
845//PQueue is needed for SLBOffset
//Programs the scheduler's own commands inside the SLB: one GPGPU_WALKER that
//spans all scheduler hw groups, the two MEDIA_STATE_FLUSHes around it, and the
//preemption-workaround LRI commands (programmed or nooped as required).
//pQueue is needed for the scheduler DSH/constant-buffer offsets.
void AddCmdsInSLBforScheduler20Parallel( uint slbOffset,
                                        __global IGIL_CommandQueue* pQueue,
                                        __global uint * secondaryBatchBuffer,
                                        __global char * dsh )
{
    EMULATION_ENTER_FUNCTION( );
#ifdef SCHEDULER_EMULATION
    uint3 StartPoint = { 0, 0, 0 };
    uint3 DimSize = { get_num_groups( 0 ), 1, 1 };
#else
    uint3 StartPoint = ( uint3 )( 0 );
    uint3 DimSize = ( uint3 )( get_num_groups( 0 ), 1, 1 );
#endif
    patchGpGpuWalker( slbOffset,
                      secondaryBatchBuffer,
                      0,
                      PARALLEL_SCHEDULER_COMPILATION_SIZE_20,
                      get_local_size(0),
                      DimSize,
                      StartPoint,
                      PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20,
                      SIZEOF_3GRFS * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 + pQueue->m_controls.m_SchedulerConstantBufferSize,
                      pQueue->m_controls.m_SchedulerDSHOffset );

    PatchMediaStateFlush( slbOffset, secondaryBatchBuffer, 0, SCHEDULER_MSF_INITIAL );
    PatchMediaStateFlush( slbOffset, secondaryBatchBuffer, 0, SCHEDULER_MSF_SECOND );

//When commands exists and scheduler does not require preemption off, noop the commands space
#if defined WA_LRI_COMMANDS_EXIST
#if defined WA_SCHEDULER_PREEMPTION
    if( pQueue->m_controls.m_EventTimestampAddress == 0u )
    {
        SetEnablePreemptionRegister( slbOffset, secondaryBatchBuffer );
        SetDisablePreemptionRegister( slbOffset, secondaryBatchBuffer );
    }
    else
    {
        NoopPreemptionCommand( slbOffset, IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
        NoopPreemptionCommand( slbOffset, IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
    }
#else
    //This is case, where LRI preemption is not required around scheduler WALKERs, but space for LRI commands exists, make sure they are nooped then
    //Fix: use the actual parameter name 'slbOffset' - the previous 'SLBOffset' did not compile when this path was enabled
    NoopPreemptionCommand( slbOffset, IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
    NoopPreemptionCommand( slbOffset, IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
#endif //WA_SCHEDULER_PREEMPTION
#endif //WA_LRI_COMMANDS_EXIST
}
893
//Generates the x/y/z local-id payload rows in the dsh, one lane per work item.
int generateLocalIDSParallel20(
    __global char* dsh,
    uint3 localSize,
    uint hwThreads,
    uint simdSize )
{
    //SIMD32 payloads need two 16-lane GRF rows per dimension, otherwise one
    uint RowsPerDim = ( simdSize == 32 ) ? 2 : 1;

    //Only the first simdSize work items produce ids; when generating a SIMD8
    //payload with 16 work items, half of them idle
    if( get_local_id( 0 ) < simdSize )
    {
        for( uint Thread = 0; Thread < hwThreads; Thread++ )
        {
            for( uint Row = 0; Row < RowsPerDim; Row++ )
            {
                //Linear work-item id this lane is responsible for
                uint Flat = get_local_id( 0 ) + Thread * simdSize + 16 * Row;

                uint IdX = Flat % localSize.x;
                uint IdY = ( Flat / localSize.x ) % localSize.y;
                uint IdZ = Flat / ( localSize.x * localSize.y ); //z needs no modulo

                //Byte offset of this lane's slot in the X row; Y and Z rows
                //follow at GRF_SIZE * RowsPerDim strides
                uint Base = get_local_id( 0 ) * 2 + Thread * GRF_SIZE * 3 * RowsPerDim + Row * GRF_SIZE;

                *( __global ushort * )( dsh + Base )                             = ( ushort )IdX;
                *( __global ushort * )( dsh + Base + GRF_SIZE * RowsPerDim )     = ( ushort )IdY;
                *( __global ushort * )( dsh + Base + 2 * GRF_SIZE * RowsPerDim ) = ( ushort )IdZ;
            }
        }
    }
    return 0;
}
931
932//Function generate local ids.
933//SIMD16 version
934int generateLocalIDSsimd16(
935    __global char* dsh,
936    uint3 localSize,
937    uint hwThreads)
938{
939    typedef union
940    {
941        ushort16 vectors;
942        ushort varray[ 16 ];
943    }vectorUnion;
944
945    __private vectorUnion LidX;
946    __private vectorUnion LidY;
947    __private vectorUnion LidZ;
948
949    __private ushort currX = 0;
950    __private ushort currY = 0;
951    __private ushort currZ = 0;
952
953    //Assuming full load of hw thread , remainder done separately
954    for(uint it = 0; it < hwThreads; it++ )
955    {
956        //This will be unrolled by compiler
957        for(uint x = 0; x < 16; x++ )
958        {
959            LidX.varray[ x ] = currX++;
960            LidY.varray[ x ] = currY;
961            LidZ.varray[ x ] = currZ;
962
963            if( currX == localSize.x )
964            {
965                currX = 0;
966                currY++;
967            }
968
969            if( currY == localSize.y )
970            {
971                currY = 0;
972                currZ++;
973            }
974        }
975
976        *( __global ushort16 * )( dsh + it * GRF_SIZE * 3 )                          = LidX.vectors;
977        *( __global ushort16 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE )               = LidY.vectors;
978        *( __global ushort16 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE + GRF_SIZE )    = LidZ.vectors;
979
980    }
981
982    return 0;
983}
984
985//Function generate local ids.
986//SIMD8 version
987int generateLocalIDSsimd8(
988    __global char* dsh,
989    uint3 localSize,
990    uint hwThreads)
991{
992    typedef union
993    {
994        ushort8 vectors;
995        ushort varray[ 8 ];
996    }vectorUnion;
997
998    __private vectorUnion LidX;
999    __private vectorUnion LidY;
1000    __private vectorUnion LidZ;
1001
1002    __private ushort currX = 0;
1003    __private ushort currY = 0;
1004    __private ushort currZ = 0;
1005
1006    //Assuming full load of hw thread , remainder done separately
1007    for(uint it = 0; it < hwThreads; it++ )
1008    {
1009        //This will be unrolled by compiler
1010        for(uint x = 0; x < 8; x++ )
1011        {
1012            LidX.varray[ x ] = currX++;
1013            LidY.varray[ x ] = currY;
1014            LidZ.varray[ x ] = currZ;
1015
1016            if( currX == localSize.x )
1017            {
1018                currX = 0;
1019                currY++;
1020            }
1021
1022            if( currY == localSize.y )
1023            {
1024                currY = 0;
1025                currZ++;
1026            }
1027        }
1028
1029        *( __global ushort8 * )( dsh + it * GRF_SIZE * 3 )                          = LidX.vectors;
1030        *( __global ushort8 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE )               = LidY.vectors;
1031        *( __global ushort8 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE + GRF_SIZE )    = LidZ.vectors;
1032    }
1033
1034    return 0;
1035}
1036
1037//Function patches a curbe parametr , this version of function supports only these curbe tokens that may appear only once
//Patches a single-occurrence curbe token: writes 'value' at the token's patch
//offset in the dsh and returns the index of the next token.
//Returns -1 in debug mode when the token under the cursor has the wrong type.
int PatchDSH1Token( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
                    uint value )
{
    EMULATION_ENTER_FUNCTION( );

    uint PatchOffset;
#if SCHEDULER_DEBUG_MODE
    //Verify the token under the cursor is the one we were asked to patch.
    //Fix: use the parameters 'currentIndex'/'tokenType' - the previous
    //'CurrentIndex'/'TokenType' were undefined and broke the debug build.
    if( pKernelCurbeParams[ currentIndex ].m_parameterType != tokenType )
    {
        return -1;
    }
#endif
    PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
    *( __global uint * )( &pDsh[ PatchOffset ] ) = value;

    currentIndex++;
    return currentIndex;
}
1057
//Patches the SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES curbe tokens: the first
//token is patched with 0, each following token of the same type gets the
//aligned running sum of the local-memory argument sizes taken from pCommand.
//Returns the index of the first token after the patched run.
int PatchLocalMemEntities( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
                           __global IGIL_CommandHeader* pCommand )
{
    uint PatchOffset;
#if SCHEDULER_DEBUG_MODE
    //Verify the token under the cursor is the one we were asked to patch.
    //Fix: use the parameters 'currentIndex'/'tokenType' - the previous
    //'CurrentIndex'/'TokenType' were undefined and broke the debug build.
    if( pKernelCurbeParams[ currentIndex ].m_parameterType != tokenType )
    {
        return -1;
    }
#endif
    //First patch is with 0
    PatchOffset  = pKernelCurbeParams[ currentIndex ].m_patchOffset;

    //SUM_OF_LOCAL_MEMORY_KERNEL_ARGS can be a 4 or 8 byte patch
    if( pKernelCurbeParams[currentIndex].m_parameterSize == sizeof( ulong ) )
    {
        *( __global ulong * )( &pDsh[PatchOffset] ) = 0;
    }
    else
    {
        *( __global uint * )( &pDsh[ PatchOffset ] ) = 0;
    }

    currentIndex++;
    uint Alignment;
    uint iter = 0;
    uint CurrentSum = 0;
    uint CurrentValue;
    //For each global captured there will be uint with index and ulong with address.
    uint GlobalPointersSize = ( pCommand->m_numGlobalCapturedBuffer * ( sizeof( ulong ) +  sizeof( uint ) ) ) / sizeof( uint );

    //Local-memory sizes live after the dependencies, scalar args and global pointers
    __global uint* pLocalMemSizes = &pCommand->m_data[ pCommand->m_numDependencies + pCommand->m_numScalarArguments + GlobalPointersSize ];
    //Patch every subsequent surface of the same token type
    while( pKernelCurbeParams[ currentIndex ].m_parameterType == tokenType )
    {
        PatchOffset  = pKernelCurbeParams[ currentIndex ].m_patchOffset;

        //Value needs to be aligned to the value stored in sourceoffset
        Alignment = OCLRT_MAX( DWORD_SIZE_IN_BYTES, pKernelCurbeParams[ currentIndex ].m_sourceOffset );

        CurrentValue = pLocalMemSizes[ iter ];
        CurrentValue = OCLRT_ALIGN( CurrentValue, Alignment );
        CurrentSum   += CurrentValue;

        //SUM_OF_LOCAL_MEMORY_KERNEL_ARGS can be a 4 or 8 byte patch
        if( pKernelCurbeParams[currentIndex].m_parameterSize == sizeof( ulong ) )
        {
            *( __global ulong * )( &pDsh[PatchOffset] ) = ( ulong )CurrentSum;
        }
        else
        {
            *( __global uint * )( &pDsh[ PatchOffset ] ) = CurrentSum;
        }

        currentIndex++;
        iter++;
    }
    return currentIndex;
}
1119
1120//Function patches a curbe parametr , this version of function supports only these curbe tokens that may appear only once
//Parallel variant of the single-token curbe patch: exactly one lane performs
//the store, every lane advances the token cursor by one.
int PatchDSH1TokenParallel20( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
                              uint value )
{
    EMULATION_ENTER_FUNCTION( );

    if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 )
    {
        uint PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
        *( __global uint * )( &pDsh[ PatchOffset ] ) = value;
    }
    return currentIndex + 1;
}
1135
1136//Function patches a curbe parametr, this version of function works on 3d curbe tokens
1137//It assumes that at least 3 tokens exists, then checks if 3 additional patches are needed
1138int PatchDSH6TokensParallel20( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
1139                               uint value1, uint value2, uint value3 )
1140{
1141    EMULATION_ENTER_FUNCTION( );
1142
1143    uint PatchOffset, SourceOffset;
1144    uint WorkingOffset;
1145    uint ShiftSize;
1146
1147    //Check if we patch 3 or 6 curbe tokens
1148    if( pKernelCurbeParams[ currentIndex + 3 ].m_parameterType == tokenType )
1149    {
1150        ShiftSize = 6;
1151    }
1152    else
1153    {
1154        ShiftSize = 3;
1155    }
1156
1157    if( get_local_id( 0 ) < PARALLEL_SCHEDULER_COMPILATION_SIZE_20 + ShiftSize )
1158    {
1159        WorkingOffset = currentIndex + get_local_id( 0 ) - PARALLEL_SCHEDULER_COMPILATION_SIZE_20;
1160        PatchOffset   = pKernelCurbeParams[ WorkingOffset ].m_patchOffset;
1161        SourceOffset  = pKernelCurbeParams[ WorkingOffset ].m_sourceOffset;
1162
1163        if( SourceOffset == 0 )
1164        {
1165            *( __global uint * )( &pDsh[ PatchOffset ] ) = value1;
1166        }
1167        else if( SourceOffset == 4 )
1168        {
1169            *( __global uint * )( &pDsh[ PatchOffset ] ) = value2;
1170        }
1171        else if( SourceOffset == 8 )
1172        {
1173            *( __global uint * )( &pDsh[ PatchOffset ] ) = value3;
1174        }
1175    }
1176
1177    currentIndex += ShiftSize;
1178    return currentIndex;
1179}
1180
//Patches the LOCAL_WORK_SIZE curbe tokens. The first triple (always present,
//used to compute global_id) is patched with the enqueued local sizes; an
//optional second triple (present when get_local_size is used in the kernel)
//is patched with the cut local sizes. Source offset 0/4/8 selects x/y/z.
int PatchLocalWorkSizes( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
                         uint enqLocalX, uint enqLocalY, uint enqLocalZ, uint cutLocalX, uint cutLocalY, uint cutLocalZ )
{
    EMULATION_ENTER_FUNCTION( );

    for( uint Token = 0; Token < 3; Token++ )
    {
        uint PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;

        switch( pKernelCurbeParams[ currentIndex ].m_sourceOffset )
        {
            case 0:
                *( __global uint * )( &pDsh[ PatchOffset ] ) = enqLocalX;
                break;
            case 4:
                *( __global uint * )( &pDsh[ PatchOffset ] ) = enqLocalY;
                break;
            case 8:
                *( __global uint * )( &pDsh[ PatchOffset ] ) = enqLocalZ;
                break;
        }
        currentIndex++;
    }

    //Second triple present only when the kernel reads get_local_size( )
    if( pKernelCurbeParams[ currentIndex ].m_parameterType == tokenType )
    {
        for( uint Token = 0; Token < 3; Token++ )
        {
            uint PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;

            switch( pKernelCurbeParams[ currentIndex ].m_sourceOffset )
            {
                case 0:
                    *( __global uint * )( &pDsh[ PatchOffset ] ) = cutLocalX;
                    break;
                case 4:
                    *( __global uint * )( &pDsh[ PatchOffset ] ) = cutLocalY;
                    break;
                case 8:
                    *( __global uint * )( &pDsh[ PatchOffset ] ) = cutLocalZ;
                    break;
            }
            currentIndex++;
        }
    }
    return currentIndex;
}
1233
1234//Function patches a curbe parametr, this version of function works on 3d curbe tokens
1235//It assumes that at least 3 tokens exists, then checks if 3 additional patches are needed
1236int PatchLocalWorkSizesParallel( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
1237                                 uint enqLocalX, uint enqLocalY, uint enqLocalZ, uint cutLocalX, uint cutLocalY, uint cutLocalZ )
1238{
1239    EMULATION_ENTER_FUNCTION( );
1240
1241    uint ShiftSize;
1242
1243    //Check if we patch 3 or 6 curbe tokens
1244    if( pKernelCurbeParams[ currentIndex + 3 ].m_parameterType == tokenType )
1245    {
1246        ShiftSize = 6;
1247    }
1248    else
1249    {
1250        ShiftSize = 3;
1251    }
1252
1253    //Use single threaded version
1254    if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 )
1255    {
1256        PatchLocalWorkSizes( currentIndex, SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE, pKernelCurbeParams, pDsh, enqLocalX, enqLocalY, enqLocalZ, cutLocalX, cutLocalY, cutLocalZ );
1257    }
1258
1259    currentIndex += ShiftSize;
1260    return currentIndex;
1261}
1262
1263//Function patches a curbe parametr, this version of function works on 3d curbe tokens
1264//It assumes that at least 3 tokens exists, then checks if 3 additional patches are needed
1265int PatchDSH6Tokens( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
1266                     uint value1, uint value2, uint value3 )
1267{
1268    EMULATION_ENTER_FUNCTION( );
1269
1270    uint PatchOffset, SourceOffset;
1271#if SCHEDULER_DEBUG_MODE
1272    //If we are here it means that mask is ok and there are at least 3 curbe tokens that needs to be patched, do it right away
1273    if( pKernelCurbeParams[ CurrentIndex ].m_parameterType != TokenType )
1274    {
1275        return -1;
1276    }
1277#endif
1278    for( uint it = 0; it < 3; it++ )
1279    {
1280        PatchOffset  = pKernelCurbeParams[ currentIndex ].m_patchOffset;
1281        SourceOffset = pKernelCurbeParams[ currentIndex ].m_sourceOffset;
1282
1283        if( SourceOffset == 0 )
1284        {
1285            *( __global uint * )( &pDsh[ PatchOffset ] ) = value1;
1286        }
1287        else if( SourceOffset == 4 )
1288        {
1289            *( __global uint * )( &pDsh[ PatchOffset ] ) = value2;
1290        }
1291        else if( SourceOffset == 8 )
1292        {
1293            *( __global uint * )( &pDsh[ PatchOffset ] ) = value3;
1294        }
1295        currentIndex++;
1296    }
1297    //Check if there are 3 more.
1298    if( pKernelCurbeParams[ currentIndex ].m_parameterType == tokenType )
1299    {
1300        for( uint it = 0; it < 3; it++ )
1301        {
1302            PatchOffset  = pKernelCurbeParams[ currentIndex ].m_patchOffset;
1303            SourceOffset = pKernelCurbeParams[ currentIndex ].m_sourceOffset;
1304
1305            if( SourceOffset == 0 )
1306            {
1307                *( __global uint * )( &pDsh[ PatchOffset ] ) = value1;
1308            }
1309            else if( SourceOffset == 4 )
1310            {
1311                *( __global uint * )( &pDsh[ PatchOffset ] ) = value2;
1312            }
1313            else if( SourceOffset == 8 )
1314            {
1315                *( __global uint * )( &pDsh[ PatchOffset ] ) = value3;
1316            }
1317            currentIndex++;
1318        }
1319    }
1320
1321    return currentIndex;
1322}
1323//Common code
1324
//Returns a byte pointer 'offset' bytes past the kernel reflection header base.
inline __global char* GetPtrToCurbeData( uint offset, __global IGIL_KernelDataHeader * pKernelReflection )
{
    return ( ( __global char * )pKernelReflection ) + offset;
}
1330
//Returns a byte pointer 'offset' bytes past the kernel reflection header base.
__global char* GetPtrToKernelReflectionOffset( uint offset, __global IGIL_KernelDataHeader * pKernelReflection )
{
    return ( ( __global char * )pKernelReflection ) + offset;
}
1336
//Fills WalkerArray[ walkerID ] of pWalkerEnumData with one walker description:
//per-dimension actual local size, start point, and dim size.
inline void SetWalkerArrayEntry( __local IGIL_WalkerEnumeration* pWalkerEnumData,
                                 uint walkerID,
                                 uint localX, uint localY, uint localZ,
                                 uint startX, uint startY, uint startZ,
                                 uint dimX,   uint dimY,   uint dimZ )
{
    pWalkerEnumData->WalkerArray[ walkerID ].ActualLocalSize.x  = localX;
    pWalkerEnumData->WalkerArray[ walkerID ].ActualLocalSize.y  = localY;
    pWalkerEnumData->WalkerArray[ walkerID ].ActualLocalSize.z  = localZ;

    pWalkerEnumData->WalkerArray[ walkerID ].WalkerStartPoint.x = startX;
    pWalkerEnumData->WalkerArray[ walkerID ].WalkerStartPoint.y = startY;
    pWalkerEnumData->WalkerArray[ walkerID ].WalkerStartPoint.z = startZ;

    pWalkerEnumData->WalkerArray[ walkerID ].WalkerDimSize.x    = dimX;
    pWalkerEnumData->WalkerArray[ walkerID ].WalkerDimSize.y    = dimY;
    pWalkerEnumData->WalkerArray[ walkerID ].WalkerDimSize.z    = dimZ;
}

//Enumerates the walkers needed to cover an NDRange whose global size is not a
//multiple of the local size: walker 0 covers the main body, and one extra
//walker is added for each non-empty edge/corner remainder region (up to 8 for
//a 3d range). TotalDimSize grows by one group per dimension with a remainder.
//Results are written into pWalkerEnumData; *pWalkerCount receives the count.
//NOTE(review): globalSizes is unused here; kept for interface compatibility.
void InitWalkerDataParallel( __local IGIL_WalkerEnumeration* pWalkerEnumData,
                             uint workDim,
                             uint* pWalkerCount,
                             uint3 edgeArray,
                             uint3 globalDim,
                             uint3 globalSizes,
                             uint3 localSizes )
{
    EMULATION_ENTER_FUNCTION( );

    pWalkerEnumData->TotalDimSize.x = globalDim.x;
    pWalkerEnumData->TotalDimSize.y = globalDim.y;
    pWalkerEnumData->TotalDimSize.z = globalDim.z;

    //Walker 0: main body with the full local size
    SetWalkerArrayEntry( pWalkerEnumData, 0,
                         localSizes.x, localSizes.y, localSizes.z,
                         0, 0, 0,
                         globalDim.x, globalDim.y, globalDim.z );

    uint WalkerCount = 1;

    //X edge: remainder in x, full local size in y/z
    if( edgeArray.x != 0 )
    {
        pWalkerEnumData->TotalDimSize.x++;

        SetWalkerArrayEntry( pWalkerEnumData, WalkerCount,
                             edgeArray.x, localSizes.y, localSizes.z,
                             globalDim.x, 0, 0,
                             pWalkerEnumData->TotalDimSize.x, globalDim.y, globalDim.z );
        WalkerCount++;
    }

    if( workDim > 1 )
    {
        //Y edge
        if( edgeArray.y != 0 )
        {
            pWalkerEnumData->TotalDimSize.y++;

            SetWalkerArrayEntry( pWalkerEnumData, WalkerCount,
                                 localSizes.x, edgeArray.y, localSizes.z,
                                 0, globalDim.y, 0,
                                 globalDim.x, pWalkerEnumData->TotalDimSize.y, globalDim.z );
            WalkerCount++;
        }

        //XY corner
        if( ( edgeArray.x != 0 ) & ( edgeArray.y != 0 ) )
        {
            SetWalkerArrayEntry( pWalkerEnumData, WalkerCount,
                                 edgeArray.x, edgeArray.y, localSizes.z,
                                 globalDim.x, globalDim.y, 0,
                                 pWalkerEnumData->TotalDimSize.x, pWalkerEnumData->TotalDimSize.y, globalDim.z );
            WalkerCount++;
        }
        if( workDim > 2 )
        {
            //Z edge
            if( edgeArray.z != 0 )
            {
                pWalkerEnumData->TotalDimSize.z++;

                SetWalkerArrayEntry( pWalkerEnumData, WalkerCount,
                                     localSizes.x, localSizes.y, edgeArray.z,
                                     0, 0, globalDim.z,
                                     globalDim.x, globalDim.y, pWalkerEnumData->TotalDimSize.z );
                WalkerCount++;
            }
            //XZ corner
            if( ( edgeArray.x != 0 ) & ( edgeArray.z != 0 ) )
            {
                SetWalkerArrayEntry( pWalkerEnumData, WalkerCount,
                                     edgeArray.x, localSizes.y, edgeArray.z,
                                     globalDim.x, 0, globalDim.z,
                                     pWalkerEnumData->TotalDimSize.x, globalDim.y, pWalkerEnumData->TotalDimSize.z );
                WalkerCount++;
            }
            //YZ corner
            if( ( edgeArray.y != 0 ) & ( edgeArray.z != 0 ) )
            {
                SetWalkerArrayEntry( pWalkerEnumData, WalkerCount,
                                     localSizes.x, edgeArray.y, edgeArray.z,
                                     0, globalDim.y, globalDim.z,
                                     globalDim.x, pWalkerEnumData->TotalDimSize.y, pWalkerEnumData->TotalDimSize.z );
                WalkerCount++;
            }
            //XYZ corner
            if( ( edgeArray.x != 0 ) & ( edgeArray.y != 0 ) & ( edgeArray.z != 0 ) )
            {
                SetWalkerArrayEntry( pWalkerEnumData, WalkerCount,
                                     edgeArray.x, edgeArray.y, edgeArray.z,
                                     globalDim.x, globalDim.y, globalDim.z,
                                     pWalkerEnumData->TotalDimSize.x, pWalkerEnumData->TotalDimSize.y, pWalkerEnumData->TotalDimSize.z );
                WalkerCount++;
            }
        }
    }

    *pWalkerCount = WalkerCount;
}
1495
1496//Compute number of Walkers needed for this command packet, this function assumes that command packet is initialized
1497inline int GetWalkerCount( __global IGIL_CommandHeader* pCommand )
1498{
1499    int WalkerCount = 1;
1500    for( uint dim = 0; ( dim < pCommand->m_range.m_dispatchDimensions ); dim++ )
1501    {
1502        if( ( pCommand->m_range.m_globalWorkSize[ dim ] % pCommand->m_range.m_localWorkSize[ dim ] ) != 0 )
1503        {
1504            WalkerCount *= 2;
1505        }
1506    }
1507    return WalkerCount;
1508}
1509
//This function initializes the command packet: it handles the NULL local-work-size case by choosing proper LWS sizes.
1511inline void InitializeCommandPacket( __global IGIL_CommandHeader* pCommand )
1512{
1513    EMULATION_ENTER_FUNCTION( );
1514
1515    //Check for NULL case
1516    if( pCommand->m_range.m_localWorkSize[ 0 ] == 0 )
1517    {
1518        //If null case detected use 16 x 1 x 1 lws
1519        if( pCommand->m_range.m_globalWorkSize[ 0 ] >= 16 )
1520        {
1521            pCommand->m_range.m_localWorkSize[ 0 ] = 16;
1522        }
1523        else
1524        {
1525            pCommand->m_range.m_localWorkSize[ 0 ] = pCommand->m_range.m_globalWorkSize[ 0 ];
1526        }
1527
1528        pCommand->m_range.m_localWorkSize[ 1 ] = 1;
1529        pCommand->m_range.m_localWorkSize[ 2 ] = 1;
1530    }
1531
1532}
1533
//Patches the profiling timestamp address and post-sync operation of a pipe control command.
1535void PatchPipeControlProfilingAddres( __global uint* secondaryBatchBuffer, uint slBoffset, ulong address, uint pipeControlOffset )
1536{
1537    EMULATION_ENTER_FUNCTION( );
1538
1539    //SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
1540    uint PostSyncDwordOffset = ( slBoffset / DWORD_SIZE_IN_BYTES ) + pipeControlOffset + PIPE_CONTROL_POST_SYNC_DWORD;
1541    uint DwordOffset         = ( slBoffset / DWORD_SIZE_IN_BYTES ) + pipeControlOffset + PIPE_CONTROL_ADDRESS_FIELD_DWORD;
1542    //Patch P_C event timestamp address in SLB in 3rd and 4th dword
1543    secondaryBatchBuffer[ DwordOffset ] = 0;
1544    patchDword( &secondaryBatchBuffer[ DwordOffset ], PIPE_CONTROL_GRAPHICS_ADDRESS_START_BIT, PIPE_CONTROL_GRAPHICS_ADDRESS_END_BIT, ( uint )( address >> PIPE_CONTROL_GRAPHICS_ADDRESS_START_BIT ) );
1545    DwordOffset++;
1546    secondaryBatchBuffer[ DwordOffset ] = 0;
1547    patchDword( &secondaryBatchBuffer[ DwordOffset ], PIPE_CONTROL_GRAPHICS_ADDRESS_HIGH_START_BIT, PIPE_CONTROL_GRAPHICS_ADDRESS_HIGH_END_BIT, ( address >> 32 ) );
1548
1549    //Patch Timestamp bit
1550    patchDword( &secondaryBatchBuffer[ PostSyncDwordOffset ], PIPE_CONTROL_POST_SYNC_START_BIT, PIPE_CONTROL_POST_SYNC_END_BIT, PIPE_CONTROL_GENERATE_TIME_STAMP );
1551}
1552
1553void DisablePostSyncBitInPipeControl( __global uint* secondaryBatchBuffer, uint slBoffset, uint pipeControlOffset )
1554{
1555    //SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
1556    uint PostSyncDwordOffset = ( slBoffset / DWORD_SIZE_IN_BYTES ) + pipeControlOffset + PIPE_CONTROL_POST_SYNC_DWORD;
1557    //Patch P_C event timestamp address in SLB in 3rd and 4th dword
1558    patchDword( &secondaryBatchBuffer[ PostSyncDwordOffset ], PIPE_CONTROL_POST_SYNC_START_BIT, PIPE_CONTROL_POST_SYNC_END_BIT, PIPE_CONTROL_NO_POSTSYNC_OPERATION );
1559}
1560
1561
//Patches all per-Walker state for one child kernel enqueue:
// - copies the block's sampler heap + constant buffer onto the DSH,
// - patches every cross-thread curbe token requested by the block's patch-token mask,
// - generates per-thread local IDs behind the constant buffer,
// - updates the interface descriptor and the GPGPU_WALKER / MEDIA_STATE_FLUSH
//   commands in the secondary (second-level) batch buffer.
//Returns 0 on success; -1 only in SCHEDULER_DEBUG_MODE when a curbe token is
//missing or the scalar-argument sizes mismatch.
int PatchDSH( __global IGIL_CommandQueue* pQueue,
              __global IGIL_KernelDataHeader * pKernelReflection,
              __global char* dsh,
              uint blockId,
              __global IGIL_CommandHeader* pCommandHeader,
              __global uint* secondaryBatchBuffer,
              uint dshOffset,
              uint intefaceDescriptorOffset,
              __local IGIL_WalkerEnumeration* pWalkerMain,
              uint walkerPos )
{
    EMULATION_ENTER_FUNCTION( );

    //Reflection data for this block: heap offsets, kernel data and the mask of
    //curbe tokens that need patching.
    __global IGIL_KernelAddressData* pKernelAddressData = IGIL_GetKernelAddressData( pKernelReflection, blockId );
    __global IGIL_KernelData* pBlockData                = IGIL_GetKernelData( pKernelReflection, blockId );
    ulong PatchMask                                     = pBlockData->m_PatchTokensMask;
    uint CurrentIndex                                   = 0;

    __global char* pDsh                                 = ( __global char* )&dsh[ dshOffset ];
    __global IGIL_KernelCurbeParams* pKernelCurbeParams = ( __global IGIL_KernelCurbeParams* )&pBlockData->m_data;

    //Scalar arguments live in m_data immediately after the event-dependency list.
    uint NumberOfDepencies                              = pCommandHeader->m_numDependencies;
    uint PatchOffset;
    __global char* pScalarData                          = ( __global char* )( &pCommandHeader->m_data[ NumberOfDepencies ] );
    __global char *pDshOnKRS                            = GetPtrToKernelReflectionOffset( pKernelAddressData->m_SamplerHeapOffset, pKernelReflection );

    uint SizeOfScalarsFromCurbe                         = 0;
    uint CurbeSize;

    uint TotalLocalSize;
    uint ThreadPayloadSize;
    uint NumberOfHWThreads;
    uint WorkDim;
    uint3 GlobalOffset;
    uint3 GlobalSizes;
    uint3 ActualLocalSize;

    GlobalOffset.x = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 0 ];
    GlobalOffset.y = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 1 ];
    GlobalOffset.z = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 2 ];

    GlobalSizes.x = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 0 ];
    GlobalSizes.y = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 1 ];
    GlobalSizes.z = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 2 ];

    //This Walker's workgroup size (may be an edge size, smaller than the enqueued LWS).
    ActualLocalSize.x = pWalkerMain->WalkerArray[ walkerPos ].ActualLocalSize.x;
    ActualLocalSize.y = pWalkerMain->WalkerArray[ walkerPos ].ActualLocalSize.y;
    ActualLocalSize.z = pWalkerMain->WalkerArray[ walkerPos ].ActualLocalSize.z;

    WorkDim = pCommandHeader->m_range.m_dispatchDimensions;
    TotalLocalSize = ActualLocalSize.x * ActualLocalSize.y * ActualLocalSize.z;

    //Round the HW thread count up when the workgroup size is not a SIMD multiple.
    NumberOfHWThreads = TotalLocalSize / pBlockData->m_SIMDSize;
    if( TotalLocalSize % pBlockData->m_SIMDSize )
    {
        NumberOfHWThreads++;
    }

    //Each HW thread carries 3 GRFs of local-ID payload (X, Y, Z channels).
    ThreadPayloadSize = NumberOfHWThreads * 3 * GRF_SIZE;
    //Copy constant buffer to designated area on DSH.

    //pDshOnKRS seems to be in the global address space, not the private address space
    //Copy SamplerState and Constant Buffer at once
    IGILLOCAL_MEMCPY_GTOG( pDsh, pDshOnKRS, pBlockData->m_sizeOfConstantBuffer + pBlockData->m_SizeOfSamplerHeap );

    if( PatchMask & SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT_MASK )
    {
        //Copy each scalar argument to its curbe patch offset; scalars are packed
        //back-to-back in m_data in token order.
        //NOTE(review): the loop is bounded only by the next token having a
        //different m_parameterType — confirm the token list is always terminated.
        while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT )
        {
            CurbeSize               = pKernelCurbeParams[ CurrentIndex ].m_parameterSize;
            SizeOfScalarsFromCurbe += CurbeSize;
            PatchOffset             = pKernelCurbeParams[ CurrentIndex ].m_patchOffset;

            //pScalarData is in the global address space, not the private address space
            IGILLOCAL_MEMCPY_GTOG( &pDsh[ PatchOffset ], pScalarData, CurbeSize );
            pScalarData            += CurbeSize;
            CurrentIndex++;
        }
#if SCHEDULER_DEBUG_MODE
        if( pCommandHeader->m_sizeOfScalarArguments != SizeOfScalarsFromCurbe )
        {
            pCommandHeader->m_commandState = SCHEDULER_CURBE_ARGUMENTS_SIZE_MISMATCH;
            return -1;
        }
#endif
    }

    //The remaining token groups are patched in ascending token order; each helper
    //advances CurrentIndex past the tokens it consumed.
    if( PatchMask & SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE_MASK )
    {
        CurrentIndex     = PatchLocalWorkSizes( CurrentIndex,
                                                SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE,
                                                pKernelCurbeParams,
                                                pDsh,
                                                pWalkerMain->LocalWorkSize.x,
                                                pWalkerMain->LocalWorkSize.y,
                                                pWalkerMain->LocalWorkSize.z,
                                                ActualLocalSize.x,
                                                ActualLocalSize.y,
                                                ActualLocalSize.z );
#if SCHEDULER_DEBUG_MODE
        if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
        {
            pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
            return -1;
        }
#endif
    }

    if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE_MASK )
    {
        CurrentIndex     = PatchDSH6Tokens( CurrentIndex,
                                            SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE,
                                            pKernelCurbeParams,
                                            pDsh,
                                            GlobalSizes.x,
                                            GlobalSizes.y,
                                            GlobalSizes.z );
#if SCHEDULER_DEBUG_MODE
        if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
        {
            pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
            return -1;
        }
#endif
    }
    if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS_MASK )
    {
        //TotalDimSize holds the workgroup counts per dimension for the whole NDRange.
        CurrentIndex     = PatchDSH6Tokens( CurrentIndex,
                                            SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS,
                                            pKernelCurbeParams,
                                            pDsh,
                                            pWalkerMain->TotalDimSize.x,
                                            pWalkerMain->TotalDimSize.y,
                                            pWalkerMain->TotalDimSize.z );
#if SCHEDULER_DEBUG_MODE
        if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
        {
            pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
            return -1;
        }
#endif
    }
    if( PatchMask & SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS_MASK )
    {
        CurrentIndex     = PatchDSH1Token( CurrentIndex,
                                           SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS,
                                           pKernelCurbeParams,
                                           pDsh,
                                           WorkDim );
#if SCHEDULER_DEBUG_MODE
        if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
        {
            pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
            return -1;
        }
#endif
    }

    if( PatchMask & SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES_MASK )
    {
        CurrentIndex     = PatchLocalMemEntities( CurrentIndex,
                                                  SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES,
                                                  pKernelCurbeParams,
                                                  pDsh,
                                                  pCommandHeader );
#if SCHEDULER_DEBUG_MODE
        if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
        {
            pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
            return -1;
        }
#endif
    }

    if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET_MASK )
    {
        CurrentIndex     = PatchDSH6Tokens( CurrentIndex,
                                            SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET,
                                            pKernelCurbeParams,
                                            pDsh,
                                            GlobalOffset.x,
                                            GlobalOffset.y,
                                            GlobalOffset.z );
#if SCHEDULER_DEBUG_MODE
        if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
        {
            pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
            return -1;
        }
#endif
    }

    if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS_MASK )
    {
        CurrentIndex     = PatchDSH1Token( CurrentIndex,
                                           SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS,
                                           pKernelCurbeParams,
                                           pDsh,
                                           NumberOfHWThreads );
#if SCHEDULER_DEBUG_MODE
        if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
        {
            pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
            return -1;
        }
#endif
    }

    if( PatchMask & SCHEDULER_DATA_PARAMETER_PARENT_EVENT_MASK )
    {
        CurrentIndex     = PatchDSH1Token( CurrentIndex,
                                           SCHEDULER_DATA_PARAMETER_PARENT_EVENT,
                                           pKernelCurbeParams,
                                           pDsh,
                                           pCommandHeader->m_event );
#if SCHEDULER_DEBUG_MODE
        if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
        {
            pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
            return -1;
        }
#endif
    }

    if( PatchMask & SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE_MASK )
    {
        //Enqueued LWS is the user-requested size, not this Walker's (possibly edge) size.
        CurrentIndex     = PatchDSH6Tokens( CurrentIndex,
                                            SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE,
                                            pKernelCurbeParams,
                                            pDsh,
                                            pWalkerMain->LocalWorkSize.x,
                                            pWalkerMain->LocalWorkSize.y,
                                            pWalkerMain->LocalWorkSize.z );
#if SCHEDULER_DEBUG_MODE
        if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
        {
            pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
            return -1;
        }
#endif
    }
    //NOTE(review): sibling checks test a *_MASK macro, but this one uses the raw
    //token value SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER — confirm they coincide.
    if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER )
    {
        if( pCommandHeader->m_numGlobalCapturedBuffer > 0 )
        {
            //Handle global pointers patching in stateless mode, info about layout in declaration of IGIL_CommandHeader
            __global    uint*  pGlobalIndexes = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments ] );
            __global    uint*  pGlobalPtrs    = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments + pCommandHeader->m_numGlobalCapturedBuffer ] );
            uint        StartIndex            = CurrentIndex;

            //Argument in command header are not in correct sequence, that's why proper key needs to be located
            for( uint glIdx = 0 ; glIdx < pCommandHeader->m_numGlobalCapturedBuffer; glIdx++)
            {
                //Reset CurrentIndex as we need to start from the beginning.
                CurrentIndex  = StartIndex;
                while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == COMPILER_DATA_PARAMETER_GLOBAL_SURFACE )
                {
                    //Patch only if exact match occurs
                    if( pKernelCurbeParams[ CurrentIndex ].m_sourceOffset == *pGlobalIndexes )
                    {
                        PatchOffset             = pKernelCurbeParams[ CurrentIndex ].m_patchOffset;
                        //64 bit patching
                        if( pKernelCurbeParams[ CurrentIndex ].m_parameterSize == 8 )
                        {
                            __global uint* pDst = (__global uint *) &pDsh[PatchOffset];
                            pDst[ 0 ] = pGlobalPtrs[ 0 ];
                            pDst[ 1 ] = pGlobalPtrs[ 1 ];
                        }
                        else
                        {
                            //32-bit pointer: only the low dword of the captured 64-bit value is written.
                            __global uint* pDst = ( __global uint* ) &pDsh[ PatchOffset ];
                            *pDst               = ( uint ) *pGlobalPtrs;
                        }
                    }
                    CurrentIndex++;
                }
                //Captured pointers are stored as 64-bit values (2 dwords), indexes as 1 dword each.
                pGlobalPtrs += 2;
                pGlobalIndexes++;
            }
        }
    }

    //Now generate local IDS
    //NOTE(review): only SIMD8 is special-cased; SIMD32 (if possible here) would
    //take the simd16 generator — confirm that is intended.
    if( pBlockData->m_SIMDSize == 8 )
    {
        generateLocalIDSsimd8( &pDsh[ pBlockData->m_sizeOfConstantBuffer ], ActualLocalSize, NumberOfHWThreads );
    }
    else
    {
        generateLocalIDSsimd16( &pDsh[ pBlockData->m_sizeOfConstantBuffer ], ActualLocalSize, NumberOfHWThreads );
    }

    //Total SLM = dynamic local-pointer sizes from the command + block's inline SLM.
    uint TotalSLMSize = pCommandHeader->m_totalLocalSize + pBlockData->m_InilineSLMSize;

    //Update Interface Descriptor Data with SLM size  / number of HW threads.
    CopyAndPatchIDData( dsh, blockId, NumberOfHWThreads, TotalSLMSize, intefaceDescriptorOffset, pQueue->m_controls.m_StartBlockID );

    //Add WalkerStartSize
    patchGpGpuWalker( pQueue->m_controls.m_SecondLevelBatchOffset, secondaryBatchBuffer, intefaceDescriptorOffset, pBlockData->m_SIMDSize,
                      TotalLocalSize, pWalkerMain->WalkerArray[ walkerPos ].WalkerDimSize, pWalkerMain->WalkerArray[ walkerPos ].WalkerStartPoint,
                      NumberOfHWThreads, pBlockData->m_sizeOfConstantBuffer + ThreadPayloadSize, dshOffset );

    PatchMediaStateFlush( pQueue->m_controls.m_SecondLevelBatchOffset, secondaryBatchBuffer, intefaceDescriptorOffset, SCHEDULER_MSF_INITIAL );
    PatchMediaStateFlush( pQueue->m_controls.m_SecondLevelBatchOffset, secondaryBatchBuffer, intefaceDescriptorOffset, SCHEDULER_MSF_SECOND );

    return 0;
}
1869
1870//Returns: isSRGB(ChannelOrder) ? ChannelOrder : 0;
1871inline uint GetSRGBChannelOrder( uint channelOrder )
1872{
1873    const uint AsSrgb = channelOrder - CL_sRGB;
1874    const uint NumSrgbFormats = CL_sBGRA - CL_sRGB;
1875    if( AsSrgb < NumSrgbFormats )
1876      return channelOrder;
1877    else
1878      return 0;
1879}
1880
1881void PatchDSHParallelWithDynamicDSH20( uint slbOffsetBase,
1882                                      uint dshOffsetBase,
1883                                      uint intefaceDescriptorOffsetBase,
1884                                      __global IGIL_KernelDataHeader * pKernelReflection,
1885                                      __global char* dsh,
1886                                      uint blockId,
1887                                      __global IGIL_CommandHeader* pCommandHeader,
1888                                      __global uint* secondaryBatchBuffer,
1889                                      __global IGIL_CommandQueue* pQueue,
1890                                      __global IGIL_EventPool* eventsPool,
1891                                      __global char* ssh,
1892                                      uint btOffset,
1893                                      __local IGIL_WalkerEnumeration* pWalkerEnum,
1894                                      __local uint* objectIds
1895#ifdef ENABLE_DEBUG_BUFFER
1896                                      , __global DebugDataBuffer* DebugQueue
1897#endif
1898                                      )
1899{
1900    EMULATION_ENTER_FUNCTION( );
1901
1902    __global IGIL_KernelAddressData* pKernelAddressData = IGIL_GetKernelAddressData( pKernelReflection, blockId );
1903    __global IGIL_KernelData* pBlockData                = IGIL_GetKernelData( pKernelReflection, blockId );
1904    ulong PatchMask                                     = pBlockData->m_PatchTokensMask;
1905    uint CurrentIndex                                   = 0;
1906
1907    __global IGIL_KernelCurbeParams* pKernelCurbeParams = ( __global IGIL_KernelCurbeParams* )&pBlockData->m_data;
1908
1909    uint NumberOfDepencies                              = pCommandHeader->m_numDependencies;
1910    uint PatchOffset;
1911
1912    uint CurbeSize;
1913
1914    //Get pointer to the Sampler State
1915    __global char *pDshOnKRS                            = GetPtrToKernelReflectionOffset( pKernelAddressData->m_SamplerHeapOffset, pKernelReflection );
1916
1917    uint WalkerCount                                    = GetWalkerCount( pCommandHeader );
1918    __global char *pKernelReflectionChar                = ( __global char * ) pKernelReflection;
1919    __global IGIL_KernelCurbeParams* pSSHdata           = ( __global  IGIL_KernelCurbeParams* )&pKernelReflectionChar[ pKernelAddressData->m_SSHTokensOffset ];
1920
1921    //WALKER variables that will be propagated to SLB
1922    uint3 LocalSizes;
1923    uint3 GlobalSizes;
1924    uint3 GlobalOffset;
1925    uint3 EdgeArray;
1926    uint3 XYZDim;
1927
1928    //X is always there
1929    GlobalOffset.x  = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 0 ];
1930    GlobalSizes.x   = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 0 ];
1931    LocalSizes.x    = ( uint )pCommandHeader->m_range.m_localWorkSize[ 0 ];
1932    EdgeArray.x     = GlobalSizes.x % LocalSizes.x;
1933    uint WorkDim    = pCommandHeader->m_range.m_dispatchDimensions;
1934    XYZDim.x        = GlobalSizes.x / LocalSizes.x;
1935
1936    if( WorkDim > 1 )
1937    {
1938        GlobalOffset.y  = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 1 ];
1939        GlobalSizes.y   = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 1 ];
1940        LocalSizes.y    = ( uint )pCommandHeader->m_range.m_localWorkSize[ 1 ];
1941        EdgeArray.y     = GlobalSizes.y % LocalSizes.y;
1942        XYZDim.y        = GlobalSizes.y / LocalSizes.y;
1943
1944        if( WorkDim > 2 )
1945        {
1946            GlobalOffset.z  = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 2 ];
1947            GlobalSizes.z   = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 2 ];
1948            LocalSizes.z    = ( uint )pCommandHeader->m_range.m_localWorkSize[ 2 ];
1949            XYZDim.z        = GlobalSizes.z / LocalSizes.z;
1950            EdgeArray.z     = GlobalSizes.z % LocalSizes.z;
1951        }
1952        else
1953        {
1954            GlobalOffset.z  = 0;
1955            GlobalSizes.z   = 1;
1956            LocalSizes.z    = 1;
1957            EdgeArray.z     = 0;
1958            XYZDim.z        = 1;
1959        }
1960    }
1961    else
1962    {
1963        GlobalOffset.y  = 0;
1964        GlobalOffset.z  = 0;
1965        GlobalSizes.y   = 1;
1966        GlobalSizes.z   = 1;
1967        LocalSizes.y    = 1;
1968        LocalSizes.z    = 1;
1969        EdgeArray.z     = 0;
1970        EdgeArray.y     = 0;
1971        XYZDim.z        = 1;
1972        XYZDim.y        = 1;
1973    }
1974
1975    if( get_local_id( 0 ) == 0 )
1976    {
1977        InitWalkerDataParallel( pWalkerEnum, WorkDim, &WalkerCount, EdgeArray, XYZDim, GlobalSizes, LocalSizes );
1978    }
1979
1980    uint SLBOffset                  = slbOffsetBase;
1981    uint DshOffset                  = dshOffsetBase;
1982    uint IntefaceDescriptorOffset   = intefaceDescriptorOffsetBase;
1983    __global uint* pArgumentIds     = NULL;
1984    __global uint* pObjectIds       = NULL;
1985    __global char* pLocalIdsOnDSH   = NULL;
1986
1987    uint SamplerHeapSize = pBlockData->m_SizeOfSamplerHeap;
1988
1989    //Object ID is in fact surface state offset for parent in case of surfaces using SSH, copy SSH from parent to child.
1990    //Copy binding table state of this kernel to allocated place on ssh
1991    GLOBAL_MEMCPY( &ssh[ btOffset ], &ssh[ pKernelAddressData->m_BTSoffset ] , pKernelAddressData->m_BTSize );
1992
1993    for( uint WalkerID = 0; WalkerID < WalkerCount; WalkerID++ )
1994    {
1995        //Update the offsets
1996        if( WalkerID > 0 )
1997        {
1998            SLBOffset += SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE;
1999            SLBOffset %= MAX_SLB_OFFSET;
2000            IntefaceDescriptorOffset++;
2001            DshOffset += MAX_DSH_SIZE_PER_ENQUEUE;
2002        }
2003
2004        __global char* pDsh = ( __global char* )&dsh[ DshOffset ];
2005        pLocalIdsOnDSH      = &pDsh[ pBlockData->m_sizeOfConstantBuffer + SamplerHeapSize ];
2006
2007        //Copy Sampler State  and constant buffer on all threads
2008        GLOBAL_MEMCPY( pDsh, pDshOnKRS, pBlockData->m_sizeOfConstantBuffer + SamplerHeapSize );
2009
2010        barrier( CLK_GLOBAL_MEM_FENCE );
2011
2012        //Update BorderColorPointer on all threads
2013        if( pBlockData->m_numberOfSamplerStates )
2014        {
2015            uint SamplerId = get_local_id( 0 );
2016            __global uint* pSamplerState;
2017            while( SamplerId < pBlockData->m_numberOfSamplerStates )
2018            {
2019                pSamplerState = ( __global uint* )&dsh[ DshOffset + pBlockData->m_SamplerStateArrayOffsetOnDSH + SamplerId * OCLRT_SIZEOF_SAMPLER_STATE ];
2020                uint PatchValue = DshOffset >> 5;
2021                patchDword( &pSamplerState[ SAMPLER_STATE_DESCRIPTOR_BORDER_COLOR_POINTER_DWORD ], 5, 31, PatchValue );
2022                SamplerId += PARALLEL_SCHEDULER_COMPILATION_SIZE_20 * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20;
2023            }
2024        }
2025
2026        //Setup SSH if needed, do it only for first Walker as all Walkers will re-use the same binding table layout.
2027        if( ( pCommandHeader->m_numGlobalArguments > 0 ) & ( WalkerID == 0 ) )
2028        {
2029            //Global arguments are after scalars, global pointers slm sizes and events
2030            uint offset                 = pCommandHeader->m_numDependencies + pCommandHeader->m_numScalarArguments + pCommandHeader->m_numOfLocalPtrSizes + ( pCommandHeader->m_numGlobalCapturedBuffer * ( sizeof( ulong ) +  sizeof( uint ) ) / sizeof( uint ) );
2031
2032            pArgumentIds = &pCommandHeader->m_data[ offset ];
2033            //Object IDS are located after Argument IDs
2034            pObjectIds   = &pCommandHeader->m_data[ offset + pCommandHeader->m_numGlobalArguments ];
2035
2036            //Setup local memory for fast access for Curbe patching
2037            uint ArgId = get_local_id( 0 );
2038
2039            //Only third group Updates ObjectIDS, this will be synchronized with condition below
2040            if( ( ArgId >> HW_GROUP_ID_SHIFT( PARALLEL_SCHEDULER_COMPILATION_SIZE_20 ) ) == 2 )
2041            {
2042                ArgId = ArgId - ( PARALLEL_SCHEDULER_COMPILATION_SIZE_20 << 1 );
2043                while( ArgId < pCommandHeader->m_numGlobalArguments )
2044                {
2045                    objectIds[ pArgumentIds[ ArgId ] ] = pObjectIds[ ArgId ];
2046                    ArgId += PARALLEL_SCHEDULER_COMPILATION_SIZE_20;
2047                }
2048            }
2049#ifdef SCHEDULER_EMULATION
2050            //Synchronization needed for Emulation, ObjectIDS needs to be set by whole HW group, on GPU there is implicit synchronization in HW group
2051            barrier( CLK_GLOBAL_MEM_FENCE );
2052#endif
2053            if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 * 2 )
2054            {
2055                __global uint* pBindingTable = ( __global uint* ) &ssh[ btOffset ];
2056
2057
2058                //To properly set up binding table point to parents surface state heap
2059                for( uint ArgumentID = 0 ; ArgumentID < pCommandHeader->m_numGlobalArguments; ArgumentID++ )
2060                {
2061                    uint ArgId      = pArgumentIds[ ArgumentID ];
2062
2063                    //Locate proper Arg ID
2064                    //Get ssh offset, lookup table already provided
2065                    if( objectIds[ ArgId ] < MAX_SSH_PER_KERNEL_SIZE )
2066                    {
2067                        if( pSSHdata[ ArgId ].m_sourceOffset == ArgId )
2068                        {
2069                            pBindingTable[ pSSHdata[ ArgId ].m_patchOffset ] = objectIds[ ArgId ];
2070                        }
2071                        else
2072                        {
2073                            pQueue->m_controls.m_ErrorCode += 10;
2074                            uint CurrentArg = 0;
2075                            while( CurrentArg < pKernelAddressData->m_BTSize / 4 )
2076                            {
2077                                if( pSSHdata[ CurrentArg ].m_sourceOffset == ArgId )
2078                                {
2079                                    pBindingTable[ pSSHdata[ CurrentArg ].m_patchOffset ] = objectIds[ ArgId ];
2080                                    break;
2081                                }
2082                                CurrentArg++;
2083                            }
2084                        }
2085                    }
2086                }
2087            }
2088        }
2089
2090        if( ( PatchMask & SCHEDULER_DATA_PARAMETER_SAMPLER_MASK ) )
2091        {
2092            if( get_local_id( 0 ) == 2 * PARALLEL_SCHEDULER_COMPILATION_SIZE_20 )
2093            {
2094                for( uint ArgumentID = 0; ArgumentID < pCommandHeader->m_numGlobalArguments; ArgumentID++ )
2095                {
2096                    uint ArgId      = pArgumentIds[ ArgumentID ];
2097                    if( ( objectIds[ ArgId ] >= MAX_SSH_PER_KERNEL_SIZE ) )
2098                    {
2099                        uint SamplerCount = 0;
2100                        //Get pointer to Parent's samplers ( arguments ) data stored on KRS
2101                        __global IGIL_SamplerParams* pSamplerParamsOnKRS    = ( __global IGIL_SamplerParams* )GetPtrToKernelReflectionOffset( pKernelAddressData->m_SamplerParamsOffset, pKernelReflection );
2102
2103                        //Iterate through all samplers passed from parent and copy state to proper SSA offset
2104                        while( pKernelReflection->m_ParentSamplerCount > SamplerCount )
2105                        {
2106                            //Get offset in parent's SSA from ObjectID, offset to beginning of SSA is included ( before SSA is BorderColorPointer ) so this is relative to parent's DSH heap
2107                            PatchOffset = objectIds[ ArgId ] - MAX_SSH_PER_KERNEL_SIZE;
2108
2109                            if( pSamplerParamsOnKRS->m_ArgID == ArgId )
2110                            {
2111                                IGILLOCAL_MEMCPY_GTOG( &pDsh[ pSamplerParamsOnKRS->m_SamplerStateOffset ], &dsh[ pQueue->m_controls.m_ParentDSHOffset + PatchOffset ], OCLRT_SIZEOF_SAMPLER_STATE );
2112                                break;
2113                            }
2114                            pSamplerParamsOnKRS = pSamplerParamsOnKRS + 1;
2115                            SamplerCount        = SamplerCount + 1;
2116                        }
2117                    }
2118                }
2119            }
2120        }
2121        __global    char*   pScalarData                     = ( __global char* ) ( &pCommandHeader->m_data[ NumberOfDepencies ] );
2122
2123        CurrentIndex = 0;
2124        uint TotalLocalSize = pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.x *
2125                              pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.y *
2126                              pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.z;
2127
2128        uint NumberOfHWThreads = TotalLocalSize / pBlockData->m_SIMDSize;
2129
2130        if( TotalLocalSize % pBlockData->m_SIMDSize != 0 )
2131        {
2132            NumberOfHWThreads++;
2133        }
2134
2135        uint ThreadPayloadSize = NumberOfHWThreads * pBlockData->m_PayloadSize;
2136
2137        //Move pointer to Constant Buffer Offset
2138        pDsh =  ( __global char* )&dsh[ DshOffset + SamplerHeapSize ];
2139
2140        if( ( get_local_id( 0 ) >= PARALLEL_SCHEDULER_COMPILATION_SIZE_20 ) & ( get_local_id( 0 ) < PARALLEL_SCHEDULER_COMPILATION_SIZE_20 + 6 ) )
2141        {
2142            if( PatchMask & SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT_MASK )
2143            {
2144                while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT )
2145                {
2146                    CurbeSize               = pKernelCurbeParams[ CurrentIndex ].m_parameterSize;
2147                    PatchOffset             = pKernelCurbeParams[ CurrentIndex ].m_patchOffset;
2148                    IGILLOCAL_MEMCPY_GTOG( &pDsh[ PatchOffset ], pScalarData, CurbeSize );
2149                    pScalarData             += CurbeSize;
2150                    CurrentIndex++;
2151                }
2152            }
2153            if( PatchMask & SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE_MASK )
2154            {
2155                CurrentIndex     = PatchLocalWorkSizesParallel( CurrentIndex,
2156                                                                SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE,
2157                                                                pKernelCurbeParams,
2158                                                                pDsh,
2159                                                                LocalSizes.x,
2160                                                                LocalSizes.y,
2161                                                                LocalSizes.z,
2162                                                                pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.x,
2163                                                                pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.y,
2164                                                                pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.z );
2165            }
2166            if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE_MASK )
2167            {
2168                CurrentIndex     = PatchDSH6TokensParallel20( CurrentIndex,
2169                                                              SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE,
2170                                                              pKernelCurbeParams,
2171                                                              pDsh,
2172                                                              GlobalSizes.x,
2173                                                              GlobalSizes.y,
2174                                                              GlobalSizes.z );
2175            }
2176            if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS_MASK )
2177            {
2178                CurrentIndex     = PatchDSH6TokensParallel20( CurrentIndex,
2179                                                              SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS,
2180                                                              pKernelCurbeParams,
2181                                                              pDsh,
2182                                                              pWalkerEnum->TotalDimSize.x,
2183                                                              pWalkerEnum->TotalDimSize.y,
2184                                                              pWalkerEnum->TotalDimSize.z );
2185            }
2186            if( PatchMask & SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS_MASK )
2187            {
2188                CurrentIndex     = PatchDSH1TokenParallel20( CurrentIndex,
2189                                                             SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS,
2190                                                             pKernelCurbeParams,
2191                                                             pDsh,
2192                                                             WorkDim );
2193            }
2194            if( PatchMask & SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES_MASK )
2195            {
2196                CurrentIndex     = PatchLocalMemEntities( CurrentIndex,
2197                                                          SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES,
2198                                                          pKernelCurbeParams,
2199                                                          pDsh,
2200                                                          pCommandHeader );
2201            }
2202            if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET_MASK )
2203            {
2204                CurrentIndex     = PatchDSH6TokensParallel20( CurrentIndex,
2205                                                              SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET,
2206                                                              pKernelCurbeParams,
2207                                                              pDsh,
2208                                                              GlobalOffset.x,
2209                                                              GlobalOffset.y,
2210                                                              GlobalOffset.z );
2211            }
2212            if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS_MASK )
2213            {
2214                CurrentIndex     = PatchDSH1TokenParallel20( CurrentIndex,
2215                                                             SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS,
2216                                                             pKernelCurbeParams,
2217                                                             pDsh,
2218                                                             NumberOfHWThreads );
2219            }
2220            if( PatchMask & SCHEDULER_DATA_PARAMETER_PARENT_EVENT_MASK )
2221            {
2222                CurrentIndex     = PatchDSH1TokenParallel20( CurrentIndex,
2223                                                             SCHEDULER_DATA_PARAMETER_PARENT_EVENT,
2224                                                             pKernelCurbeParams,
2225                                                             pDsh,
2226                                                             pCommandHeader->m_event );
2227            }
2228            if( PatchMask & SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE_MASK )
2229            {
2230                CurrentIndex     = PatchDSH6TokensParallel20( CurrentIndex,
2231                                                              SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE,
2232                                                              pKernelCurbeParams,
2233                                                              pDsh,
2234                                                              LocalSizes.x,
2235                                                              LocalSizes.y,
2236                                                              LocalSizes.z );
2237            }
2238            if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER )
2239            {
2240                if( pCommandHeader->m_numGlobalCapturedBuffer > 0 )
2241                {
2242                    //Handle global pointers patching in stateless mode, info about layout in declaration of IGIL_CommandHeader
2243                    __global    uint*  pGlobalIndexes = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments ] );
2244                    __global    uint*  pGlobalPtrs    = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments + pCommandHeader->m_numGlobalCapturedBuffer ] );
2245                    uint        StartIndex            = CurrentIndex;
2246
2247                    //Argument in command header are not in correct sequence, that's why proper key needs to be located
2248                    for( uint glIdx = 0 ; glIdx < pCommandHeader->m_numGlobalCapturedBuffer; glIdx++)
2249                    {
2250                        //Reset CurrentIndex as we need to start from the beginning.
2251                        CurrentIndex  = StartIndex;
2252                        while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == COMPILER_DATA_PARAMETER_GLOBAL_SURFACE )
2253                        {
2254                            //Patch only if exact match occurs
2255                            if( pKernelCurbeParams[ CurrentIndex ].m_sourceOffset == *pGlobalIndexes )
2256                            {
2257                                PatchOffset             = pKernelCurbeParams[ CurrentIndex ].m_patchOffset;
2258                                //64 bit patching
2259                                if( pKernelCurbeParams[ CurrentIndex ].m_parameterSize == 8 )
2260                                {
2261                                    __global uint* pDst = (__global uint *) &pDsh[PatchOffset];
2262                                    pDst[0] = pGlobalPtrs[0];
2263                                    pDst[1] = pGlobalPtrs[1];
2264                                }
2265                                else
2266                                {
2267                                    __global uint* pDst = ( __global uint* ) &pDsh[ PatchOffset ];
2268                                    *pDst               = ( uint ) *pGlobalPtrs;
2269                                }
2270                            }
2271                            CurrentIndex++;
2272                        }
2273                        pGlobalPtrs += 2;
2274                        pGlobalIndexes++;
2275                    }
2276                }
2277                while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == COMPILER_DATA_PARAMETER_GLOBAL_SURFACE )
2278                {
2279                    CurrentIndex++;
2280                }
2281            }
2282
2283            //Patch images curbe entries
2284            if( ( PatchMask & SCHEDULER_DATA_PARAMETER_IMAGE_CURBE_ENTRIES ) | ( PatchMask & SCHEDULER_DATA_PARAMETER_SAMPLER_MASK ) )
2285            {
2286                if( ( pArgumentIds != NULL ) & ( pObjectIds != NULL ) )
2287                {
2288                    //pKernelReflectionChar is a global address pointer
2289                    __global IGIL_ImageParamters      *pImageParams           = ( __global IGIL_ImageParamters * ) &pKernelReflectionChar[ pKernelReflection->m_ParentImageDataOffset ];
2290                    __global IGIL_ParentSamplerParams *pParentSamplerParams   = ( __global IGIL_ParentSamplerParams* ) &pKernelReflectionChar[ pKernelReflection->m_ParentSamplerParamsOffset ];
2291                    //First obtain argument ID
2292                    uint WorkID = get_local_id( 0 ) - PARALLEL_SCHEDULER_COMPILATION_SIZE_20;
2293                    while( WorkID + CurrentIndex < pBlockData->m_numberOfCurbeTokens )
2294                    {
2295                        uint ArgId              = pKernelCurbeParams[ CurrentIndex + WorkID ].m_sourceOffset;
2296                        uint ObjectID           = objectIds[ ArgId ];
2297                        uint CurrentImage       = 0;
2298                        uint CurrentSampler     = 0;
2299
2300                        uint PatchValue         = 0;
2301                        uint TokenType          = pKernelCurbeParams[ CurrentIndex + WorkID ].m_parameterType;
2302                        uint PatchOffset        = pKernelCurbeParams[ CurrentIndex + WorkID ].m_patchOffset;
2303                        uint PatchValueInvalid  = 0;
2304
2305                        //If Images
2306                        if( ObjectID < OCLRT_IMAGE_MAX_OBJECT_ID )
2307                        {
2308                            //Locate proper parent Image
2309                            while( ( pImageParams[ CurrentImage ].m_ObjectID != ObjectID ) & ( CurrentImage < pKernelReflection->m_ParentKernelImageCount ) )
2310                            {
2311                                CurrentImage++;
2312                            }
2313                            //Proper image is located under CurrentImage patch the token
2314
2315                            if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_WIDTH )
2316                            {
2317                                PatchValue  = pImageParams[ CurrentImage ].m_Width;
2318                            }
2319                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_HEIGHT )
2320                            {
2321                                PatchValue  = pImageParams[ CurrentImage ].m_Height;
2322                            }
2323                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_DEPTH )
2324                            {
2325                                PatchValue  = pImageParams[ CurrentImage ].m_Depth;
2326                            }
2327                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE )
2328                            {
2329                                PatchValue  = pImageParams[ CurrentImage ].m_ChannelDataType;
2330                            }
2331                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_ORDER )
2332                            {
2333                                PatchValue  = pImageParams[ CurrentImage ].m_ChannelOrder;
2334                            }
2335                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_SRGB_CHANNEL_ORDER )
2336                            {
2337                                PatchValue  = GetSRGBChannelOrder( pImageParams[ CurrentImage ].m_ChannelOrder );
2338                            }
2339                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_ARRAY_SIZE )
2340                            {
2341                                PatchValue  = pImageParams[ CurrentImage ].m_ArraySize;
2342                            }
2343                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_NUM_SAMPLES )
2344                            {
2345                                PatchValue  = pImageParams[ CurrentImage ].m_NumSamples;
2346                            }
2347                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_NUM_MIP_LEVELS )
2348                            {
2349                                PatchValue  = pImageParams[ CurrentImage ].m_NumMipLevels;
2350                            }
2351                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_OBJECT_ID )
2352                            {
2353                                PatchValue  = ObjectID;
2354                            }
2355                            else
2356                            {
2357                                PatchValueInvalid = 1;
2358                            }
2359                        }
2360                        //If Sampler
2361                        else if( ObjectID >= OCLRT_SAMPLER_MIN_OBJECT_ID )
2362                        {
2363                            //Mark PatchValue invalid if SamplerParams will not be found
2364                            PatchValueInvalid = 1;
2365                            //Locate proper parent Image
2366                            while( CurrentSampler < pKernelReflection->m_ParentSamplerCount )
2367                            {
2368                                if( pParentSamplerParams[ CurrentSampler ].m_ObjectID == ObjectID )
2369                                {
2370                                    PatchValueInvalid = 0;
2371                                    if( TokenType == DATA_PARAMETER_SAMPLER_ADDRESS_MODE )
2372                                    {
2373                                        PatchValue = pParentSamplerParams[ CurrentSampler ].m_AddressingMode;
2374                                    }
2375                                    else if( TokenType == DATA_PARAMETER_SAMPLER_NORMALIZED_COORDS )
2376                                    {
2377                                        PatchValue = pParentSamplerParams[ CurrentSampler ].NormalizedCoords;
2378                                    }
2379                                    else if( TokenType == DATA_PARAMETER_SAMPLER_COORDINATE_SNAP_WA_REQUIRED )
2380                                    {
2381                                        PatchValue = pParentSamplerParams[ CurrentSampler ].CoordinateSnapRequired;
2382                                    }
2383                                    else if( TokenType == SCHEDULER_DATA_PARAMETER_SAMPLER_OBJECT_ID )
2384                                    {
2385                                        PatchValue = ObjectID;
2386                                    }
2387                                    else
2388                                    {
2389                                        PatchValueInvalid = 1;
2390                                    }
2391                                    CurrentSampler = pKernelReflection->m_ParentSamplerCount;
2392                                }
2393                                CurrentSampler++;
2394                            }
2395                        }
2396                        else
2397                        {
2398                            PatchValueInvalid = 1;
2399                        }
2400
2401                        if( PatchValueInvalid == 0 )
2402                        {
2403                            *( __global uint * ) ( &pDsh[ PatchOffset ] ) = PatchValue;
2404                        }
2405                        CurrentIndex += 6;
2406                    }
2407
2408                }
2409                else
2410                {
2411                    pQueue->m_controls.m_ErrorCode += 7;
2412                }
2413            }
2414        }
2415#ifdef SCHEDULER_EMULATION
2416        barrier( CLK_GLOBAL_MEM_FENCE );
2417#endif
2418
2419        if( get_local_id( 0 ) == 0 )
2420        {
2421#if defined WA_LRI_COMMANDS_EXIST
2422            bool ShouldDisablePreemption = false;
2423#endif
2424            //Profiling support
2425            if( pQueue->m_controls.m_IsProfilingEnabled != 0 )
2426            {
2427                bool DisableTimeStampStart = true;
2428                bool DisableTimeStampEnd   = true;
2429                if( ( ( uint )pCommandHeader->m_event != IGIL_EVENT_INVALID_HANDLE ) & ( ( WalkerID == 0 ) | ( WalkerID == WalkerCount - 1 ) ) )
2430                {
2431                    //Event is propagated to childs as "parent event", to avoid overwriting the same start value, only generate timestamp write
2432                    //For the first command for this event, this means we look for event with no children ( so compare to 1 ).
2433                    clk_event_t EventID = __builtin_astype( ( void* ) ( ( ulong ) pCommandHeader->m_event ), clk_event_t );
2434                    __global IGIL_DeviceEvent *events = TEMP_IGIL_GetDeviceEvents( eventsPool );
2435
2436                    if( events[ ( uint )(size_t)__builtin_astype( EventID, void* ) ].m_numChildren == 1 )
2437                    {
2438#if defined WA_LRI_COMMANDS_EXIST && defined WA_PROFILING_PREEMPTION
2439                        //This is a case, where profiling of block kernels occurs - presence of event in EM workload
2440                        //In such case, disable preemption around all WALKERs for that block kernel and event
2441                        ShouldDisablePreemption = true;
2442#endif
2443                        if( WalkerID == 0 )
2444                        {
2445                            //Emit pipecontrol with timestamp write
2446                            ulong Address = ( ulong )&( events[ ( uint )(size_t)__builtin_astype( EventID, void* ) ].m_profilingCmdStart );
2447                            //Timestamp start
2448                            PatchPipeControlProfilingAddres( secondaryBatchBuffer,
2449                                                             SLBOffset,
2450                                                             Address,
2451                                                             PIPE_CONTROL_FOR_TIMESTAMP_START_OFFSET );
2452                            DisableTimeStampStart = false;
2453                        }
2454                        if( WalkerID == WalkerCount - 1 )
2455                        {
2456                            ulong Address = ( ulong )&( events[ ( uint )(size_t)__builtin_astype( EventID, void* ) ].m_profilingCmdEnd );
2457                            //Timestamp end
2458                            PatchPipeControlProfilingAddres( secondaryBatchBuffer,
2459                                                             SLBOffset,
2460                                                             Address,
2461                                                             PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
2462                            DisableTimeStampEnd = false;
2463                        }
2464                    }
2465                }
2466                if( DisableTimeStampStart )
2467                {
2468                    DisablePostSyncBitInPipeControl( secondaryBatchBuffer,
2469                                                     SLBOffset,
2470                                                     PIPE_CONTROL_FOR_TIMESTAMP_START_OFFSET );
2471                }
2472                if( DisableTimeStampEnd )
2473                {
2474                    DisablePostSyncBitInPipeControl( secondaryBatchBuffer,
2475                                                     SLBOffset,
2476                                                     PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
2477                }
2478            }
2479            else
2480            {
2481                //Optimized path, in case block can be run concurently noop pipe control after such block.
2482                uint DwordOffset = SLBOffset / DWORD_SIZE_IN_BYTES;
2483
2484                if( pBlockData->m_CanRunConcurently != 0 )
2485                {
2486                    NOOPCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
2487                }
2488                else
2489                {
2490                    PutCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
2491                }
2492            }
2493
2494#if defined WA_LRI_COMMANDS_EXIST
2495            bool NoopPreemptionDisabling = true;
2496            bool NoopPreemptionEnabling = true;
2497
2498#if defined WA_KERNEL_PREEMPTION
2499            //This is case, where block kernel should have disabled preemption because of its sampler usage around all WALKERs of that block kernel
2500            //Preemption should be disabled when EM event profiling is used OR kernel data indicate such behavior
2501            ShouldDisablePreemption |= ( pBlockData->m_DisablePreemption != 0 );
2502#endif
2503
2504#if defined WA_PROFILING_PREEMPTION
2505            //m_EventTimestampAddress != NULL means profiling of the whole workload is enabled (preemption around whole chained BB is disabled)
2506            //So disabling preemption should be permitted only when workload profiling is off, in other cases noop all LRI commands
2507            //For m_EventTimestampAddress != NULL preemption is enabled before BB_END
2508            ShouldDisablePreemption &= ( pQueue->m_controls.m_EventTimestampAddress == 0 );
2509#endif
2510
2511            if( ShouldDisablePreemption != false )
2512            {
2513                if( WalkerID == 0 )
2514                {
2515                    SetDisablePreemptionRegister( SLBOffset, secondaryBatchBuffer );
2516                    NoopPreemptionDisabling = false;
2517                }
2518
2519                if( WalkerID == WalkerCount - 1 )
2520                {
2521                    SetEnablePreemptionRegister( SLBOffset, secondaryBatchBuffer );
2522                    NoopPreemptionEnabling = false;
2523                }
2524            }
2525
2526            if( NoopPreemptionDisabling )
2527            {
2528                NoopPreemptionCommand( SLBOffset, IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
2529            }
2530
2531            if( NoopPreemptionEnabling )
2532            {
2533                NoopPreemptionCommand( SLBOffset, IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
2534            }
2535#endif //WA_LRI_COMMANDS_EXIST
2536        }
2537
2538        //Witems from 0 to 16 are responsible for local ids generation.
2539        if( ( get_local_id( 0 ) < 16 ) & ( pBlockData->m_NeedLocalIDS != 0 ) )
2540        {
2541            //Now generate local IDS
2542            generateLocalIDSParallel20( pLocalIdsOnDSH, pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize, NumberOfHWThreads, pBlockData->m_SIMDSize );
2543        }
2544        //3rd HW thread will take care of patching media curbe load and GPPGU_WALKER command
2545        if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 * 2 )
2546        {
2547
2548            uint TotalSLMSize = pCommandHeader->m_totalLocalSize + pBlockData->m_InilineSLMSize;
2549            //Update Interface Descriptor Data with SLM size  / number of HW threads.
2550            CopyAndPatchIDData20(dsh, blockId, NumberOfHWThreads, TotalSLMSize, IntefaceDescriptorOffset,
2551                                 pQueue->m_controls.m_StartBlockID, btOffset, DshOffset,
2552                                 pBlockData->m_numberOfSamplerStates
2553#ifdef ENABLE_DEBUG_BUFFER
2554                                 ,
2555                                 DebugQueue
2556#endif
2557            );
2558
2559            patchGpGpuWalker( SLBOffset, secondaryBatchBuffer, IntefaceDescriptorOffset, pBlockData->m_SIMDSize,
2560                              TotalLocalSize, pWalkerEnum->WalkerArray[ WalkerID ].WalkerDimSize, pWalkerEnum->WalkerArray[ WalkerID ].WalkerStartPoint,
2561                              NumberOfHWThreads, pBlockData->m_sizeOfConstantBuffer + ThreadPayloadSize, SamplerHeapSize + DshOffset );
2562
2563            PatchMediaStateFlush( SLBOffset, secondaryBatchBuffer, IntefaceDescriptorOffset, SCHEDULER_MSF_INITIAL );
2564            PatchMediaStateFlush( SLBOffset, secondaryBatchBuffer, IntefaceDescriptorOffset, SCHEDULER_MSF_SECOND );
2565        }
2566    }
2567}
2568
2569uint CheckEventStatus( __global IGIL_CommandHeader* pCommand,
2570                       __global IGIL_EventPool* eventsPool )
2571{
2572    if( pCommand->m_numDependencies == 0 )
2573    {
2574        return 0;
2575    }
2576    else
2577    {
2578         __global IGIL_DeviceEvent* pDeviceEvent;
2579         //Events are stored at the begining of command packet dynamic payload
2580         for( uint i = 0; i < pCommand->m_numDependencies; i++ )
2581         {
2582             pDeviceEvent = TEMP_IGIL_GetDeviceEvent( eventsPool, pCommand->m_data[ i ] );
2583             if( pDeviceEvent->m_state != CL_COMPLETE )
2584             {
2585                 return 1;
2586             }
2587         }
2588    }
2589    return 0;
2590}
2591
2592void DecreaseEventDependenciesParallel( __global IGIL_CommandHeader* pCommand,
2593                                        __global IGIL_EventPool* eventsPool )
2594{
2595     __global IGIL_DeviceEvent* pDeviceEvent;
2596
2597     //Events are stored at the begining of command packet dynamic payload
2598     for( uint i = 0; i < pCommand->m_numDependencies; i++ )
2599     {
2600         pDeviceEvent = TEMP_IGIL_GetDeviceEvent( eventsPool, pCommand->m_data[ i ] );
2601         int OldDependants = atomic_dec( &pDeviceEvent->m_numDependents );
2602
2603         if( ( pDeviceEvent->m_refCount <= 0 ) &
2604             ( ( OldDependants - 1 ) <= 0 ) &
2605             ( pDeviceEvent->m_numChildren <= 0 ) )
2606             {
2607                 TEMP_IGIL_FreeEvent( __builtin_astype( ( void* )( ( ulong )pCommand->m_data[ i ] ), clk_event_t ), eventsPool );
2608             }
2609    }
2610}
2611
//Update the status of this event and of all events that depend on it
2613void UpdateEventsTreeStatusParallel( clk_event_t eventId, __global IGIL_EventPool* eventsPool, bool isProfilingEnabled )
2614{
2615    __global IGIL_DeviceEvent *events = TEMP_IGIL_GetDeviceEvents( eventsPool );
2616    __global IGIL_DeviceEvent *pEvent;
2617    do
2618    {
2619        pEvent = &events[ (uint) (size_t)__builtin_astype( eventId, void* ) ];
2620
2621        int OldNumChild = atomic_dec( &pEvent->m_numChildren );
2622        if( ( OldNumChild - 1 ) <= 0 )
2623        {
2624            pEvent->m_state = CL_COMPLETE;
2625
2626            if( ( pEvent->m_refCount <= 0 ) &
2627                ( pEvent->m_numDependents <= 0 ) &
2628                ( pEvent->m_numChildren <= 0 ) )
2629            {
2630                TEMP_IGIL_FreeEvent( eventId, eventsPool );
2631            }
2632            //This event transitions to CL_COMPLETE state, update it profiling informations.
2633            if( isProfilingEnabled != 0 )
2634            {
2635                //CL COMPLETE time is before this scheduler starts
2636                pEvent->m_profilingCmdComplete = eventsPool->m_CLcompleteTimestamp;
2637
2638                //Check if this event has profiling pointer, if so update profiling data, all times should be there atm
2639                if( pEvent->m_pProfiling != 0 )
2640                {
2641                    __global ulong* retValues = ( __global ulong * )pEvent->m_pProfiling;
2642
2643                    ulong StartTime                = pEvent->m_profilingCmdStart;
2644                    ulong EndTime                  = pEvent->m_profilingCmdEnd;
2645                    ulong CompleteTime             = pEvent->m_profilingCmdComplete;
2646                    ulong CLEndTransitionTime      = 0;
2647                    ulong CLCompleteTransitionTime = 0;
2648                    //Check if timer didn't reset by hitting max value
2649                    if( CompleteTime > StartTime )
2650                    {
2651                        CLEndTransitionTime      = EndTime - StartTime;
2652                        CLCompleteTransitionTime = CompleteTime - StartTime;
2653                    }
2654                    //If we hit this else it means that GPU timer reset to 0, compute proper delta
2655                    else
2656                    {
2657                        if( EndTime < StartTime )
2658                        {
2659                            CLEndTransitionTime = PROFILING_MAX_TIMER_VALUE - StartTime + EndTime;
2660                        }
2661                        else
2662                        {
2663                            CLEndTransitionTime = EndTime - StartTime;
2664                        }
2665                        CLCompleteTransitionTime = PROFILING_MAX_TIMER_VALUE - StartTime + CompleteTime;
2666                    }
2667                    //First value is END - START timestamp
2668                    retValues[ 0 ] = ( ulong )( ( float )CLEndTransitionTime * eventsPool->m_TimestampResolution );
2669                    //Second value is COMPLETE - START timestamp
2670                    retValues[ 1 ] = ( ulong )( ( float )CLCompleteTransitionTime * eventsPool->m_TimestampResolution );
2671                }
2672            }
2673            //Signal parent because we completed
2674            eventId = __builtin_astype( ( void* )( ( ulong )pEvent->m_parentEvent ), clk_event_t );
2675        }
2676    }
2677    while ( ( ( uint )(size_t)__builtin_astype( eventId, void* ) != IGIL_EVENT_INVALID_HANDLE ) & ( pEvent->m_numChildren <= 0 ) );
2678}
2679
//Software barrier across ALL workgroups of the dispatch (hardware barrier() only
//synchronizes within one workgroup). Protocol:
//  1. every workgroup raises its flag on syncSurface,
//  2. the last workgroup (all its work items cooperatively) spins until every flag
//     is raised, then clears all flags,
//  3. every other workgroup spins until its own flag is cleared.
//syncSurface must hold at least get_num_groups(0) uints and be zeroed beforehand;
//it is returned to the all-zero state on exit.
void GlobalBarrier( __global volatile uint* syncSurface )
{
    //Make sure each WKG item hit the barrier.
    barrier( CLK_GLOBAL_MEM_FENCE );

    //Now first thread of each wkg writes to designated place on SyncSurface
    if ( get_local_id( 0 ) == 0 )
    {
        syncSurface[ get_group_id( 0 ) ] = 1;
    }
    //Higher wkg ids tend to not have work to do in all cases, therefore I choose last wkg to wait for the others, as it is most likely it will hit this code sooner.
    if( get_group_id( 0 ) == ( get_num_groups( 0 ) - 1 ) )
    {
        //24 -48 case
        uint Value;
        do
        {
            Value = 1;
            //Work items stride over the flags; Value stays 1 only if every flag seen is 1.
            for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
            {
                Value = Value & syncSurface[ i ];
            }

        }
        while( Value == 0 );
        //All flags observed set by every work item of the last wkg; safe to reset them.
        barrier( CLK_GLOBAL_MEM_FENCE );

        for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
        {
            syncSurface[ i ] = 0;
        }
    }

    //All other workgroups: wait until the last wkg clears our flag (barrier release).
    if( get_local_id( 0 ) == 0 )
    {
        while( syncSurface[ get_group_id( 0 ) ] != 0 );
    }
    barrier( CLK_GLOBAL_MEM_FENCE );
}
2719
//Same cross-workgroup software barrier as GlobalBarrier, but while all groups are
//known to be parked inside the barrier the last workgroup additionally snapshots
//m_CurrentIDToffset into m_IDTAfterFirstPhase. Doing the copy between the two
//release fences guarantees every workgroup observes the snapshot after the barrier.
//syncSurface must hold at least get_num_groups(0) zeroed uints; it is zeroed again on exit.
void GlobalBarrierUpdateQueue( __global volatile uint* syncSurface, __global IGIL_CommandQueue* pQueue )
{
    //Make sure each WKG item hit the barrier.
    barrier( CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
    //Now first thread of each wkg writes to designated place on SyncSurface
    if ( get_local_id(0) == 0 )
    {
        syncSurface[get_group_id(0)] = 1;
    }
    //Higher wkg ids tend to not have work to do in all cases, therefore I choose last wkg to wait for the others, as it is most likely it will hit this code sooner.
    if( get_group_id(0) == ( get_num_groups( 0 ) - 1 ) )
    {
        uint Value;
        do
        {
            Value = 1;
            //Work items stride over the flags; Value stays 1 only if every flag seen is 1.
            for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
            {
                Value = Value & syncSurface[ i ];
            }
        }
        while( Value == 0 );
        barrier( CLK_GLOBAL_MEM_FENCE );
        //Single-writer update performed while every other wkg is still spinning on its flag.
        //NOTE(review): executed by all work items of the last wkg writing the same value — presumably benign; confirm.
        pQueue->m_controls.m_IDTAfterFirstPhase = pQueue->m_controls.m_CurrentIDToffset;

        barrier( CLK_GLOBAL_MEM_FENCE );

        //Release the barrier: clear every group's flag.
        for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
        {
            syncSurface[ i ] = 0;
        }
    }

    //All other workgroups: wait until the last wkg clears our flag.
    if( get_local_id(0) == 0 )
    {
        while( syncSurface[ get_group_id(0) ] != 0 );
    }
    barrier( CLK_GLOBAL_MEM_FENCE );
}
2759
2760
2761#ifdef SCHEDULER_EMULATION
2762__local int IDTOffset;
2763__local int DSHOffset;
2764__local int SLBOffset;
2765__local int StackOffset;
2766__local int QStorageOffset;
2767__local int MarkerOffset;
2768__local int BTSoffset;
2769__local IGIL_WalkerEnumeration WalkerEnum;
2770__local uint ObjectIDS[ MAX_GLOBAL_ARGS ];
2771#endif
2772
2773#define WA_INT_DESC_MAX 62
2774__kernel __attribute__((intel_reqd_sub_group_size(PARALLEL_SCHEDULER_COMPILATION_SIZE_20)))
2775void SchedulerParallel20(
2776    __global IGIL_CommandQueue* pQueue,
2777    __global uint* commandsStack,
2778    __global IGIL_EventPool*    eventsPool,
2779    __global uint* secondaryBatchBuffer,        //SLB that will be used to put commands in.
2780    __global char* dsh,                         //Pointer to the start of Dynamic State Heap
2781    __global IGIL_KernelDataHeader* kernelData, //This is kernel reflection surface
2782    __global volatile uint* queueStorageBuffer,
2783    __global char* ssh,                         //Pointer to Surface state heap with BT and SS
2784    __global DebugDataBuffer* debugQueue )
2785{
2786    EMULATION_ENTER_FUNCTION( );
2787#ifdef WA_DISABLE_SCHEDULERS
2788    return;
2789#endif
2790
2791#ifdef DEBUG
2792    //Early return enabled when m_SchedulerEarlyReturn is > 0,
2793    if( pQueue->m_controls.m_SchedulerEarlyReturn > 0 )
2794    {
2795        if( pQueue->m_controls.m_SchedulerEarlyReturn == 1 )
2796        {
2797            return;
2798        }
2799
2800        if( get_global_id( 0 ) == 0 )
2801        {
2802            pQueue->m_controls.m_SchedulerEarlyReturnCounter++;
2803        }
2804        GlobalBarrier( queueStorageBuffer );
2805
2806        if( pQueue->m_controls.m_SchedulerEarlyReturnCounter == pQueue->m_controls.m_SchedulerEarlyReturn )
2807        {
2808            if( ( ( get_group_id( 0 ) == 1) == ( get_num_groups( 0 ) > 1 )  ) & ( get_local_id( 0 ) == 0 ) )
2809            {
2810#ifdef ENABLE_DEBUG_BUFFER
2811                //Set START time of current (last) scheduler
2812                if( ( pQueue->m_controls.m_IsProfilingEnabled != 0 ) & ( DebugQueue != 0 ) & ( DebugQueue->m_flags == DDB_SCHEDULER_PROFILING ) )
2813                {
2814                    *( ( __global ulong * ) ( &DebugQueue->m_data[ atomic_add( &DebugQueue->m_offset, 2 ) ] ) ) = EventsPool->m_CLcompleteTimestamp;
2815                }
2816#endif
2817                pQueue->m_controls.Temporary[ 2 ]++;
2818
2819                //SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
2820                __private uint DwordOffset = ( pQueue->m_controls.m_SecondLevelBatchOffset % MAX_SLB_OFFSET ) / DWORD_SIZE_IN_BYTES;
2821                //BB_START 1st DWORD
2822                secondaryBatchBuffer[ DwordOffset ] = OCLRT_BATCH_BUFFER_BEGIN_CMD_DWORD0;
2823                DwordOffset++;
2824                //BB_START 2nd DWORD - Address, 3rd DWORD Address high
2825                secondaryBatchBuffer[ DwordOffset++ ] = (uint)(pQueue->m_controls.m_CleanupSectionAddress & 0xFFFFFFFF);
2826                secondaryBatchBuffer[ DwordOffset ] = (uint)((pQueue->m_controls.m_CleanupSectionAddress >> 32) & 0xFFFFFFFF);
2827            }
2828            return;
2829        }
2830    }
2831#endif
2832
2833    //First check if there are any new command packets on queue_t
2834    __global IGIL_CommandHeader* pCommand = 0;
2835    uint GroupID                          = get_group_id( 0 );
2836
2837#ifndef SCHEDULER_EMULATION
2838    __local int IDTOffset;
2839    __local int DSHOffset;
2840    __local int SLBOffset;
2841    __local int StackOffset;
2842    __local int QStorageOffset;
2843    __local int MarkerOffset;
2844    __local int BTSoffset;
2845    __local IGIL_WalkerEnumeration WalkerEnum;
2846    __local uint ObjectIDS[ MAX_GLOBAL_ARGS ];
2847
2848#endif
2849
2850    if( pQueue->m_controls.m_LastScheduleEventNumber > 0 )
2851    {
2852        //Check if there are any events that needs updating, each wkg uses all hw threads in wkg to update events
2853        if( GroupID * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 < pQueue->m_controls.m_LastScheduleEventNumber )
2854        {
2855            clk_event_t EventID;
2856            if( get_local_id( 0 ) % PARALLEL_SCHEDULER_COMPILATION_SIZE_20 == 0 )
2857            {
2858                uint ID = ( GroupID * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 ) + ( get_local_id( 0 ) / PARALLEL_SCHEDULER_COMPILATION_SIZE_20 );
2859                while( ID < pQueue->m_controls.m_LastScheduleEventNumber )
2860                {
2861                    EventID = __builtin_astype( ( void* )( ( ulong )pQueue->m_controls.m_EventDependencies[ ID ] ), clk_event_t );
2862                    UpdateEventsTreeStatusParallel( EventID, eventsPool, ( pQueue->m_controls.m_IsProfilingEnabled != 0 ) );
2863                    ID      += get_num_groups( 0 ) * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20;
2864                }
2865            }
2866        }
2867        GlobalBarrier( queueStorageBuffer );
2868    }
2869    //Queue parsing section
2870    uint NumberOfEnqueues = pQueue->m_controls.m_TotalNumberOfQueues - pQueue->m_controls.m_PreviousNumberOfQueues;
2871    if( NumberOfEnqueues > 0 )
2872    {
2873        uint InitialOffset              = pQueue->m_controls.m_PreviousHead;
2874        bool PacketScheduled            = true;
2875        uint offset                     = 0;
2876
2877        for( uint CurrentPacket = GroupID; CurrentPacket < NumberOfEnqueues; CurrentPacket += get_num_groups( 0 ) )
2878        {
2879            if( CurrentPacket == GroupID )
2880            {
2881                offset                  = TEMP_IGIL_GetNthCommandHeaderOffset( pQueue, InitialOffset, CurrentPacket );
2882            }
2883            else
2884            {
2885                offset                  = TEMP_IGIL_GetNthCommandHeaderOffset( pQueue, offset, get_num_groups( 0 ) );
2886            }
2887            pCommand                    = TEMP_IGIL_GetCommandHeader( pQueue, offset );
2888
2889            //Initialize command packet with proper lws
2890            if( get_local_id( 0 ) == 0 )
2891            {
2892                InitializeCommandPacket( pCommand );
2893            }
2894
2895            //Can I run this command ?
2896            if( CheckEventStatus( pCommand, eventsPool ) == 0 )
2897            {
2898                //Is it marker command ?
2899                if( pCommand->m_kernelId != IGIL_KERNEL_ID_ENQUEUE_MARKER )
2900                {
2901                    //Is there enough IDT space for me ?
2902                    if( get_local_id( 0 ) == 0 )
2903                    {
2904                        int WalkerNeeded = GetWalkerCount( pCommand );
2905                        //Optimization - check if IDT has free space for me
2906                        if( pQueue->m_controls.m_CurrentIDToffset + WalkerNeeded  <= WA_INT_DESC_MAX )
2907                        {
2908                            uint Temp = atomic_add( &pQueue->m_controls.m_CurrentIDToffset, WalkerNeeded );
2909                            if( Temp + WalkerNeeded <= WA_INT_DESC_MAX )
2910                            {
2911                                IDTOffset = Temp;
2912                                DSHOffset = atomic_add( &pQueue->m_controls.m_CurrentDSHoffset, ( MAX_DSH_SIZE_PER_ENQUEUE * WalkerNeeded ) );
2913                                SLBOffset = ( ( atomic_add( &pQueue->m_controls.m_SecondLevelBatchOffset, ( SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE * WalkerNeeded ) ) ) % MAX_SLB_OFFSET );
2914                                BTSoffset = atomic_add( &pQueue->m_controls.m_CurrentSSHoffset, pQueue->m_controls.m_BTmaxSize );
2915                            }
2916                            else
2917                            {
2918                                IDTOffset = -1;
2919                            }
2920                        }
2921                        else
2922                        {
2923                            IDTOffset = -1;
2924                        }
2925                    }
2926                    //Now barrier and check if we can go with scheduling
2927                    barrier( CLK_LOCAL_MEM_FENCE );
2928
2929                    if( IDTOffset != -1 )
2930                    {
2931                        //This packet is all set, schedule it and we are done with it.
2932                        //Patch DSH has media curbe load and patch gpgpu walker inside
2933                        PatchDSHParallelWithDynamicDSH20( SLBOffset,
2934                                                          DSHOffset,
2935                                                          IDTOffset,
2936                                                          kernelData,
2937                                                          dsh,
2938                                                          pCommand->m_kernelId,
2939                                                          pCommand,
2940                                                          secondaryBatchBuffer,
2941                                                          pQueue,
2942                                                          eventsPool,
2943                                                          ssh,
2944                                                          BTSoffset,
2945                                                          &WalkerEnum,
2946                                                          ObjectIDS
2947#ifdef ENABLE_DEBUG_BUFFER
2948                                                          , DebugQueue
2949#endif
2950                                                          );
2951                        PacketScheduled = true;
2952                    }
2953                    else
2954                    {
2955                        PacketScheduled = false;
2956                    }
2957                }
2958                else //For marker we need to update returned event status
2959                {
2960                    //Check if there is space to track event
2961                    if( get_local_id( 0 ) == 0 )
2962                    {
2963                        uint Temp = atomic_inc( &pQueue->m_controls.m_EnqueueMarkerScheduled );
2964                        if( Temp < MAX_NUMBER_OF_ENQUEUE_MARKER )
2965                        {
2966                            MarkerOffset = Temp;
2967                        }
2968                        else
2969                        {
2970                            MarkerOffset = -1;
2971                        }
2972                    }
2973                    barrier( CLK_LOCAL_MEM_FENCE );
2974                    if( MarkerOffset != -1 )
2975                    {
2976                        PacketScheduled = true;
2977                    }
2978                    else
2979                    {
2980                        PacketScheduled = false;
2981                    }
2982                }
2983                //Update event dependencies if any, if there are event waiting for status change, put them on the list.
2984                if( PacketScheduled == true )
2985                {
2986                    if( get_local_id( 0 ) == 0 )
2987                    {
2988                        if( ( uint )pCommand->m_event != IGIL_EVENT_INVALID_HANDLE )
2989                        {
2990                           pQueue->m_controls.m_EventDependencies[ atomic_inc( &pQueue->m_controls.m_CurrentScheduleEventNumber ) ] = pCommand->m_event;
2991                        }
2992                        //Remove event dependencies setting.
2993                        if( pCommand->m_numDependencies > 0 )
2994                        {
2995                            DecreaseEventDependenciesParallel( pCommand, eventsPool );
2996                        }
2997                    }
2998                }
2999            }
3000            //Can't schedule it right now, move to storage.
3001            else
3002            {
3003                if( pQueue->m_controls.m_IsSimulation )
3004                {
3005                    barrier( CLK_LOCAL_MEM_FENCE );
3006                }
3007                PacketScheduled = false;
3008            }
3009
3010            //Allocation failure, move command to stack storage and update stack pointers
3011            if( PacketScheduled == false )
3012            {
3013                if( get_local_id( 0 ) == 0 )
3014                {
3015                    StackOffset                  = atomic_dec( &pQueue->m_controls.m_StackTop ) - 1;
3016                    QStorageOffset               = atomic_sub( &pQueue->m_controls.m_QstorageTop, pCommand->m_commandSize ) - pCommand->m_commandSize;
3017                    commandsStack[ StackOffset ] = QStorageOffset;
3018                }
3019                barrier( CLK_LOCAL_MEM_FENCE );
3020                __global char* ptrQueue = ( __global char* )pQueue;
3021                GLOBAL_MEMCPY( ( __global void* )&queueStorageBuffer[ QStorageOffset / 4 ], (__global void * )&ptrQueue[ offset ] , pCommand->m_commandSize );
3022            }
3023            else if( pQueue->m_controls.m_IsSimulation )
3024            {
3025                barrier( CLK_LOCAL_MEM_FENCE );
3026            }
3027        }
3028
3029        //In case there were new enqueues on queue_t, all work items must hit the global barrier before they can start taking items from the stack.
3030        GlobalBarrierUpdateQueue( queueStorageBuffer, pQueue );
3031    }
3032
3033    //Check stack only when there are free IDTS
3034    if( ( pQueue->m_controls.m_IDTAfterFirstPhase < WA_INT_DESC_MAX ) &
3035        ( pQueue->m_controls.m_PreviousStackTop != pQueue->m_controls.m_StackSize ) )
3036    {
3037        //Start stack browsing
3038        uint MyID            = get_group_id( 0 );
3039        //Start browsing from the begining of the previous stack top
3040        uint CurrentOffset   = pQueue->m_controls.m_PreviousStackTop + MyID;
3041        uint CommandOffset   = 0;
3042
3043        while( CurrentOffset < pQueue->m_controls.m_StackSize )
3044        {
3045            CommandOffset = commandsStack[ CurrentOffset ];
3046
3047            if( CommandOffset != 0 )
3048            {
3049                pCommand = GetCommandHeaderFromStorage( ( __global uint* )queueStorageBuffer, CommandOffset );
3050
3051                //Can I run this command ?
3052                if( CheckEventStatus( pCommand, eventsPool ) == 0 )
3053                {
3054                    //Is it marker command ?
3055                    if( pCommand->m_kernelId != IGIL_KERNEL_ID_ENQUEUE_MARKER )
3056                    {
3057                        //Is there enough IDT space for me ?
3058                        if( get_local_id( 0 ) == 0 )
3059                        {
3060                            int WalkerNeeded = GetWalkerCount( pCommand );
3061                            //Optimization - check if IDT has free space for me
3062                            if( pQueue->m_controls.m_CurrentIDToffset + WalkerNeeded  <= WA_INT_DESC_MAX )
3063                            {
3064                                uint Temp = atomic_add( &pQueue->m_controls.m_CurrentIDToffset, WalkerNeeded );
3065                                if( Temp + WalkerNeeded <= WA_INT_DESC_MAX )
3066                                {
3067                                    IDTOffset = Temp;
3068                                    DSHOffset = atomic_add( &pQueue->m_controls.m_CurrentDSHoffset, ( MAX_DSH_SIZE_PER_ENQUEUE * WalkerNeeded ) );
3069                                    SLBOffset = ( ( atomic_add( &pQueue->m_controls.m_SecondLevelBatchOffset, ( SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE * WalkerNeeded ) ) ) % MAX_SLB_OFFSET );
3070                                    BTSoffset = atomic_add( &pQueue->m_controls.m_CurrentSSHoffset, pQueue->m_controls.m_BTmaxSize );
3071                                }
3072                                else
3073                                {
3074                                    IDTOffset = -1;
3075                                }
3076                            }
3077                            else
3078                            {
3079                                 IDTOffset = -1;
3080                            }
3081                        }
3082                        //Now barrier and check if we can go with scheduling
3083                        barrier( CLK_LOCAL_MEM_FENCE );
3084                        if( IDTOffset != -1 )
3085                        {
3086                            //This packet is all set, schedule it and we are done with it.
3087                            //Patch DSH has media curbe load and patch gpgpu walker inside
3088                            PatchDSHParallelWithDynamicDSH20( SLBOffset,
3089                                                              DSHOffset,
3090                                                              IDTOffset,
3091                                                              kernelData,
3092                                                              dsh,
3093                                                              pCommand->m_kernelId,
3094                                                              pCommand,
3095                                                              secondaryBatchBuffer,
3096                                                              pQueue,
3097                                                              eventsPool,
3098                                                              ssh,
3099                                                              BTSoffset,
3100                                                              &WalkerEnum,
3101                                                              ObjectIDS
3102#ifdef ENABLE_DEBUG_BUFFER
3103                                                              , DebugQueue
3104#endif
3105                                                              );
3106                            pCommand->m_commandState = CAN_BE_RECLAIMED;
3107                            //Reset stack offset
3108                            commandsStack[ CurrentOffset ] = 0;
3109
3110                            //Update event status
3111                            if( get_local_id( 0 ) == 0 )
3112                            {
3113                                //Add events dependant on this command to list of events neeeded to be updated.
3114                                if( ( uint )pCommand->m_event != IGIL_EVENT_INVALID_HANDLE )
3115                                {
3116                                    pQueue->m_controls.m_EventDependencies[ atomic_inc( &pQueue->m_controls.m_CurrentScheduleEventNumber ) ] = pCommand->m_event;
3117                                }
3118
3119                                //Remove event dependencies setting.
3120                                if( pCommand->m_numDependencies > 0 )
3121                                {
3122                                    DecreaseEventDependenciesParallel( pCommand, eventsPool );
3123                                }
3124                            }
3125                        }
3126                    }
3127                    else // For marker we need to update returned event status
3128                    {
3129                        barrier( CLK_GLOBAL_MEM_FENCE );
3130                        //Check if there is space to track event
3131                        if( get_local_id( 0 ) == 0 )
3132                        {
3133                            uint Temp = atomic_inc( &pQueue->m_controls.m_EnqueueMarkerScheduled );
3134                            if( Temp < MAX_NUMBER_OF_ENQUEUE_MARKER )
3135                            {
3136                                pCommand->m_commandState = CAN_BE_RECLAIMED;
3137                                commandsStack[ CurrentOffset ] = 0;
3138                                //Add events dependant on this command to list of events neeeded to be updated.
3139                                if( ( uint )pCommand->m_event != IGIL_EVENT_INVALID_HANDLE )
3140                                {
3141                                    pQueue->m_controls.m_EventDependencies[ atomic_inc( &pQueue->m_controls.m_CurrentScheduleEventNumber ) ] = pCommand->m_event;
3142                                }
3143
3144                                //Remove event dependencies setting.
3145                                if( pCommand->m_numDependencies > 0 )
3146                                {
3147                                    DecreaseEventDependenciesParallel( pCommand, eventsPool );
3148                                }
3149                            }
3150                        }
3151                    }
3152                }
3153            }
3154            CurrentOffset += get_num_groups( 0 );
3155            if( pQueue->m_controls.m_IsSimulation )
3156            {
3157                barrier( CLK_LOCAL_MEM_FENCE );
3158            }
3159        }
3160    }
3161
3162    //Finish execution and check end conditons
3163    //Execute this global barrier only when needed, i.e. stack browsing was executed or new item were added on the stack
3164    if( ( pQueue->m_controls.m_PreviousStackTop != pQueue->m_controls.m_StackSize ) |
3165        ( pQueue->m_controls.m_StackTop != pQueue->m_controls.m_PreviousStackTop ) )
3166    {
3167        GlobalBarrier( queueStorageBuffer );
3168    }
3169
3170    //Cleanup & resource reclamation section
3171    //We are after global sync section, we can do anything to globals right now.
3172    if( ( get_local_id( 0 ) == 0 ) & ( get_group_id( 0 ) == 0 ) )
3173    {
3174        {
3175            pQueue->m_controls.m_CurrentDSHoffset           = pQueue->m_controls.m_DynamicHeapStart;
3176            pQueue->m_controls.m_IDTAfterFirstPhase         = 1;
3177            pQueue->m_controls.m_PreviousNumberOfQueues     = pQueue->m_controls.m_TotalNumberOfQueues;
3178            pQueue->m_controls.m_LastScheduleEventNumber    = pQueue->m_controls.m_CurrentScheduleEventNumber;
3179            pQueue->m_controls.m_CurrentScheduleEventNumber = 0;
3180        }
3181    }
3182
3183    //Schedule scheduler
3184    if( ( (get_group_id( 0 ) == 1) == (get_num_groups( 0 ) > 1)  ) & ( get_local_id(0) == 0 ) )
3185    {
3186        pQueue->m_controls.m_SecondLevelBatchOffset = ( pQueue->m_controls.m_SecondLevelBatchOffset % MAX_SLB_OFFSET );
3187
3188        //If we scheduled any blocks, put scheduler right after
3189        if( ( pQueue->m_controls.m_CurrentIDToffset > 1 ) | ( pQueue->m_controls.m_EnqueueMarkerScheduled > 0 ) )
3190        {
3191            AddCmdsInSLBforScheduler20Parallel( pQueue->m_controls.m_SecondLevelBatchOffset,
3192                                                pQueue,
3193                                                secondaryBatchBuffer,
3194                                                dsh );
3195
3196            //If we have profiling enabled, we need CL_COMPLETE time, which is before next scheduler starts
3197            if( pQueue->m_controls.m_IsProfilingEnabled != 0 )
3198            {
3199                ulong Address  = ( ulong ) &( eventsPool->m_CLcompleteTimestamp );
3200                //Emit pipecontrol with timestamp write
3201                PatchPipeControlProfilingAddres( secondaryBatchBuffer,
3202                                                 pQueue->m_controls.m_SecondLevelBatchOffset,
3203                                                 Address,
3204                                                 PIPE_CONTROL_FOR_TIMESTAMP_START_OFFSET );
3205                //Bit after scheduler may be set by some other command, reset it to 0
3206                DisablePostSyncBitInPipeControl( secondaryBatchBuffer,
3207                                                 pQueue->m_controls.m_SecondLevelBatchOffset,
3208                                                 PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET_TO_PATCH );
3209#ifdef ENABLE_DEBUG_BUFFER
3210                if( ( DebugQueue != 0 ) & ( DebugQueue->m_flags == DDB_SCHEDULER_PROFILING ) )
3211                {
3212                    //Store Current scheduler START time
3213                    *((__global ulong * ) (&DebugQueue->m_data[ atomic_add( &DebugQueue->m_offset, 2 ) ] )) = EventsPool->m_CLcompleteTimestamp;
3214
3215                    //Set address to store next scheduler's END time
                    //Reserve two qwords in the debug queue (atomic_add on m_offset) and patch
                    //the trailing PIPE_CONTROL so its END timestamp lands in that slot.
                    PatchPipeControlProfilingAddres( secondaryBatchBuffer,
                                                        pQueue->m_controls.m_SecondLevelBatchOffset,
                                                        ( ulong )(&DebugQueue->m_data[ atomic_add( &DebugQueue->m_offset, 2 ) ]),
                                                        PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
                }
#endif
            }

            //Program pipe controls around scheduler to make sure it is not executed concurently to blocks
            else
            {
                //Locate previous pipe control
                int PreviousOffset = pQueue->m_controls.m_SecondLevelBatchOffset - SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE;
                //If offset is negative it means we are first command after chaining
                if( PreviousOffset < 0 )
                {
                    //Wrap to the end of the circular second-level batch buffer.
                    PreviousOffset += MAX_SLB_OFFSET;
                }
                //Tighten previous pipecontrol
                //Offsets are kept in bytes; commands are written as dwords.
                uint DwordOffset = PreviousOffset / DWORD_SIZE_IN_BYTES;

                //CS-stall PIPE_CONTROL before this scheduler's commands.
                PutCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );

                //Now put pipe control after scheduler
                DwordOffset = pQueue->m_controls.m_SecondLevelBatchOffset / DWORD_SIZE_IN_BYTES;

                PutCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
            }

            //Advance bookkeeping for the next pass: move the SLB write cursor,
            //reset the interface-descriptor cursor to 1 (slot 0 presumably reserved
            //for the scheduler itself - confirm against IDT setup), clear the
            //enqueue-marker flag and rewind the SSH cursor to the binding-table base.
            pQueue->m_controls.m_SecondLevelBatchOffset  += SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE;
            pQueue->m_controls.m_CurrentIDToffset        = 1;
            pQueue->m_controls.m_EnqueueMarkerScheduled  = 0;
            //NOTE(review): Temporary[1] appears to be a debug counter of passes
            //that scheduled work - verify against the struct's declaration.
            pQueue->m_controls.Temporary[1]++;
            pQueue->m_controls.m_CurrentSSHoffset        = pQueue->m_controls.m_BTbaseOffset;
        }
        //Nothing to schedule, return to the host
        else
        {
#ifdef ENABLE_DEBUG_BUFFER
            //Set START time of current (last) scheduler
            if( ( pQueue->m_controls.m_IsProfilingEnabled != 0 ) & ( DebugQueue != 0 ) & ( DebugQueue->m_flags == DDB_SCHEDULER_PROFILING ) )
            {
                //Store the completion timestamp into a freshly reserved 2-qword slot.
                *((__global ulong * ) (&DebugQueue->m_data[ atomic_add( &DebugQueue->m_offset, 2 ) ] )) = EventsPool->m_CLcompleteTimestamp;
            }
#endif
            //NOTE(review): Temporary[2] appears to count passes that found no work.
            pQueue->m_controls.Temporary[2]++;
            //Record where the SLB ends so the host/cleanup path knows the last offset.
            pQueue->m_controls.m_SLBENDoffsetInBytes = ( int ) pQueue->m_controls.m_SecondLevelBatchOffset;
            //SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
            __private uint DwordOffset = pQueue->m_controls.m_SecondLevelBatchOffset / DWORD_SIZE_IN_BYTES;
            //BB_START 1st DWORD
            secondaryBatchBuffer[ DwordOffset ] = OCLRT_BATCH_BUFFER_BEGIN_CMD_DWORD0;
            DwordOffset++;
            //BB_START 2nd DWORD - Address, 3rd DWORD Address high
            //Chain execution to the cleanup section (64-bit address split low/high).
            secondaryBatchBuffer[ DwordOffset++ ] = (uint)( pQueue->m_controls.m_CleanupSectionAddress & 0xFFFFFFFF );
            secondaryBatchBuffer[ DwordOffset++ ] = (uint)( ( pQueue->m_controls.m_CleanupSectionAddress >> 32 ) & 0xFFFFFFFF );
        }
    }
3273
    //Parallel stack compaction
    //Scan the command stack forward from the current top for the first live (non-zero)
    //entry and atomically lower both stack-top markers to it.
    //NOTE(review): the selector `(group_id==2) == (num_groups>2)` picks exactly group 2
    //when more than 2 groups were dispatched, but inverts to the OTHER group(s) when
    //2 or fewer exist - presumably intentional load spreading; confirm against the
    //scheduler's dispatch size. Only local id 0 of the selected group(s) runs this.
    if( ( ( get_group_id( 0 ) == 2 ) == ( get_num_groups( 0 ) > 2 )  ) & ( get_local_id( 0 ) == 0 ) )
    {
        //get_local_id( 0 ) is always 0 inside this branch, so Current starts at the top.
        uint Current = pQueue->m_controls.m_StackTop + get_local_id( 0 );
        uint StackSize = pQueue->m_controls.m_StackSize;
        uint Found = 0;

        //Step by the work-group size until a live entry is found or the stack ends.
        while( ( Current < StackSize ) && ( Found == 0 ) )
        {
            __global uint * pCmdStackBlock = (__global uint *)( commandsStack + Current );
            //We have found an element
            if( *pCmdStackBlock != 0 )
            {
                Found = 1;
            }
            else
            {
                Current += get_local_size( 0 );
            }
        }

        if ( Found == 1 )
        {
            //atomic_min: another concurrent scanner may have found an earlier live entry.
            atomic_min( &pQueue->m_controls.m_StackTop, Current );
            atomic_min( &pQueue->m_controls.m_PreviousStackTop, Current );
        }
    }
3301
    //Qstorage compaction
    //Reclaim finished command headers from the top of queue storage, strictly in order.
    //Same group-selector idiom as the stack compaction above (here keyed on group 3).
    if( ( ( get_group_id( 0 ) == 3 ) == ( get_num_groups( 0 ) > 3 )  ) & ( get_local_id( 0 ) == 0 ) )
    {
        uint ReclaimFurhter = 1;
        //Pop headers while they are marked CAN_BE_RECLAIMED; stop at the first header
        //still in use - storage cannot be reclaimed past a live command.
        while( ( pQueue->m_controls.m_QstorageTop < pQueue->m_controls.m_QstorageSize ) & ( ReclaimFurhter == 1 ) )
        {
            pCommand = GetCommandHeaderFromStorage( ( __global uint* ) queueStorageBuffer, pQueue->m_controls.m_QstorageTop );
            if( pCommand->m_commandState == CAN_BE_RECLAIMED )
            {
                //Headers are variable-sized; advance by this command's own size.
                pQueue->m_controls.m_QstorageTop += pCommand->m_commandSize;
            }
            else
            {
                ReclaimFurhter = 0;
            }
        }

        pQueue->m_controls.m_PreviousStorageTop      = pQueue->m_controls.m_QstorageTop;

#ifndef DISABLE_RESOURCE_RECLAMATION
        //Reclaim space on queue_t, do this only if there is enough space
        //1 KB is used for global barrier, make sure this space will never be used.
        //NOTE(review): if m_QstorageTop is unsigned and ever below 1024 this
        //subtraction wraps and the test passes spuriously - presumably the top
        //can never be that low once the queue is initialized; confirm.
        if( pQueue->m_controls.m_QstorageTop - 1024 > pQueue->m_size )
        {
            //In this case we can take full queue_t next time we enter scheduler, so reclaim full space on queue_t
            pQueue->m_head = IGIL_DEVICE_QUEUE_HEAD_INIT;
        }
#endif
        pQueue->m_controls.m_PreviousHead           = pQueue->m_head;
    }
3332}
3333