/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#ifndef SCHEDULER_EMULATION
#include "device_enqueue.h"
#endif

// float passed as int
extern float __intel__getProfilingTimerResolution();

//No-op unless an emulation build supplies its own definition.
#ifndef EMULATION_ENTER_FUNCTION
#define EMULATION_ENTER_FUNCTION( )
#endif

#ifndef NULL
#define NULL 0
#endif

//Encodings of the GPGPU_WALKER SIMD-size field ( used by patchGpGpuWalker ).
#define SIMD8 0
#define SIMD16 1
#define SIMD32 2

#define SHORT_SIZE_IN_BYTES 2
#define DWORD_SIZE_IN_BYTES 4
#define QWORD_SIZE_IN_BYTES 8

#define MAX_GLOBAL_ARGS 255

#define MASK_LOW16_BITS 0xFFFF
//Currently setting to 8
#define MAX_WALKERS_IN_PARALELL PARALLEL_SCHEDULER_HW_GROUPS
//Need 4 uints per walker ( command packet offset, slb offset, dsh offset, idt offset ), + 1 to store total
#define PARALLEL_SCHEDULER_OFFSETS_NUMBER 4
#define PARALLEL_SCHEDULER_LOCAL_MEM_SIZE ( MAX_WALKERS_IN_PARALELL * PARALLEL_SCHEDULER_OFFSETS_NUMBER + 1 )
//Last index
#define TOTAL_ENQUEUES_FOUND ( PARALLEL_SCHEDULER_LOCAL_MEM_SIZE - 1 )

//CURBE STUFF, only entries that really needs to be patched
#define SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT 1
#define SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE 2
#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE 3
#define SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS 4
#define SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS 5
#define SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES 8
#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET 16
#define SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS 17
#define SCHEDULER_DATA_PARAMETER_PARENT_EVENT 22
#define SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE 28

//Image tokens are biased by SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT ( defined in the scheduler headers ).
#define SCHEDULER_DATA_PARAMETER_IMAGE_WIDTH ( 9 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
#define SCHEDULER_DATA_PARAMETER_IMAGE_HEIGHT ( 10 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
#define SCHEDULER_DATA_PARAMETER_IMAGE_DEPTH ( 11 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
#define SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE ( 12 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
#define SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_ORDER ( 13 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
#define SCHEDULER_DATA_PARAMETER_IMAGE_ARRAY_SIZE ( 18 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
#define SCHEDULER_DATA_PARAMETER_IMAGE_NUM_SAMPLES ( 20 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
#define SCHEDULER_DATA_PARAMETER_IMAGE_NUM_MIP_LEVELS ( 27 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
#define SCHEDULER_DATA_PARAMETER_IMAGE_OBJECT_ID ( 35 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
#define SCHEDULER_DATA_PARAMETER_IMAGE_SRGB_CHANNEL_ORDER ( 39 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )

//Sampler tokens are biased by SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE ( defined in the scheduler headers ).
#define DATA_PARAMETER_SAMPLER_ADDRESS_MODE ( 14 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )
#define DATA_PARAMETER_SAMPLER_NORMALIZED_COORDS ( 15 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )
#define DATA_PARAMETER_SAMPLER_COORDINATE_SNAP_WA_REQUIRED ( 21 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )
#define SCHEDULER_DATA_PARAMETER_SAMPLER_OBJECT_ID ( 35 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )

//CURBE STUFF, only entries that really needs to be patched
//Single-bit masks: bit position matches the token number above.
#define SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT_MASK ( 1 << 1 )
#define SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE_MASK ( 1 << 2 )
#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE_MASK ( 1 << 3 )
#define SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS_MASK ( 1 << 4 )
#define SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS_MASK ( 1 << 5 )
#define SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES_MASK ( 1 << 8 )
#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET_MASK ( 1 << 16 )
#define SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS_MASK ( 1 << 17 )
#define SCHEDULER_DATA_PARAMETER_PARENT_EVENT_MASK ( 1 << 22 )
#define SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE_MASK ( 1 << 28 )
//High masks need a 64-bit constant: the shift amounts can be >= 32.
#define SCHEDULER_DATA_PARAMETER_IMAGE_CURBE_ENTRIES ( ( ulong ) 1 << SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
#define SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER ( ( ( ulong ) 1 ) << SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER_SHIFT )


#define SCHEDULER_DATA_PARAMETER_SAMPLER_MASK ( ( ( ulong ) 1 ) << SCHEDULER_DATA_PARAMETER_SAMPLER_SHIFT )
//Error codes
#define SCHEDULER_CURBE_TOKEN_MISSED 10
#define SCHEDULER_CURBE_ARGUMENTS_SIZE_MISMATCH 11

//Marker value meaning a queue-storage slot may be reclaimed.
#define CAN_BE_RECLAIMED 123456

//Identifies which of the two MEDIA_STATE_FLUSH commands in an enqueue slot is patched.
#define SCHEDULER_MSF_INITIAL 1
#define SCHEDULER_MSF_SECOND 2

//Uncomment to enable logging debug data
//#define ENABLE_DEBUG_BUFFER 1

#ifdef ENABLE_DEBUG_BUFFER
//Update DebugDataInfo types in device_enqueue.h and PrintDebugDataBuffer() in cmd_queue_device.cpp

//Flags
#define DDB_HAS_DATA_INFO ( 1 << 0 )
#define DDB_SCHEDULER_PROFILING ( 1 << 1 )

#define DDB_ALL ( 0xffffffff )

#endif

//Turn this to 1 to turn on debug calls, notice that it will cause up to 10 x longer time to build scheduler
//#define SCHEDULER_DEBUG_MODE 1

//#define DISABLE_RESOURCE_RECLAMATION 1
/*
Resource reclamation procedure
1. Move all new command packets from queue_t to qstorage
2. In case there is place in storage for whole queue, reclaim space on queue
3. Construct stack basing on new commands added in the qstorage
4. Browse stack to find next item for execution
5. When you take item from the stack and schedule it , reclaim place on qstorage buffer
*/

//Walker parameters for one GPGPU_WALKER command carved out of an enqueue.
typedef struct
{
    uint3 ActualLocalSize;   //local size actually used by this walker
    uint3 WalkerDimSize;     //number of groups in each dimension
    uint3 WalkerStartPoint;  //starting group id in each dimension
} IGIL_WalkerData;

//Full decomposition of one enqueue into up to 8 walkers.
typedef struct
{
    uint3 LocalWorkSize;
    uint3 TotalDimSize;
    IGIL_WalkerData WalkerArray[ 8 ];
} IGIL_WalkerEnumeration;

//Overwrites the bit field [ startOffset .. endOffset ] ( inclusive ) of *patchedDword with 'value'.
//NOTE(review): 'value' is not masked to the field width - callers must pass a value that fits,
//otherwise bits above endOffset are corrupted.
//ALL_BITS_SET_DWORD_MASK and DWORD_SIZE_IN_BITS come from the scheduler headers.
inline void patchDword( __global uint* patchedDword, uint startOffset, uint endOffset, uint value )
{
    uint LeftMask = ALL_BITS_SET_DWORD_MASK >> ( DWORD_SIZE_IN_BITS - endOffset - 1 );
    uint RightMask = ALL_BITS_SET_DWORD_MASK << ( startOffset );
    uint CleanMask = ~( RightMask & LeftMask );
    *patchedDword &= CleanMask;
    *patchedDword |= ( value << startOffset );
}

//Returns the address-data entry of block 'blockId' inside the kernel reflection surface.
inline __global IGIL_KernelAddressData* IGIL_GetKernelAddressData( __global IGIL_KernelDataHeader* pKernelReflection, uint blockId )
{
    return ( __global IGIL_KernelAddressData* ) ( &pKernelReflection->m_data[ blockId ] );
}
//Resolves the kernel data of block 'blockId': m_KernelDataOffset is a byte offset
//relative to the start of the reflection header.
__global IGIL_KernelData* IGIL_GetKernelData( __global IGIL_KernelDataHeader* pKernelReflection, uint blockId )
{
    __global IGIL_KernelAddressData* pKernelAddressData = IGIL_GetKernelAddressData( pKernelReflection, blockId );
    uint Offset = pKernelAddressData->m_KernelDataOffset;
    __global char* pKernelReflectionRaw = ( __global char * ) pKernelReflection;
    return ( __global IGIL_KernelData* ) ( &pKernelReflectionRaw[ Offset ] );
}


//Interprets the queue bytes at byte offset 'offset' as a command packet header.
inline __global IGIL_CommandHeader* TEMP_IGIL_GetCommandHeader( __global IGIL_CommandQueue* q, uint offset )
{
    __global uchar *pQueueRaw = (__global uchar *) q;

    __global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + offset );

    return pCommand;
}
//Make sure enough command packets are in command queue before calling this function.
167__global IGIL_CommandHeader* TEMP_IGIL_GetNthCommandHeader( __global IGIL_CommandQueue* q, uint initialOffset, uint number ) 168{ 169 __global uchar *pQueueRaw = (__global uchar *) q; 170 171 __global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + initialOffset ); 172 uint Offset = initialOffset; 173 //Traverse queue_t unless nth command packet is found 174 while( number > 0 ) 175 { 176 Offset += pCommand->m_commandSize; 177 pCommand = TEMP_IGIL_GetCommandHeader( q, Offset ); 178 number--; 179 } 180 181 return pCommand; 182} 183 184//Make sure enough command packets are in command queue before calling this function. 185uint TEMP_IGIL_GetNthCommandHeaderOffset( __global IGIL_CommandQueue* q, uint initialOffset, uint number ) 186{ 187 __global uchar *pQueueRaw = (__global uchar *) q; 188 189 __global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + initialOffset ); 190 uint Offset = initialOffset; 191 //Traverse queue_t unless nth command packet is found 192 while( number > 0 ) 193 { 194 Offset += pCommand->m_commandSize; 195 pCommand = TEMP_IGIL_GetCommandHeader( q, Offset ); 196 number--; 197 } 198 199 return Offset; 200} 201 202__global IGIL_CommandHeader* GetCommandHeaderFromStorage( __global uint* queueStorage, uint offset ) 203{ 204 __global uchar *pQueueRaw = ( __global uchar * ) queueStorage; 205 206 __global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + offset ); 207 208 return pCommand; 209} 210 211inline queue_t TEMP_IGIL_GetQueueT( IGIL_CommandHeader * queuePtr ) 212{ 213 return __builtin_astype( queuePtr, queue_t ); 214} 215 216inline __global IGIL_DeviceEvent* TEMP_IGIL_GetDeviceEvents( __global IGIL_EventPool *pool ) 217{ 218 return ( __global IGIL_DeviceEvent * )( pool + 1 ); 219} 220 221inline __global IGIL_DeviceEvent* TEMP_IGIL_GetDeviceEvent( __global IGIL_EventPool *pool, uint eventId ) 222{ 223 __global IGIL_DeviceEvent * pEvent = ( __global 
IGIL_DeviceEvent * )( pool + 1 ); 224 225 return ( __global IGIL_DeviceEvent * )( pEvent + eventId ); 226} 227bool SetEventState( __global IGIL_EventPool *pool, uint eventId, int state ) 228{ 229 __global IGIL_DeviceEvent* pDeviceEvent = TEMP_IGIL_GetDeviceEvent( pool, eventId ); 230 pDeviceEvent->m_state = state; 231 return true; 232} 233 234void TEMP_IGIL_FreeEvent( clk_event_t event, __global IGIL_EventPool *pool ) 235{ 236 //Offset into the event data in the pool 237 __global IGIL_DeviceEvent *events = TEMP_IGIL_GetDeviceEvents( pool ); 238 239 atomic_xchg( &events[ ( uint )(size_t)__builtin_astype( event, void* ) ].m_state, IGIL_EVENT_UNUSED ); 240} 241 242void IGILLOCAL_MEMCPY_GTOG( __global void* pDst, __global void* pSrc, int numBytes ) 243{ 244 numBytes = numBytes >> 2; 245 for( int i = 0; i < numBytes; i++ ) 246 { 247 ( ( __global uint* )pDst ) [ i ] = ( ( __global int* )pSrc )[ i ]; 248 } 249} 250 251//Global memcpy running on all witems possible, make sure it's run from all hw threads. 252void GLOBAL_MEMCPYUINT( __global void* pDst, __global void* pSrc, int numBytes ) 253{ 254 uint total_local_size = get_local_size( 0 ); 255 uint LoopCtr = numBytes / ( total_local_size * DWORD_SIZE_IN_BYTES ); 256 uint LeftOver = numBytes % ( total_local_size * DWORD_SIZE_IN_BYTES ); 257 uint Lid = get_local_id( 0 ); 258 uint i = 0; 259 260 //Main copy 261 for( i = 0; i < LoopCtr; i++ ) 262 { 263 ( ( __global uint* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint* )pSrc )[ Lid + total_local_size * i ]; 264 } 265 //Copy what's left 266 if( LeftOver != 0 ) 267 { 268 if( Lid * DWORD_SIZE_IN_BYTES < LeftOver ) 269 { 270 ( ( __global uint* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint* )pSrc )[ Lid + total_local_size * i ]; 271 } 272 } 273} 274 275//In SIMD8 to fully use cachelines copy in portions of uint2 , 8 bytes x 8 witems = 64 bytes cacheline size. 276//Global memcpy running on all witems possible, make sure it's run from all hw threads. 
277void GLOBAL_MEMCPY( __global void* pDst, __global void* pSrc, int numBytes ) 278{ 279 //In case I need dword copy use uint version of this function. 280 if( ( numBytes % ( DWORD_SIZE_IN_BYTES * 2 ) ) != 0 ) 281 { 282 GLOBAL_MEMCPYUINT( pDst, pSrc, numBytes ); 283 } 284 else 285 { 286 uint total_local_size = get_local_size( 0 ); 287 uint LoopCtr = numBytes / ( total_local_size * DWORD_SIZE_IN_BYTES * 2 ); 288 uint LeftOver = numBytes % ( total_local_size * DWORD_SIZE_IN_BYTES * 2 ); 289 uint Lid = get_local_id( 0 ); 290 uint i = 0; 291 292 //Main copy 293 for( i = 0; i < LoopCtr; i++ ) 294 { 295 ( ( __global uint2* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint2* )pSrc )[ Lid + total_local_size * i ]; 296 } 297 //Copy what's left 298 if( LeftOver != 0 ) 299 { 300 if( Lid * DWORD_SIZE_IN_BYTES * 2 < LeftOver ) 301 { 302 ( ( __global uint2* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint2* )pSrc )[ Lid + total_local_size * i ]; 303 } 304 } 305 } 306} 307 308//This works only for 32 bit types 309uint GetNextPowerof2( uint number ) 310{ 311 number --; 312 number |= number >> 1; 313 number |= number >> 2; 314 number |= number >> 4; 315 number |= number >> 8; 316 number |= number >> 16; 317 number ++; 318 return number; 319} 320 321#define OCLRT_ALIGN( a, b ) ( ( ( ( a ) % ( b ) ) != 0 ) ? ( ( a ) - ( ( a ) % ( b ) ) + ( b ) ) : ( a ) ) 322#define OCLRT_MAX( a, b ) ( ( ( a ) > ( b ) ) ? 
( a ) : ( b ) ) 323 324#ifndef SCHEDULER_EMULATION 325#include "scheduler_definitions.h" 326#endif 327 328#ifdef ENABLE_DEBUG_BUFFER 329//Adds private uint* data to debug buffer 330// ddb - global buffer to keep data 331// src - source data 332// numberOfElements 333// localID - WI ID , when 0xffffffff all WI copy data 334// returns 0 when data was copied, else -1 335int AddToDebugBufferParallel( __global DebugDataBuffer* ddb, uint* pSrc, uint numberOfElements, uint localID ) 336{ 337 if( ddb->m_flags == 0 ) 338 { 339 //All work items in local group copies data 340 if( localID == 0xffffffff ) 341 { 342 //Check if there is enough place for new data in ddb for every workitem in local group 343 if( ( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) * get_local_size(0) ) >= numberOfElements ) 344 { 345 uint startIndex = atomic_add( &ddb->m_offset, numberOfElements ); 346 uint i; 347 int srcPos; 348 for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ ) 349 { 350 ddb->m_data[ i ] = pSrc[ srcPos ]; 351 } 352 return 0; 353 } 354 } 355 else 356 { 357 //Check if there is enough place for new data in ddb for one workitem in local group 358 if( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) >= numberOfElements ) 359 { 360 if( get_local_id(0) == localID ) 361 { 362 uint startIndex = atomic_add( &ddb->m_offset, numberOfElements ); 363 uint i; 364 int srcPos; 365 for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ ) 366 { 367 ddb->m_data[ i ] = pSrc[ srcPos ]; 368 } 369 return 0; 370 } 371 } 372 } 373 } 374 return -1; 375} 376 377//Adds private uint to debug buffer 378int AddToDBParallel(__global DebugDataBuffer* ddb, uint pSrc, uint localID) 379{ 380 return AddToDebugBufferParallel(ddb, &pSrc, 1, localID); 381} 382 383//Adds global uint* data to debug buffer 384// ddb - global buffer to keep data 385// src - source data 386// numberOfElements 387// localID - WI ID , when 0xffffffff all WI copy data 388// returns 0 
when data was copied, else -1 389int AddGlobalToDebugBufferParallel( __global DebugDataBuffer* ddb, __global uint* pSrc, uint numberOfElements, uint localID ) 390{ 391 if( ddb->m_flags == 0 ) 392 { 393 //All work items in local group copies data 394 if( localID == 0xffffffff ) 395 { 396 //Check if there is enough place for new data in ddb for every workitem in local group 397 if( ( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) * get_local_size(0) ) >= numberOfElements ) 398 { 399 uint startIndex = atomic_add( &ddb->m_offset, numberOfElements ); 400 uint i; 401 int srcPos; 402 for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ ) 403 { 404 ddb->m_data[ i ] = pSrc[ srcPos ]; 405 } 406 return 0; 407 } 408 } 409 else 410 { 411 //Check if there is enough place for new data in ddb for one workitem in local group 412 if( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) >= numberOfElements ) 413 { 414 if( get_local_id(0) == localID ) 415 { 416 uint startIndex = atomic_add( &ddb->m_offset, numberOfElements ); 417 uint i; 418 int srcPos; 419 for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ ) 420 { 421 ddb->m_data[ i ] = pSrc[ srcPos ]; 422 } 423 return 0; 424 } 425 } 426 } 427 } 428 return -1; 429} 430 431 432 433//Adds private uint data to debug buffer 434// ddb - global buffer to keep data 435// src - source data 436// dataType - enum defining added data type 437// returns 0 when data was copied, else -1 438int AddToDebugBuffer( __global DebugDataBuffer* ddb, __private ulong src, uint dataType, uint localID ) 439{ 440 if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO ) 441 { 442 //All work items in local group copies data 443 if( localID == 0xffffffff ) 444 { 445 //Check if there is enough place for new data in ddb 446 if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= 4 * get_local_size( 0 ) ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) ) 447 { 448 //Check flags 449 if( ddb->m_flags == 0 || 
ddb->m_flags == DDB_HAS_DATA_INFO ) 450 { 451 uint startIndex = atomic_add( &ddb->m_offset, 1 ); 452 uint dataIndex = atomic_sub( &ddb->m_dataInfoTop, ddb->m_dataInfoSize ); 453 uint stackTop = atomic_add( &ddb->m_stackTop, 8 ); 454 455 __global uchar* pCharDebugQueue = ( __global uchar * )ddb; 456 __global uint* dest = ( __global uint* )&pCharDebugQueue[ stackTop ]; 457 __global DebugDataInfo* debugInfo; 458 459 dest[ 0 ] = ( uint )src & 0xffffffff; 460 dest[ 1 ] = ( src & 0xffffffff00000000 ) >> 32; 461 debugInfo = ( __global DebugDataInfo* )( &pCharDebugQueue[ dataIndex ] ); 462 debugInfo->m_dataType = ( DebugDataTypes )dataType; 463 debugInfo->m_dataSize = 8; 464 ddb->m_flags |= DDB_HAS_DATA_INFO; 465 return 0; 466 } 467 } 468 } 469 else 470 { 471 //Check if there is enough place for new data in ddb 472 if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= 8 ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) ) 473 { 474 if( get_local_id( 0 ) == localID ) 475 { 476 uint startIndex = atomic_add( &ddb->m_offset, 1 ); 477 uint dataIndex = atomic_sub( &ddb->m_dataInfoTop, ddb->m_dataInfoSize ); 478 uint stackTop = atomic_add( &ddb->m_stackTop, 8 ); 479 480 __global uchar* pCharDebugQueue = ( __global uchar * )ddb; 481 __global uint* dest = ( __global uint* )&pCharDebugQueue[ stackTop ]; 482 __global DebugDataInfo* debugInfo; 483 484 dest[ 0 ] = ( uint )src & 0xffffffff; 485 dest[ 1 ] = ( src & 0xffffffff00000000 ) >> 32; 486 debugInfo = ( __global DebugDataInfo* )( &pCharDebugQueue[ dataIndex ] ); 487 debugInfo->m_dataType = ( DebugDataTypes )dataType; 488 debugInfo->m_dataSize = 8; 489 ddb->m_flags |= DDB_HAS_DATA_INFO; 490 return 0; 491 } 492 493 } 494 } 495 } 496 return -1; 497} 498 499//Adds data to debug buffer 500// ddb - global buffer to keep data 501// src - source 502// bytes - number of bytes from src to put into ddb 503// dataType - enum defining added data type 504// returns 0 when data was copied, else -1 505int AddGlobalToDebugBuffer( __global DebugDataBuffer* 
ddb, __global uchar* src, uint bytes, uint dataType ) 506{ 507 if( get_local_id( 0 ) ) 508 { 509 //Check if there is enough place for new data in ddb 510 if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= bytes ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) ) 511 { 512 //Check flags 513 if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO ) 514 { 515 __global uchar* pCharDebugQueue = ( __global uchar * )ddb; 516 __global DebugDataInfo* debugInfo; 517 IGILLOCAL_MEMCPY_GTOG( ( &pCharDebugQueue[ ddb->m_stackTop ] ), ( src ), ( int )bytes ); 518 debugInfo = ( __global DebugDataInfo* )( &pCharDebugQueue[ ddb->m_dataInfoTop ] ); 519 debugInfo->m_dataType = ( DebugDataTypes )dataType; 520 debugInfo->m_dataSize = bytes; 521 522 ddb->m_dataInfoTop = ddb->m_dataInfoTop - ddb->m_dataInfoSize; 523 ddb->m_stackTop = ddb->m_stackTop + bytes; 524 ddb->m_offset = ddb->m_offset + ( bytes / 4 ); 525 ddb->m_flags |= DDB_HAS_DATA_INFO; 526 return 0; 527 } 528 } 529 } 530 return -1; 531} 532 533int AddGlobalToDebugBufferAllIds( __global DebugDataBuffer* ddb, __global uchar* src, uint bytes, uint dataType, uint localID ) 534{ 535 if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO ) 536 { 537 //Check if there is enough place for new data in ddb 538 if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= bytes ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) ) 539 { 540 //Check flags 541 if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO ) 542 { 543 __global uchar* pCharDebugQueue = ( __global uchar * )ddb; 544 __global DebugDataInfo* debugInfo; 545 IGILLOCAL_MEMCPY_GTOG( ( &pCharDebugQueue[ ddb->m_stackTop ] ), ( src ), ( int )bytes ); 546 debugInfo = ( __global DebugDataInfo* )( &pCharDebugQueue[ ddb->m_dataInfoTop ] ); 547 debugInfo->m_dataType = ( DebugDataTypes )dataType; 548 debugInfo->m_dataSize = bytes; 549 550 ddb->m_dataInfoTop = ddb->m_dataInfoTop - ddb->m_dataInfoSize; 551 ddb->m_stackTop = ddb->m_stackTop + bytes; 552 ddb->m_offset = ddb->m_offset + ( bytes / 4 ); 
553 ddb->m_flags |= DDB_HAS_DATA_INFO; 554 return 0; 555 } 556 } 557 } 558 return -1; 559} 560 561#endif 562 563#define MAX_SLB_OFFSET ( SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE * SECOND_LEVEL_BUFFER_NUMBER_OF_ENQUEUES ) 564 565#ifndef SCHEDULER_EMULATION 566#include "scheduler_builtin_kernel.inl" 567#endif 568 569//SOME COMMON CODE FUNCTIONS 570//COMMON CODE STARTS HERE 571//Not thread safe - make sure it's called in thread safe fashion. 572 573void patchIDData( __global char* dsh, 574 uint blockId, 575 uint numberOfHwThreads, 576 uint slmSize ) 577{ 578 __global char* DSHIData = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( blockId + 1 ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) ); 579 __global uint* DSHIntData = ( __global uint* )( DSHIData ); 580 //Barrier enable is pre-patched on the host. 581 patchDword( ( &DSHIntData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 0, INTERFACE_DESCRIPTOR_HWTHREADS_UPPER_BIT, numberOfHwThreads ); 582 583 //Patch SLM. 584 uint SLMPatchValue = GetPatchValueForSLMSize( slmSize ); 585 patchDword( ( &DSHIntData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 16, 20, SLMPatchValue ); 586} 587/* 588this is how it works : 589When constructing primary batch, first IDT table is also constructed, for all blocks, it is constructed as follows: 590[0] - parent id 591[1 .. x ] block id 592[last aligned ] scheduler 593 594now when we enter SLB, we forgot about first IDT, and we point all interface descriptor loads to point at scheduler which was last in the first IDT, to be first in the new IDT. 595 596This way we can copy Interface Descriptors for blocks from the first IDT and assign Interface Descriptors dynamically in scheduler. 
597*/ 598 599void CopyAndPatchIDData( __global char* dsh, 600 uint blockId, 601 uint numberOfHwThreads, 602 uint slmSize, 603 uint interfaceDescriptorOffset, 604 uint blockStartId ) 605{ 606 __global char* DSHIData = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( blockId + blockStartId ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) ); 607 __global uint* DSHIntData = ( __global uint* )( DSHIData ); 608 609 //Copy to ID InterfaceDescriptorOffset 610 __global char* DSHDestIData = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( IDT_BREAKDOWN + interfaceDescriptorOffset ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) ); 611 __global uint* DSHDestIntData = ( __global uint* )( DSHDestIData ); 612 __global uint* DSHDestIntStartData = DSHDestIntData; 613 614 for( int i = 0; i < ( SIZEOF_INTERFACE_DESCRIPTOR_DATA / 4 ); i++ ) 615 { 616 DSHDestIntData[ i ] = DSHIntData[ i ]; 617 } 618 619 //Barrier enable is pre-patched on the host. 620 patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 0, INTERFACE_DESCRIPTOR_HWTHREADS_UPPER_BIT, numberOfHwThreads ); 621 622 //Patch SLM. 
623 uint SLMPatchValue = GetPatchValueForSLMSize( slmSize ); 624 patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 16, 20, SLMPatchValue ); 625} 626 627void CopyAndPatchIDData20( __global char* dsh, 628 uint blockId, 629 uint numberOfHwThreads, 630 uint slmSize, 631 uint interfaceDescriptorOffset, 632 uint blockStartId, 633 uint bToffset, 634 uint dshOffset, 635 uint numOfSamplers 636#ifdef ENABLE_DEBUG_BUFFER 637 , __global DebugDataBuffer* DebugQueue 638#endif 639 ) 640{ 641 EMULATION_ENTER_FUNCTION( ); 642 643 __global char* DSHIData = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( blockId + blockStartId ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) ); 644 __global uint* DSHIntData = ( __global uint* )( DSHIData ); 645 646 //Copy to ID InterfaceDescriptorOffset 647 __global char* DSHDestIData = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( IDT_BREAKDOWN + interfaceDescriptorOffset ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) ); 648 __global uint* DSHDestIntData = ( __global uint* )( DSHDestIData ); 649 __global uint* DSHDestIntStartData = DSHDestIntData; 650 651 for( int i = 0; i < ( SIZEOF_INTERFACE_DESCRIPTOR_DATA / 4 ); i++ ) 652 { 653 DSHDestIntData[ i ] = DSHIntData[ i ]; 654 } 655 656 //Barrier enable is pre-patched on the host. 657 patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 0, INTERFACE_DESCRIPTOR_HWTHREADS_UPPER_BIT, numberOfHwThreads ); 658 659 //Patch BT offset 660 patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_BINDING_TABLE_POINTER_DWORD ] ), 5, 15, ( bToffset >> 5 ) ); 661 662 //Patch SLM. 
663 uint PatchValue = GetPatchValueForSLMSize( slmSize ); 664 patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 16, 20, PatchValue ); 665 666 PatchValue = ( DSHDestIntStartData[ INTERFACE_DESCRIPTOR_SAMPLER_STATE_TABLE_DWORD ] & 0xffffffe0 ) + ( dshOffset ); 667 patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_SAMPLER_STATE_TABLE_DWORD ] ), 5, 31, ( ( PatchValue ) >> 5 ) ); 668 669 //Samplers in multiple of 4 670 numOfSamplers = ( numOfSamplers + 3 ) / 4; 671 patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_SAMPLER_STATE_TABLE_DWORD ] ), 2, 4, numOfSamplers ); 672} 673 674 675void patchGpGpuWalker( 676 uint secondLevelBatchOffset, 677 __global uint* secondaryBatchBuffer, 678 uint interfaceDescriptorOffset, 679 uint simdSize, 680 uint totalLocalWorkSize, 681 uint3 dimSize, 682 uint3 startPoint, 683 uint numberOfHwThreadsPerWg, 684 uint indirectPayloadSize, 685 uint ioHoffset ) 686{ 687 EMULATION_ENTER_FUNCTION( ); 688 689 //SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords 690 uint CmdPacketStart = secondLevelBatchOffset / DWORD_SIZE_IN_BYTES; 691 //INTERFACE_DESCRIPTOR for GPGPU_WALKER 692 //INTERFACE DESCRIPTOR is one plus the block id 693 uint PatchOffset = CmdPacketStart + GPGPU_WALKER_INTERFACE_DESCRIPTOR_ID_OFFSET; 694 //Patch id data 695 patchDword( &( secondaryBatchBuffer[ PatchOffset ] ), 696 0, 5, ( interfaceDescriptorOffset ) ); 697 PatchOffset = CmdPacketStart + GPGPU_WALKER_THREAD_WIDTH_DWORD; 698 //THREAD_WIDTH for GPGPU_WALKER 699 patchDword( &( secondaryBatchBuffer[ PatchOffset ] ), 700 0, 5, ( numberOfHwThreadsPerWg - 1 ) ); 701 702 PatchOffset = CmdPacketStart + GPGPU_WALKER_SIMDSIZE_DWORD; 703 704 //SIMD SIZE for GPGPU_WALKER 705 //Double Check the bits for SIMDSize 706 if( simdSize == 8 ) 707 { 708 patchDword( &( secondaryBatchBuffer[ PatchOffset ] ), 709 30, 31, SIMD8 ); 710 } 711 else if ( simdSize == 16 ) 712 { 713 patchDword( &( secondaryBatchBuffer[ PatchOffset 
] ), 714 30, 31, SIMD16 ); 715 } 716 else 717 { 718 patchDword( &( secondaryBatchBuffer[ PatchOffset ] ), 719 30, 31, SIMD32 ); 720 } 721 722 //XDIM for GPGPU_WALKER 723 secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_XDIM_DWORD ] = dimSize.x; 724 secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_GROUP_ID_START_X ] = startPoint.x; 725 //YDIM 726 secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_YDIM_DWORD ] = dimSize.y; 727 secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_GROUP_ID_START_Y ] = startPoint.y; 728 //ZDIM for GPGPU_WALKER 729 secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_ZDIM_DWORD ] = dimSize.z; 730 secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_GROUP_ID_START_Z ] = startPoint.z; 731 732 //XMASK for GPGPU_WALKER 733 uint mask = ( 1 << ( totalLocalWorkSize % simdSize ) ) - 1; 734 if( mask == 0 ) 735 mask = ~0; 736 737 secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_XMASK_DWORD ] = mask; 738 739 //YMASK for GPGPU_WALKER 740 uint YMask = ~0; 741 742 secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_YMASK_DWORD ] = YMask; 743 744 745 patchDword( &( secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_INDIRECT_DATA_LENGTH_OFFSET ] ), 746 0, 16, indirectPayloadSize ); 747 748 patchDword( &( secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_INDIRECT_START_ADDRESS_OFFSET ] ), 749 0, 31, ioHoffset ); 750} 751 752int PatchMediaStateFlush( 753 uint secondLevelBatchOffset, 754 __global uint* secondaryBatchBuffer, 755 uint interfaceDescriptorOffset, 756 uint msfNumber ) 757{ 758 //SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords 759 uint CmdPacketStart = secondLevelBatchOffset / DWORD_SIZE_IN_BYTES; 760 uint MsfOffset; 761 762 if( msfNumber == SCHEDULER_MSF_INITIAL ) 763 { 764 MsfOffset = MEDIA_STATE_FLUSH_INITIAL_INTERFACE_DESCRIPTOR_OFFSET; 765 } 766 else if ( msfNumber == SCHEDULER_MSF_SECOND ) 767 { 768 MsfOffset = MEDIA_STATE_FLUSH_INTERFACE_DESCRIPTOR_OFFSET; 769 } 770 else 771 { 772 return -1; 773 
} 774 patchDword( &( secondaryBatchBuffer[ CmdPacketStart + MsfOffset ] ), 0, 5, interfaceDescriptorOffset ); 775 776 return 0; 777} 778 779#if defined WA_LRI_COMMANDS_EXIST 780void PatchMiLoadRegisterImm( 781 uint secondLevelBatchOffset, 782 __global uint* secondaryBatchBuffer, 783 uint enqueueOffset, 784 uint registerAddress, 785 uint value ) 786{ 787 //SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords 788 uint CmdPacketStart = secondLevelBatchOffset / DWORD_SIZE_IN_BYTES; 789 790 secondaryBatchBuffer[ CmdPacketStart + enqueueOffset ] = OCLRT_LOAD_REGISTER_IMM_CMD; 791 patchDword( &( secondaryBatchBuffer[ CmdPacketStart + enqueueOffset + IMM_LOAD_REGISTER_ADDRESS_DWORD_OFFSET ] ), 2, 22, registerAddress >> 2 ); 792 secondaryBatchBuffer[ CmdPacketStart + enqueueOffset + IMM_LOAD_REGISTER_VALUE_DWORD_OFFSET ] = value; 793} 794 795void AddMiLoadRegisterImm( 796 __global uint* secondaryBatchBuffer, 797 __private uint* dwordOffset, 798 uint value ) 799{ 800 secondaryBatchBuffer[ *dwordOffset ] = OCLRT_LOAD_REGISTER_IMM_CMD; 801 ( *dwordOffset )++; 802 secondaryBatchBuffer[ *dwordOffset ] = 0; 803 patchDword( &( secondaryBatchBuffer[ *dwordOffset ] ), 2, 22, CTXT_PREMP_DBG_ADDRESS_VALUE >> 2 ); 804 ( *dwordOffset )++; 805 secondaryBatchBuffer[ *dwordOffset ] = value; //CTXT_PREMP_ON_MI_ARB_CHECK_ONLY or CTXT_PREMP_DEFAULT_VALUE 806 ( *dwordOffset )++; 807} 808 809void SetDisablePreemptionRegister( 810 uint secondLevelBatchOffset, 811 __global uint* secondaryBatchBuffer ) 812{ 813 PatchMiLoadRegisterImm( secondLevelBatchOffset, 814 secondaryBatchBuffer, 815 IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET, 816 CTXT_PREMP_DBG_ADDRESS_VALUE, 817 CTXT_PREMP_ON_MI_ARB_CHECK_ONLY ); 818} 819 820void SetEnablePreemptionRegister( 821 uint secondLevelBatchOffset, 822 __global uint* secondaryBatchBuffer ) 823{ 824 PatchMiLoadRegisterImm( secondLevelBatchOffset, 825 secondaryBatchBuffer, 826 IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET, 827 
CTXT_PREMP_DBG_ADDRESS_VALUE, 828 CTXT_PREMP_DEFAULT_VALUE ); 829} 830 831void NoopPreemptionCommand( 832 uint secondLevelBatchOffset, 833 uint cmdOffset, 834 __global uint* secondaryBatchBuffer ) 835{ 836 uint CmdPacketStart = cmdOffset + secondLevelBatchOffset / DWORD_SIZE_IN_BYTES; 837 for( int i = 0; i < OCLRT_IMM_LOAD_REGISTER_CMD_DEVICE_CMD_DWORD_OFFSET; i++ ) 838 { 839 secondaryBatchBuffer[ CmdPacketStart + i ] = 0; 840 } 841} 842#endif //WA_LRI_COMMANDS_EXIST 843 844 845//PQueue is needed for SLBOffset 846void AddCmdsInSLBforScheduler20Parallel( uint slbOffset, 847 __global IGIL_CommandQueue* pQueue, 848 __global uint * secondaryBatchBuffer, 849 __global char * dsh ) 850{ 851 EMULATION_ENTER_FUNCTION( ); 852#ifdef SCHEDULER_EMULATION 853 uint3 StartPoint = { 0, 0, 0 }; 854 uint3 DimSize = { get_num_groups( 0 ), 1, 1 }; 855#else 856 uint3 StartPoint = ( uint3 )( 0 ); 857 uint3 DimSize = ( uint3 )( get_num_groups( 0 ), 1, 1 ); 858#endif 859 patchGpGpuWalker( slbOffset, 860 secondaryBatchBuffer, 861 0, 862 PARALLEL_SCHEDULER_COMPILATION_SIZE_20, 863 get_local_size(0), 864 DimSize, 865 StartPoint, 866 PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20, 867 SIZEOF_3GRFS * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 + pQueue->m_controls.m_SchedulerConstantBufferSize, 868 pQueue->m_controls.m_SchedulerDSHOffset ); 869 870 PatchMediaStateFlush( slbOffset, secondaryBatchBuffer, 0, SCHEDULER_MSF_INITIAL ); 871 PatchMediaStateFlush( slbOffset, secondaryBatchBuffer, 0, SCHEDULER_MSF_SECOND ); 872 873//When commands exists and scheduler does not require preemption off, noop the commands space 874#if defined WA_LRI_COMMANDS_EXIST 875#if defined WA_SCHEDULER_PREEMPTION 876 if( pQueue->m_controls.m_EventTimestampAddress == 0u ) 877 { 878 SetEnablePreemptionRegister( slbOffset, secondaryBatchBuffer ); 879 SetDisablePreemptionRegister( slbOffset, secondaryBatchBuffer ); 880 } 881 else 882 { 883 NoopPreemptionCommand( slbOffset, IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET, 
secondaryBatchBuffer ); 884 NoopPreemptionCommand( slbOffset, IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET, secondaryBatchBuffer ); 885 } 886#else 887 //This is case, where LRI preemption is not required around scheduler WALKERs, but space for LRI commands exists, make sure they are nooped then 888 NoopPreemptionCommand( SLBOffset, IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET, secondaryBatchBuffer ); 889 NoopPreemptionCommand( SLBOffset, IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET, secondaryBatchBuffer ); 890#endif //WA_SCHEDULER_PREEMPTION 891#endif //WA_LRI_COMMANDS_EXIST 892} 893 894int generateLocalIDSParallel20( 895 __global char* dsh, 896 uint3 localSize, 897 uint hwThreads, 898 uint simdSize ) 899{ 900 uint it, currX, currY, currZ, FlattendID; 901 902 uint Max = 1; 903 if( simdSize == 32 ) 904 { 905 Max = 2; 906 } 907 908 //Update full GRFs, each WI generate ID for one work item in x,y and z 909 //in case we generate SIMD8 payload using 16 wi , idle half of them 910 if( get_local_id( 0 ) < simdSize ) 911 { 912 for( it = 0; it < hwThreads; it++ ) 913 { 914 for( uint multip = 0; multip < Max; multip++ ) 915 { 916 //We are in simd 8, each wi process generation for 1 wi 917 FlattendID = get_local_id( 0 ) + it * simdSize + 16 * ( multip ); 918 919 currX = FlattendID % localSize.x; 920 currY = ( FlattendID / localSize.x ) % localSize.y; 921 currZ = ( FlattendID / ( localSize.x * localSize.y ) );//not needed % localSize.z; 922 923 *( __global ushort * )( dsh + get_local_id( 0 ) * 2 + it * GRF_SIZE * 3 * Max + multip * GRF_SIZE ) = ( ushort )currX; 924 *( __global ushort * )( dsh + get_local_id( 0 ) * 2 + it * GRF_SIZE * 3 * Max + GRF_SIZE * Max + multip * GRF_SIZE ) = ( ushort )currY; 925 *( __global ushort * )( dsh + get_local_id( 0 ) * 2 + it * GRF_SIZE * 3 * Max + GRF_SIZE * Max + GRF_SIZE * Max + multip * GRF_SIZE ) = ( ushort )currZ; 926 } 927 } 928 } 929 return 0; 930} 931 932//Function generate local ids. 
933//SIMD16 version 934int generateLocalIDSsimd16( 935 __global char* dsh, 936 uint3 localSize, 937 uint hwThreads) 938{ 939 typedef union 940 { 941 ushort16 vectors; 942 ushort varray[ 16 ]; 943 }vectorUnion; 944 945 __private vectorUnion LidX; 946 __private vectorUnion LidY; 947 __private vectorUnion LidZ; 948 949 __private ushort currX = 0; 950 __private ushort currY = 0; 951 __private ushort currZ = 0; 952 953 //Assuming full load of hw thread , remainder done separately 954 for(uint it = 0; it < hwThreads; it++ ) 955 { 956 //This will be unrolled by compiler 957 for(uint x = 0; x < 16; x++ ) 958 { 959 LidX.varray[ x ] = currX++; 960 LidY.varray[ x ] = currY; 961 LidZ.varray[ x ] = currZ; 962 963 if( currX == localSize.x ) 964 { 965 currX = 0; 966 currY++; 967 } 968 969 if( currY == localSize.y ) 970 { 971 currY = 0; 972 currZ++; 973 } 974 } 975 976 *( __global ushort16 * )( dsh + it * GRF_SIZE * 3 ) = LidX.vectors; 977 *( __global ushort16 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE ) = LidY.vectors; 978 *( __global ushort16 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE + GRF_SIZE ) = LidZ.vectors; 979 980 } 981 982 return 0; 983} 984 985//Function generate local ids. 
986//SIMD8 version 987int generateLocalIDSsimd8( 988 __global char* dsh, 989 uint3 localSize, 990 uint hwThreads) 991{ 992 typedef union 993 { 994 ushort8 vectors; 995 ushort varray[ 8 ]; 996 }vectorUnion; 997 998 __private vectorUnion LidX; 999 __private vectorUnion LidY; 1000 __private vectorUnion LidZ; 1001 1002 __private ushort currX = 0; 1003 __private ushort currY = 0; 1004 __private ushort currZ = 0; 1005 1006 //Assuming full load of hw thread , remainder done separately 1007 for(uint it = 0; it < hwThreads; it++ ) 1008 { 1009 //This will be unrolled by compiler 1010 for(uint x = 0; x < 8; x++ ) 1011 { 1012 LidX.varray[ x ] = currX++; 1013 LidY.varray[ x ] = currY; 1014 LidZ.varray[ x ] = currZ; 1015 1016 if( currX == localSize.x ) 1017 { 1018 currX = 0; 1019 currY++; 1020 } 1021 1022 if( currY == localSize.y ) 1023 { 1024 currY = 0; 1025 currZ++; 1026 } 1027 } 1028 1029 *( __global ushort8 * )( dsh + it * GRF_SIZE * 3 ) = LidX.vectors; 1030 *( __global ushort8 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE ) = LidY.vectors; 1031 *( __global ushort8 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE + GRF_SIZE ) = LidZ.vectors; 1032 } 1033 1034 return 0; 1035} 1036 1037//Function patches a curbe parametr , this version of function supports only these curbe tokens that may appear only once 1038int PatchDSH1Token( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh, 1039 uint value ) 1040{ 1041 EMULATION_ENTER_FUNCTION( ); 1042 1043 uint PatchOffset; 1044#if SCHEDULER_DEBUG_MODE 1045 //If we are here it means that mask is ok and there are at least 3 curbe tokens that needs to be patched, do it right away 1046 if( pKernelCurbeParams[ CurrentIndex ].m_parameterType != TokenType ) 1047 { 1048 return -1; 1049 } 1050#endif 1051 PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset; 1052 *( __global uint * )( &pDsh[ PatchOffset ] ) = value; 1053 1054 currentIndex++; 1055 return currentIndex; 1056} 1057 1058int 
PatchLocalMemEntities( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
                       __global IGIL_CommandHeader* pCommand )
{
    uint PatchOffset;
#if SCHEDULER_DEBUG_MODE
    //If we are here it means that mask is ok and there are at least 3 curbe tokens that needs to be patched, do it right away
    //Fixed: original used CurrentIndex / TokenType, which do not exist (parameters are lower-case),
    //so SCHEDULER_DEBUG_MODE builds failed to compile.
    if( pKernelCurbeParams[ currentIndex ].m_parameterType != tokenType )
    {
        return -1;
    }
#endif
    //First patch is with 0
    PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;

    //SUM_OF_LOCAL_MEMORY_KERNEL_ARGS can be a 4 or 8 byte patch
    if( pKernelCurbeParams[currentIndex].m_parameterSize == sizeof( ulong ) )
    {
        *( __global ulong * )( &pDsh[PatchOffset] ) = 0;
    }
    else
    {
        *( __global uint * )( &pDsh[ PatchOffset ] ) = 0;
    }


    currentIndex++;
    uint Alignement;
    uint iter = 0;
    uint CurrentSum = 0;
    uint CurrentValue;
    //For each global captured there will be uint with index and ulong with address.
    uint GlobalPointersSize = ( pCommand->m_numGlobalCapturedBuffer * ( sizeof( ulong ) + sizeof( uint ) ) ) / sizeof( uint );

    //Local-memory argument sizes follow the dependencies, scalar args and global-pointer
    //section in the command header's payload (layout per IGIL_CommandHeader).
    __global uint* pLocalMemSizes = &pCommand->m_data[ pCommand->m_numDependencies + pCommand->m_numScalarArguments + GlobalPointersSize ];
    //Check if there is second surface
    while( pKernelCurbeParams[ currentIndex ].m_parameterType == tokenType )
    {
        PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;

        //Value needs to be aligned to the value stored in sourceoffset
        Alignement = OCLRT_MAX( DWORD_SIZE_IN_BYTES, pKernelCurbeParams[ currentIndex ].m_sourceOffset );

        CurrentValue = pLocalMemSizes[ iter ];
        CurrentValue = OCLRT_ALIGN( CurrentValue, Alignement );
        CurrentSum += CurrentValue;

        //SUM_OF_LOCAL_MEMORY_KERNEL_ARGS can be a 4 or 8 byte patch
        if( pKernelCurbeParams[currentIndex].m_parameterSize == sizeof( ulong ) )
        {
            *( __global ulong * )( &pDsh[PatchOffset] ) = ( ulong )CurrentSum;
        }
        else
        {
            *( __global uint * )( &pDsh[ PatchOffset ] ) = CurrentSum;
        }

        currentIndex++;
        iter++;
    }
    return currentIndex;
}

//Function patches a curbe parametr , this version of function supports only these curbe tokens that may appear only once
//Parallel variant: only the work item with local id == PARALLEL_SCHEDULER_COMPILATION_SIZE_20
//performs the store; every work item advances the index identically.
int PatchDSH1TokenParallel20( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
                              uint value )
{
    EMULATION_ENTER_FUNCTION( );

    uint PatchOffset;
    if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 )
    {
        PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
        *( __global uint * ) ( &pDsh[ PatchOffset ] ) = value;
    }
    currentIndex++;
    return currentIndex;
}

//Function patches a curbe parametr, this version of function works on 3d curbe tokens
//It assumes that at least 3 tokens exists, then checks if 3 additional patches are
needed 1138int PatchDSH6TokensParallel20( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh, 1139 uint value1, uint value2, uint value3 ) 1140{ 1141 EMULATION_ENTER_FUNCTION( ); 1142 1143 uint PatchOffset, SourceOffset; 1144 uint WorkingOffset; 1145 uint ShiftSize; 1146 1147 //Check if we patch 3 or 6 curbe tokens 1148 if( pKernelCurbeParams[ currentIndex + 3 ].m_parameterType == tokenType ) 1149 { 1150 ShiftSize = 6; 1151 } 1152 else 1153 { 1154 ShiftSize = 3; 1155 } 1156 1157 if( get_local_id( 0 ) < PARALLEL_SCHEDULER_COMPILATION_SIZE_20 + ShiftSize ) 1158 { 1159 WorkingOffset = currentIndex + get_local_id( 0 ) - PARALLEL_SCHEDULER_COMPILATION_SIZE_20; 1160 PatchOffset = pKernelCurbeParams[ WorkingOffset ].m_patchOffset; 1161 SourceOffset = pKernelCurbeParams[ WorkingOffset ].m_sourceOffset; 1162 1163 if( SourceOffset == 0 ) 1164 { 1165 *( __global uint * )( &pDsh[ PatchOffset ] ) = value1; 1166 } 1167 else if( SourceOffset == 4 ) 1168 { 1169 *( __global uint * )( &pDsh[ PatchOffset ] ) = value2; 1170 } 1171 else if( SourceOffset == 8 ) 1172 { 1173 *( __global uint * )( &pDsh[ PatchOffset ] ) = value3; 1174 } 1175 } 1176 1177 currentIndex += ShiftSize; 1178 return currentIndex; 1179} 1180 1181int PatchLocalWorkSizes( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh, 1182 uint enqLocalX, uint enqLocalY, uint enqLocalZ, uint cutLocalX, uint cutLocalY, uint cutLocalZ ) 1183{ 1184 EMULATION_ENTER_FUNCTION( ); 1185 1186 uint PatchOffset, SourceOffset; 1187 1188 //Tokens are sorted by m_sourceOffset, it means that first 3 keys are always used to compute global_id and are always present 1189 for( uint it = 0; it < 3; it++ ) 1190 { 1191 PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset; 1192 SourceOffset = pKernelCurbeParams[ currentIndex ].m_sourceOffset; 1193 1194 if( SourceOffset == 0 ) 1195 { 1196 *( __global uint * )( &pDsh[ PatchOffset ] ) 
= enqLocalX; 1197 } 1198 else if( SourceOffset == 4 ) 1199 { 1200 *( __global uint * )( &pDsh[ PatchOffset ] ) = enqLocalY; 1201 } 1202 else if( SourceOffset == 8 ) 1203 { 1204 *( __global uint * )( &pDsh[ PatchOffset ] ) = enqLocalZ; 1205 } 1206 currentIndex++; 1207 } 1208 //If there are 3 more tokens, it means that get_local_size is used within a kernel, to deal with it patch with the second set of variables 1209 if( pKernelCurbeParams[ currentIndex ].m_parameterType == tokenType ) 1210 { 1211 for( uint it = 0; it < 3; it++ ) 1212 { 1213 PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset; 1214 SourceOffset = pKernelCurbeParams[ currentIndex ].m_sourceOffset; 1215 1216 if( SourceOffset == 0 ) 1217 { 1218 *( __global uint * )( &pDsh[ PatchOffset ] ) = cutLocalX; 1219 } 1220 else if( SourceOffset == 4 ) 1221 { 1222 *( __global uint * )( &pDsh[ PatchOffset ] ) = cutLocalY; 1223 } 1224 else if( SourceOffset == 8 ) 1225 { 1226 *( __global uint * )( &pDsh[ PatchOffset ] ) = cutLocalZ; 1227 } 1228 currentIndex++; 1229 } 1230 } 1231 return currentIndex; 1232} 1233 1234//Function patches a curbe parametr, this version of function works on 3d curbe tokens 1235//It assumes that at least 3 tokens exists, then checks if 3 additional patches are needed 1236int PatchLocalWorkSizesParallel( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh, 1237 uint enqLocalX, uint enqLocalY, uint enqLocalZ, uint cutLocalX, uint cutLocalY, uint cutLocalZ ) 1238{ 1239 EMULATION_ENTER_FUNCTION( ); 1240 1241 uint ShiftSize; 1242 1243 //Check if we patch 3 or 6 curbe tokens 1244 if( pKernelCurbeParams[ currentIndex + 3 ].m_parameterType == tokenType ) 1245 { 1246 ShiftSize = 6; 1247 } 1248 else 1249 { 1250 ShiftSize = 3; 1251 } 1252 1253 //Use single threaded version 1254 if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 ) 1255 { 1256 PatchLocalWorkSizes( currentIndex, SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE, 
pKernelCurbeParams, pDsh, enqLocalX, enqLocalY, enqLocalZ, cutLocalX, cutLocalY, cutLocalZ ); 1257 } 1258 1259 currentIndex += ShiftSize; 1260 return currentIndex; 1261} 1262 1263//Function patches a curbe parametr, this version of function works on 3d curbe tokens 1264//It assumes that at least 3 tokens exists, then checks if 3 additional patches are needed 1265int PatchDSH6Tokens( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh, 1266 uint value1, uint value2, uint value3 ) 1267{ 1268 EMULATION_ENTER_FUNCTION( ); 1269 1270 uint PatchOffset, SourceOffset; 1271#if SCHEDULER_DEBUG_MODE 1272 //If we are here it means that mask is ok and there are at least 3 curbe tokens that needs to be patched, do it right away 1273 if( pKernelCurbeParams[ CurrentIndex ].m_parameterType != TokenType ) 1274 { 1275 return -1; 1276 } 1277#endif 1278 for( uint it = 0; it < 3; it++ ) 1279 { 1280 PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset; 1281 SourceOffset = pKernelCurbeParams[ currentIndex ].m_sourceOffset; 1282 1283 if( SourceOffset == 0 ) 1284 { 1285 *( __global uint * )( &pDsh[ PatchOffset ] ) = value1; 1286 } 1287 else if( SourceOffset == 4 ) 1288 { 1289 *( __global uint * )( &pDsh[ PatchOffset ] ) = value2; 1290 } 1291 else if( SourceOffset == 8 ) 1292 { 1293 *( __global uint * )( &pDsh[ PatchOffset ] ) = value3; 1294 } 1295 currentIndex++; 1296 } 1297 //Check if there are 3 more. 
    //Second triple present means the kernel also reads this token via a second curbe slot.
    if( pKernelCurbeParams[ currentIndex ].m_parameterType == tokenType )
    {
        for( uint it = 0; it < 3; it++ )
        {
            PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
            SourceOffset = pKernelCurbeParams[ currentIndex ].m_sourceOffset;

            if( SourceOffset == 0 )
            {
                *( __global uint * )( &pDsh[ PatchOffset ] ) = value1;
            }
            else if( SourceOffset == 4 )
            {
                *( __global uint * )( &pDsh[ PatchOffset ] ) = value2;
            }
            else if( SourceOffset == 8 )
            {
                *( __global uint * )( &pDsh[ PatchOffset ] ) = value3;
            }
            currentIndex++;
        }
    }

    return currentIndex;
}
//Common code

//Returns a byte pointer at 'offset' into the kernel reflection blob.
inline __global char* GetPtrToCurbeData( uint offset, __global IGIL_KernelDataHeader * pKernelReflection )
{
    __global char * pRawKernelReflection = ( __global char * )pKernelReflection;
    return ( pRawKernelReflection + offset );
}

//Same as GetPtrToCurbeData, non-inline variant; both simply offset into the reflection blob.
__global char* GetPtrToKernelReflectionOffset( uint offset, __global IGIL_KernelDataHeader * pKernelReflection )
{
    __global char * pRawKernelReflection = ( __global char * )pKernelReflection;
    return ( pRawKernelReflection + offset );
}

//Builds the walker table for one enqueue: walker 0 covers the evenly-divisible body of the
//NDRange; one extra walker is added per dimension (and per dimension combination) that has a
//remainder ("edge"), up to 8 walkers for a 3D range with remainders in x, y and z.
//edgeArray holds the remainder LWS per dimension (0 = divides evenly), globalDim the number
//of full workgroups per dimension. *pWalkerCount receives the number of walkers written.
//NOTE(review): the globalSizes parameter is not used in this function body.
void InitWalkerDataParallel( __local IGIL_WalkerEnumeration* pWalkerEnumData,
                             uint workDim,
                             uint* pWalkerCount,
                             uint3 edgeArray,
                             uint3 globalDim,
                             uint3 globalSizes,
                             uint3 localSizes )
{
    EMULATION_ENTER_FUNCTION( );

    pWalkerEnumData->TotalDimSize.x = globalDim.x;
    pWalkerEnumData->TotalDimSize.y = globalDim.y;
    pWalkerEnumData->TotalDimSize.z = globalDim.z;

    //Walker 0 : the full-size body of the range.
    pWalkerEnumData->WalkerArray[ 0 ].ActualLocalSize.x = localSizes.x;
    pWalkerEnumData->WalkerArray[ 0 ].WalkerStartPoint.x = 0;
    pWalkerEnumData->WalkerArray[ 0 ].WalkerDimSize.x = globalDim.x;

    pWalkerEnumData->WalkerArray[ 0 ].ActualLocalSize.y = localSizes.y;
    pWalkerEnumData->WalkerArray[ 0 ].WalkerStartPoint.y = 0;
    pWalkerEnumData->WalkerArray[ 0 ].WalkerDimSize.y = globalDim.y;

    pWalkerEnumData->WalkerArray[ 0 ].ActualLocalSize.z = localSizes.z;
    pWalkerEnumData->WalkerArray[ 0 ].WalkerStartPoint.z = 0;
    pWalkerEnumData->WalkerArray[ 0 ].WalkerDimSize.z = globalDim.z;

    uint WalkerCount = 1;

    //X edge walker.
    if( edgeArray.x != 0 )
    {
        pWalkerEnumData->TotalDimSize.x++;

        pWalkerEnumData->WalkerArray[ 1 ].ActualLocalSize.x = edgeArray.x;
        pWalkerEnumData->WalkerArray[ 1 ].WalkerStartPoint.x = globalDim.x;
        pWalkerEnumData->WalkerArray[ 1 ].WalkerDimSize.x = pWalkerEnumData->TotalDimSize.x;

        pWalkerEnumData->WalkerArray[ 1 ].ActualLocalSize.y = localSizes.y;
        pWalkerEnumData->WalkerArray[ 1 ].WalkerStartPoint.y = 0;
        pWalkerEnumData->WalkerArray[ 1 ].WalkerDimSize.y = globalDim.y;

        pWalkerEnumData->WalkerArray[ 1 ].ActualLocalSize.z = localSizes.z;
        pWalkerEnumData->WalkerArray[ 1 ].WalkerStartPoint.z = 0;
        pWalkerEnumData->WalkerArray[ 1 ].WalkerDimSize.z = globalDim.z;

        WalkerCount++;
    }

    if( workDim > 1 )
    {
        //Y edge walker.
        if( edgeArray.y != 0 )
        {
            pWalkerEnumData->TotalDimSize.y++;

            pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = localSizes.x;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = 0;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = globalDim.x;

            pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = edgeArray.y;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = globalDim.y;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = pWalkerEnumData->TotalDimSize.y;

            pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = localSizes.z;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = 0;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = globalDim.z;

            WalkerCount++;
        }

        //XY corner walker.
        if( ( edgeArray.x != 0 ) & ( edgeArray.y != 0 ) )
        {
            pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = edgeArray.x;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = globalDim.x;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = pWalkerEnumData->TotalDimSize.x;

            pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = edgeArray.y;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = globalDim.y;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = pWalkerEnumData->TotalDimSize.y;

            pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = localSizes.z;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = 0;
            pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = globalDim.z;

            WalkerCount++;
        }
        if( workDim > 2 )
        {
            //Z edge walker.
            if( edgeArray.z != 0 )
            {
                pWalkerEnumData->TotalDimSize.z++;

                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = localSizes.x;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = 0;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = globalDim.x;

                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = localSizes.y;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = 0;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = globalDim.y;

                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = edgeArray.z;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = globalDim.z;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = pWalkerEnumData->TotalDimSize.z;

                WalkerCount++;
            }
            //XZ corner walker.
            if( ( edgeArray.x != 0 ) & ( edgeArray.z != 0 ) )
            {
                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = edgeArray.x;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = globalDim.x;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = pWalkerEnumData->TotalDimSize.x;

                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = localSizes.y;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = 0;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = globalDim.y;

                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = edgeArray.z;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = globalDim.z;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = pWalkerEnumData->TotalDimSize.z;

                WalkerCount++;
            }

            //YZ corner walker.
            if( ( edgeArray.y != 0 ) & ( edgeArray.z != 0 ) )
            {
                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = localSizes.x;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = 0;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = globalDim.x;

                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = edgeArray.y;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = globalDim.y;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = pWalkerEnumData->TotalDimSize.y;

                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = edgeArray.z;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = globalDim.z;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = pWalkerEnumData->TotalDimSize.z;

                WalkerCount++;
            }
            //XYZ corner walker.
            if( ( edgeArray.x != 0 ) & ( edgeArray.y != 0 ) & ( edgeArray.z != 0 ) )
            {
                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = edgeArray.x;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = globalDim.x;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = pWalkerEnumData->TotalDimSize.x;

                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = edgeArray.y;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = globalDim.y;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = pWalkerEnumData->TotalDimSize.y;

                pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = edgeArray.z;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = globalDim.z;
                pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = pWalkerEnumData->TotalDimSize.z;

                WalkerCount++;
            }
        }
    }

    *pWalkerCount = WalkerCount;
}

//Compute number of Walkers needed for this command packet, this function assumes that command packet is initialized
//Each dimension with a remainder doubles the walker count (body/edge split per dimension).
inline int GetWalkerCount( __global IGIL_CommandHeader* pCommand )
{
    int WalkerCount = 1;
    for( uint dim = 0; ( dim < pCommand->m_range.m_dispatchDimensions ); dim++ )
    {
        if( ( pCommand->m_range.m_globalWorkSize[ dim ] % pCommand->m_range.m_localWorkSize[ dim ] ) != 0 )
        {
            WalkerCount *= 2;
        }
    }
    return WalkerCount;
}

//This function intializes command packet, checks for null case, sets proper LWS sizes and return WalkerCount needed for this packet
inline void InitializeCommandPacket( __global IGIL_CommandHeader* pCommand )
{
    EMULATION_ENTER_FUNCTION( );

    //Check for NULL case
    if( pCommand->m_range.m_localWorkSize[ 0 ] == 0 )
    {
        //If null case detected use 16 x 1 x 1 lws
        if( pCommand->m_range.m_globalWorkSize[ 0 ] >= 16 )
        {
            pCommand->m_range.m_localWorkSize[ 0 ] = 16;
        }
        else
        {
            pCommand->m_range.m_localWorkSize[ 0 ] = pCommand->m_range.m_globalWorkSize[ 0 ];
        }

        pCommand->m_range.m_localWorkSize[ 1 ] = 1;
        pCommand->m_range.m_localWorkSize[ 2 ] = 1;
    }

}

//Patches the address for pipe control
void PatchPipeControlProfilingAddres( __global uint* secondaryBatchBuffer, uint slBoffset, ulong address, uint pipeControlOffset )
{
    EMULATION_ENTER_FUNCTION( );

    //SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
    uint PostSyncDwordOffset = ( slBoffset / DWORD_SIZE_IN_BYTES ) + pipeControlOffset + PIPE_CONTROL_POST_SYNC_DWORD;
    uint DwordOffset = ( slBoffset / DWORD_SIZE_IN_BYTES ) + pipeControlOffset + PIPE_CONTROL_ADDRESS_FIELD_DWORD;
    //Patch P_C event timestamp address in SLB in 3rd and 4th dword
    secondaryBatchBuffer[ DwordOffset ] = 0;
    patchDword( &secondaryBatchBuffer[ DwordOffset ], PIPE_CONTROL_GRAPHICS_ADDRESS_START_BIT, PIPE_CONTROL_GRAPHICS_ADDRESS_END_BIT, ( uint )( address >> PIPE_CONTROL_GRAPHICS_ADDRESS_START_BIT ) );
    DwordOffset++;
    //High 32 bits of the 64-bit graphics address go into the following dword.
    secondaryBatchBuffer[ DwordOffset ] = 0;
    patchDword( &secondaryBatchBuffer[ DwordOffset ], PIPE_CONTROL_GRAPHICS_ADDRESS_HIGH_START_BIT, PIPE_CONTROL_GRAPHICS_ADDRESS_HIGH_END_BIT, ( address >> 32 ) );

    //Patch Timestamp bit
    patchDword( &secondaryBatchBuffer[ PostSyncDwordOffset ], PIPE_CONTROL_POST_SYNC_START_BIT, PIPE_CONTROL_POST_SYNC_END_BIT, PIPE_CONTROL_GENERATE_TIME_STAMP );
}

//Clears the post-sync operation field of the PIPE_CONTROL at pipeControlOffset so no
//timestamp/write is produced. slBoffset is in bytes.
void DisablePostSyncBitInPipeControl( __global uint* secondaryBatchBuffer, uint slBoffset, uint pipeControlOffset )
{
    //SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
    uint PostSyncDwordOffset = ( slBoffset / DWORD_SIZE_IN_BYTES ) + pipeControlOffset + PIPE_CONTROL_POST_SYNC_DWORD;
    //Patch P_C event timestamp address in SLB in 3rd and 4th dword
    patchDword( &secondaryBatchBuffer[ PostSyncDwordOffset ], PIPE_CONTROL_POST_SYNC_START_BIT, PIPE_CONTROL_POST_SYNC_END_BIT, PIPE_CONTROL_NO_POSTSYNC_OPERATION );
}


//Patches the per-enqueue DSH (curbe + local ids) for one walker of one block kernel.
//NOTE(review): 'intefaceDescriptorOffset' keeps its original (misspelled) name to preserve the interface.
int PatchDSH( __global IGIL_CommandQueue* pQueue,
              __global IGIL_KernelDataHeader * pKernelReflection,
              __global char* dsh,
              uint blockId,
              __global IGIL_CommandHeader* pCommandHeader,
              __global uint* secondaryBatchBuffer,
              uint dshOffset,
              uint intefaceDescriptorOffset,
              __local IGIL_WalkerEnumeration*
pWalkerMain, 1571 uint walkerPos ) 1572{ 1573 EMULATION_ENTER_FUNCTION( ); 1574 1575 __global IGIL_KernelAddressData* pKernelAddressData = IGIL_GetKernelAddressData( pKernelReflection, blockId ); 1576 __global IGIL_KernelData* pBlockData = IGIL_GetKernelData( pKernelReflection, blockId ); 1577 ulong PatchMask = pBlockData->m_PatchTokensMask; 1578 uint CurrentIndex = 0; 1579 1580 __global char* pDsh = ( __global char* )&dsh[ dshOffset ]; 1581 __global IGIL_KernelCurbeParams* pKernelCurbeParams = ( __global IGIL_KernelCurbeParams* )&pBlockData->m_data; 1582 1583 uint NumberOfDepencies = pCommandHeader->m_numDependencies; 1584 uint PatchOffset; 1585 __global char* pScalarData = ( __global char* )( &pCommandHeader->m_data[ NumberOfDepencies ] ); 1586 __global char *pDshOnKRS = GetPtrToKernelReflectionOffset( pKernelAddressData->m_SamplerHeapOffset, pKernelReflection ); 1587 1588 uint SizeOfScalarsFromCurbe = 0; 1589 uint CurbeSize; 1590 1591 uint TotalLocalSize; 1592 uint ThreadPayloadSize; 1593 uint NumberOfHWThreads; 1594 uint WorkDim; 1595 uint3 GlobalOffset; 1596 uint3 GlobalSizes; 1597 uint3 ActualLocalSize; 1598 1599 GlobalOffset.x = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 0 ]; 1600 GlobalOffset.y = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 1 ]; 1601 GlobalOffset.z = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 2 ]; 1602 1603 GlobalSizes.x = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 0 ]; 1604 GlobalSizes.y = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 1 ]; 1605 GlobalSizes.z = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 2 ]; 1606 1607 ActualLocalSize.x = pWalkerMain->WalkerArray[ walkerPos ].ActualLocalSize.x; 1608 ActualLocalSize.y = pWalkerMain->WalkerArray[ walkerPos ].ActualLocalSize.y; 1609 ActualLocalSize.z = pWalkerMain->WalkerArray[ walkerPos ].ActualLocalSize.z; 1610 1611 WorkDim = pCommandHeader->m_range.m_dispatchDimensions; 1612 TotalLocalSize = ActualLocalSize.x * ActualLocalSize.y * ActualLocalSize.z; 
1613 1614 NumberOfHWThreads = TotalLocalSize / pBlockData->m_SIMDSize; 1615 if( TotalLocalSize % pBlockData->m_SIMDSize ) 1616 { 1617 NumberOfHWThreads++; 1618 } 1619 1620 ThreadPayloadSize = NumberOfHWThreads * 3 * GRF_SIZE; 1621 //Copy constant buffer to designated area on DSH. 1622 1623 //pDshOnKRS seems to be in the global address space, not the private address space 1624 //Copy SamplerState and Constant Buffer at once 1625 IGILLOCAL_MEMCPY_GTOG( pDsh, pDshOnKRS, pBlockData->m_sizeOfConstantBuffer + pBlockData->m_SizeOfSamplerHeap ); 1626 1627 if( PatchMask & SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT_MASK ) 1628 { 1629 while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT ) 1630 { 1631 CurbeSize = pKernelCurbeParams[ CurrentIndex ].m_parameterSize; 1632 SizeOfScalarsFromCurbe += CurbeSize; 1633 PatchOffset = pKernelCurbeParams[ CurrentIndex ].m_patchOffset; 1634 1635 //pScalarData is in the global address space, not the private address space 1636 IGILLOCAL_MEMCPY_GTOG( &pDsh[ PatchOffset ], pScalarData, CurbeSize ); 1637 pScalarData += CurbeSize; 1638 CurrentIndex++; 1639 } 1640#if SCHEDULER_DEBUG_MODE 1641 if( pCommandHeader->m_sizeOfScalarArguments != SizeOfScalarsFromCurbe ) 1642 { 1643 pCommandHeader->m_commandState = SCHEDULER_CURBE_ARGUMENTS_SIZE_MISMATCH; 1644 return -1; 1645 } 1646#endif 1647 } 1648 1649 if( PatchMask & SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE_MASK ) 1650 { 1651 CurrentIndex = PatchLocalWorkSizes( CurrentIndex, 1652 SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE, 1653 pKernelCurbeParams, 1654 pDsh, 1655 pWalkerMain->LocalWorkSize.x, 1656 pWalkerMain->LocalWorkSize.y, 1657 pWalkerMain->LocalWorkSize.z, 1658 ActualLocalSize.x, 1659 ActualLocalSize.y, 1660 ActualLocalSize.z ); 1661#if SCHEDULER_DEBUG_MODE 1662 if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) ) 1663 { 1664 pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED; 1665 return -1; 1666 } 
1667#endif 1668 } 1669 1670 if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE_MASK ) 1671 { 1672 CurrentIndex = PatchDSH6Tokens( CurrentIndex, 1673 SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE, 1674 pKernelCurbeParams, 1675 pDsh, 1676 GlobalSizes.x, 1677 GlobalSizes.y, 1678 GlobalSizes.z ); 1679#if SCHEDULER_DEBUG_MODE 1680 if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) ) 1681 { 1682 pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED; 1683 return -1; 1684 } 1685#endif 1686 } 1687 if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS_MASK ) 1688 { 1689 CurrentIndex = PatchDSH6Tokens( CurrentIndex, 1690 SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS, 1691 pKernelCurbeParams, 1692 pDsh, 1693 pWalkerMain->TotalDimSize.x, 1694 pWalkerMain->TotalDimSize.y, 1695 pWalkerMain->TotalDimSize.z ); 1696#if SCHEDULER_DEBUG_MODE 1697 if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) ) 1698 { 1699 pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED; 1700 return -1; 1701 } 1702#endif 1703 } 1704 if( PatchMask & SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS_MASK ) 1705 { 1706 CurrentIndex = PatchDSH1Token( CurrentIndex, 1707 SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS, 1708 pKernelCurbeParams, 1709 pDsh, 1710 WorkDim ); 1711#if SCHEDULER_DEBUG_MODE 1712 if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) ) 1713 { 1714 pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED; 1715 return -1; 1716 } 1717#endif 1718 } 1719 1720 if( PatchMask & SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES_MASK ) 1721 { 1722 CurrentIndex = PatchLocalMemEntities( CurrentIndex, 1723 SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES, 1724 pKernelCurbeParams, 1725 pDsh, 1726 pCommandHeader ); 1727#if SCHEDULER_DEBUG_MODE 1728 if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) ) 1729 { 1730 
pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED; 1731 return -1; 1732 } 1733#endif 1734 } 1735 1736 if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET_MASK ) 1737 { 1738 CurrentIndex = PatchDSH6Tokens( CurrentIndex, 1739 SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET, 1740 pKernelCurbeParams, 1741 pDsh, 1742 GlobalOffset.x, 1743 GlobalOffset.y, 1744 GlobalOffset.z ); 1745#if SCHEDULER_DEBUG_MODE 1746 if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) ) 1747 { 1748 pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED; 1749 return -1; 1750 } 1751#endif 1752 } 1753 1754 if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS_MASK ) 1755 { 1756 CurrentIndex = PatchDSH1Token( CurrentIndex, 1757 SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS, 1758 pKernelCurbeParams, 1759 pDsh, 1760 NumberOfHWThreads ); 1761#if SCHEDULER_DEBUG_MODE 1762 if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) ) 1763 { 1764 pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED; 1765 return -1; 1766 } 1767#endif 1768 } 1769 1770 if( PatchMask & SCHEDULER_DATA_PARAMETER_PARENT_EVENT_MASK ) 1771 { 1772 CurrentIndex = PatchDSH1Token( CurrentIndex, 1773 SCHEDULER_DATA_PARAMETER_PARENT_EVENT, 1774 pKernelCurbeParams, 1775 pDsh, 1776 pCommandHeader->m_event ); 1777#if SCHEDULER_DEBUG_MODE 1778 if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) ) 1779 { 1780 pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED; 1781 return -1; 1782 } 1783#endif 1784 } 1785 1786 if( PatchMask & SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE_MASK ) 1787 { 1788 CurrentIndex = PatchDSH6Tokens( CurrentIndex, 1789 SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE, 1790 pKernelCurbeParams, 1791 pDsh, 1792 pWalkerMain->LocalWorkSize.x, 1793 pWalkerMain->LocalWorkSize.y, 1794 pWalkerMain->LocalWorkSize.z ); 1795#if SCHEDULER_DEBUG_MODE 1796 if( ( CurrentIndex == -1 ) || ( 
CurrentIndex >= pBlockData->m_numberOfCurbeParams ) ) 1797 { 1798 pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED; 1799 return -1; 1800 } 1801#endif 1802 } 1803 if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER ) 1804 { 1805 if( pCommandHeader->m_numGlobalCapturedBuffer > 0 ) 1806 { 1807 //Handle global pointers patching in stateless mode, info about layout in declaration of IGIL_CommandHeader 1808 __global uint* pGlobalIndexes = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments ] ); 1809 __global uint* pGlobalPtrs = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments + pCommandHeader->m_numGlobalCapturedBuffer ] ); 1810 uint StartIndex = CurrentIndex; 1811 1812 //Argument in command header are not in correct sequence, that's why proper key needs to be located 1813 for( uint glIdx = 0 ; glIdx < pCommandHeader->m_numGlobalCapturedBuffer; glIdx++) 1814 { 1815 //Reset CurrentIndex as we need to start from the beginning. 
1816 CurrentIndex = StartIndex; 1817 while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == COMPILER_DATA_PARAMETER_GLOBAL_SURFACE ) 1818 { 1819 //Patch only if exact match occurs 1820 if( pKernelCurbeParams[ CurrentIndex ].m_sourceOffset == *pGlobalIndexes ) 1821 { 1822 PatchOffset = pKernelCurbeParams[ CurrentIndex ].m_patchOffset; 1823 //64 bit patching 1824 if( pKernelCurbeParams[ CurrentIndex ].m_parameterSize == 8 ) 1825 { 1826 __global uint* pDst = (__global uint *) &pDsh[PatchOffset]; 1827 pDst[ 0 ] = pGlobalPtrs[ 0 ]; 1828 pDst[ 1 ] = pGlobalPtrs[ 1 ]; 1829 } 1830 else 1831 { 1832 __global uint* pDst = ( __global uint* ) &pDsh[ PatchOffset ]; 1833 *pDst = ( uint ) *pGlobalPtrs; 1834 } 1835 } 1836 CurrentIndex++; 1837 } 1838 pGlobalPtrs += 2; 1839 pGlobalIndexes++; 1840 } 1841 } 1842 } 1843 1844 //Now generate local IDS 1845 if( pBlockData->m_SIMDSize == 8 ) 1846 { 1847 generateLocalIDSsimd8( &pDsh[ pBlockData->m_sizeOfConstantBuffer ], ActualLocalSize, NumberOfHWThreads ); 1848 } 1849 else 1850 { 1851 generateLocalIDSsimd16( &pDsh[ pBlockData->m_sizeOfConstantBuffer ], ActualLocalSize, NumberOfHWThreads ); 1852 } 1853 1854 uint TotalSLMSize = pCommandHeader->m_totalLocalSize + pBlockData->m_InilineSLMSize; 1855 1856 //Update Interface Descriptor Data with SLM size / number of HW threads. 
    //--- tail of the preceding (single-group) patch routine; its entry point is above this chunk ---
    //Update Interface Descriptor Data with SLM size / number of HW threads.
    CopyAndPatchIDData( dsh, blockId, NumberOfHWThreads, TotalSLMSize, intefaceDescriptorOffset, pQueue->m_controls.m_StartBlockID );

    //Add WalkerStartSize
    patchGpGpuWalker( pQueue->m_controls.m_SecondLevelBatchOffset, secondaryBatchBuffer, intefaceDescriptorOffset, pBlockData->m_SIMDSize,
                      TotalLocalSize, pWalkerMain->WalkerArray[ walkerPos ].WalkerDimSize, pWalkerMain->WalkerArray[ walkerPos ].WalkerStartPoint,
                      NumberOfHWThreads, pBlockData->m_sizeOfConstantBuffer + ThreadPayloadSize, dshOffset );

    //Patch both MEDIA_STATE_FLUSH commands that bracket the walker in the second level batch.
    PatchMediaStateFlush( pQueue->m_controls.m_SecondLevelBatchOffset, secondaryBatchBuffer, intefaceDescriptorOffset, SCHEDULER_MSF_INITIAL );
    PatchMediaStateFlush( pQueue->m_controls.m_SecondLevelBatchOffset, secondaryBatchBuffer, intefaceDescriptorOffset, SCHEDULER_MSF_SECOND );

    return 0;
}

//Returns: isSRGB(ChannelOrder) ? ChannelOrder : 0;
//Relies on unsigned wrap-around: for channelOrder < CL_sRGB the subtraction wraps to a
//huge value, so only orders in [ CL_sRGB, CL_sBGRA ) pass the range check.
//NOTE(review): CL_sBGRA itself ( AsSrgb == NumSrgbFormats ) is rejected — confirm that
//excluding the last sRGB order is intentional.
inline uint GetSRGBChannelOrder( uint channelOrder )
{
    const uint AsSrgb = channelOrder - CL_sRGB;
    const uint NumSrgbFormats = CL_sBGRA - CL_sRGB;
    if( AsSrgb < NumSrgbFormats )
        return channelOrder;
    else
        return 0;
}

//Cooperative ("parallel 2.0") DSH/SSH/SLB patching path for one enqueued block kernel.
//For every WALKER required by the enqueue it copies the sampler heap + constant buffer
//onto the child DSH, patches curbe tokens, builds the binding table on ssh, generates
//local IDs and patches the GPGPU_WALKER / MEDIA_STATE_FLUSH commands in the secondary
//batch buffer. Work is partitioned across get_local_id( 0 ) ranges; the partition size
//comes from PARALLEL_SCHEDULER_COMPILATION_SIZE_20 (see uses below).
void PatchDSHParallelWithDynamicDSH20( uint slbOffsetBase,
                                       uint dshOffsetBase,
                                       uint intefaceDescriptorOffsetBase,
                                       __global IGIL_KernelDataHeader * pKernelReflection,
                                       __global char* dsh,
                                       uint blockId,
                                       __global IGIL_CommandHeader* pCommandHeader,
                                       __global uint* secondaryBatchBuffer,
                                       __global IGIL_CommandQueue* pQueue,
                                       __global IGIL_EventPool* eventsPool,
                                       __global char* ssh,
                                       uint btOffset,
                                       __local IGIL_WalkerEnumeration* pWalkerEnum,
                                       __local uint* objectIds
#ifdef ENABLE_DEBUG_BUFFER
                                       , __global DebugDataBuffer* DebugQueue
#endif
                                       )
{
    EMULATION_ENTER_FUNCTION( );

    __global IGIL_KernelAddressData* pKernelAddressData = IGIL_GetKernelAddressData( pKernelReflection, blockId );
    __global IGIL_KernelData* pBlockData = IGIL_GetKernelData( pKernelReflection, blockId );
    ulong PatchMask = pBlockData->m_PatchTokensMask;
    uint CurrentIndex = 0;

    //Curbe token descriptors live directly behind the kernel data header.
    __global IGIL_KernelCurbeParams* pKernelCurbeParams = ( __global IGIL_KernelCurbeParams* )&pBlockData->m_data;

    uint NumberOfDepencies = pCommandHeader->m_numDependencies;
    uint PatchOffset;

    uint CurbeSize;

    //Get pointer to the Sampler State
    __global char *pDshOnKRS = GetPtrToKernelReflectionOffset( pKernelAddressData->m_SamplerHeapOffset, pKernelReflection );

    uint WalkerCount = GetWalkerCount( pCommandHeader );
    __global char *pKernelReflectionChar = ( __global char * ) pKernelReflection;
    __global IGIL_KernelCurbeParams* pSSHdata = ( __global IGIL_KernelCurbeParams* )&pKernelReflectionChar[ pKernelAddressData->m_SSHTokensOffset ];

    //WALKER variables that will be propagated to SLB
    uint3 LocalSizes;
    uint3 GlobalSizes;
    uint3 GlobalOffset;
    uint3 EdgeArray;   //remainder ( global % local ) per dimension — non-zero means an edge walker is needed
    uint3 XYZDim;      //number of full workgroups per dimension

    //X is always there
    GlobalOffset.x = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 0 ];
    GlobalSizes.x = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 0 ];
    LocalSizes.x = ( uint )pCommandHeader->m_range.m_localWorkSize[ 0 ];
    EdgeArray.x = GlobalSizes.x % LocalSizes.x;
    uint WorkDim = pCommandHeader->m_range.m_dispatchDimensions;
    XYZDim.x = GlobalSizes.x / LocalSizes.x;

    //Unused dimensions are normalized to size 1 / offset 0 so the walker math below is branch-free.
    if( WorkDim > 1 )
    {
        GlobalOffset.y = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 1 ];
        GlobalSizes.y = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 1 ];
        LocalSizes.y = ( uint )pCommandHeader->m_range.m_localWorkSize[ 1 ];
        EdgeArray.y = GlobalSizes.y % LocalSizes.y;
        XYZDim.y = GlobalSizes.y / LocalSizes.y;

        if( WorkDim > 2 )
        {
            GlobalOffset.z = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 2 ];
            GlobalSizes.z = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 2 ];
            LocalSizes.z = ( uint )pCommandHeader->m_range.m_localWorkSize[ 2 ];
            XYZDim.z = GlobalSizes.z / LocalSizes.z;
            EdgeArray.z = GlobalSizes.z % LocalSizes.z;
        }
        else
        {
            GlobalOffset.z = 0;
            GlobalSizes.z = 1;
            LocalSizes.z = 1;
            EdgeArray.z = 0;
            XYZDim.z = 1;
        }
    }
    else
    {
        GlobalOffset.y = 0;
        GlobalOffset.z = 0;
        GlobalSizes.y = 1;
        GlobalSizes.z = 1;
        LocalSizes.y = 1;
        LocalSizes.z = 1;
        EdgeArray.z = 0;
        EdgeArray.y = 0;
        XYZDim.z = 1;
        XYZDim.y = 1;
    }

    //Single work item fills the shared walker enumeration; may also update WalkerCount.
    if( get_local_id( 0 ) == 0 )
    {
        InitWalkerDataParallel( pWalkerEnum, WorkDim, &WalkerCount, EdgeArray, XYZDim, GlobalSizes, LocalSizes );
    }

    uint SLBOffset = slbOffsetBase;
    uint DshOffset = dshOffsetBase;
    uint IntefaceDescriptorOffset = intefaceDescriptorOffsetBase;
    __global uint* pArgumentIds = NULL;
    __global uint* pObjectIds = NULL;
    __global char* pLocalIdsOnDSH = NULL;

    uint SamplerHeapSize = pBlockData->m_SizeOfSamplerHeap;

    //Object ID is in fact surface state offset for parent in case of surfaces using SSH, copy SSH from parent to child.
    //Copy binding table state of this kernel to allocated place on ssh
    GLOBAL_MEMCPY( &ssh[ btOffset ], &ssh[ pKernelAddressData->m_BTSoffset ] , pKernelAddressData->m_BTSize );

    //One iteration per GPGPU_WALKER command needed for this enqueue.
    for( uint WalkerID = 0; WalkerID < WalkerCount; WalkerID++ )
    {
        //Update the offsets — each walker gets its own SLB slot, interface descriptor and DSH region.
        if( WalkerID > 0 )
        {
            SLBOffset += SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE;
            SLBOffset %= MAX_SLB_OFFSET;   //SLB space is used as a ring buffer
            IntefaceDescriptorOffset++;
            DshOffset += MAX_DSH_SIZE_PER_ENQUEUE;
        }

        __global char* pDsh = ( __global char* )&dsh[ DshOffset ];
        pLocalIdsOnDSH = &pDsh[ pBlockData->m_sizeOfConstantBuffer + SamplerHeapSize ];

        //Copy Sampler State and constant buffer on all threads
        GLOBAL_MEMCPY( pDsh, pDshOnKRS, pBlockData->m_sizeOfConstantBuffer + SamplerHeapSize );

        barrier( CLK_GLOBAL_MEM_FENCE );

        //Update BorderColorPointer on all threads
        if( pBlockData->m_numberOfSamplerStates )
        {
            //Sampler states are distributed across work items, stride = one full HW group.
            uint SamplerId = get_local_id( 0 );
            __global uint* pSamplerState;
            while( SamplerId < pBlockData->m_numberOfSamplerStates )
            {
                pSamplerState = ( __global uint* )&dsh[ DshOffset + pBlockData->m_SamplerStateArrayOffsetOnDSH + SamplerId * OCLRT_SIZEOF_SAMPLER_STATE ];
                //DSH offset expressed in 32-byte units ( >> 5 ); written into the border color pointer field.
                uint PatchValue = DshOffset >> 5;
                //NOTE(review): patchDword arguments presumably select bits 5..31 of the dword — confirm against helper.
                patchDword( &pSamplerState[ SAMPLER_STATE_DESCRIPTOR_BORDER_COLOR_POINTER_DWORD ], 5, 31, PatchValue );
                SamplerId += PARALLEL_SCHEDULER_COMPILATION_SIZE_20 * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20;
            }
        }

        //Setup SSH if needed, do it only for first Walker as all Walkers will re-use the same binding table layout.
        if( ( pCommandHeader->m_numGlobalArguments > 0 ) & ( WalkerID == 0 ) )
        {
            //Global arguments are after scalars, global pointers, slm sizes and events
            uint offset = pCommandHeader->m_numDependencies + pCommandHeader->m_numScalarArguments + pCommandHeader->m_numOfLocalPtrSizes + ( pCommandHeader->m_numGlobalCapturedBuffer * ( sizeof( ulong ) + sizeof( uint ) ) / sizeof( uint ) );

            pArgumentIds = &pCommandHeader->m_data[ offset ];
            //Object IDS are located after Argument IDs
            pObjectIds = &pCommandHeader->m_data[ offset + pCommandHeader->m_numGlobalArguments ];

            //Setup local memory for fast access for Curbe patching
            uint ArgId = get_local_id( 0 );

            //Only third group Updates ObjectIDS, this will be synchronized with condition below
            if( ( ArgId >> HW_GROUP_ID_SHIFT( PARALLEL_SCHEDULER_COMPILATION_SIZE_20 ) ) == 2 )
            {
                //Rebase the work item index into the third group's 0..N range.
                ArgId = ArgId - ( PARALLEL_SCHEDULER_COMPILATION_SIZE_20 << 1 );
                while( ArgId < pCommandHeader->m_numGlobalArguments )
                {
                    objectIds[ pArgumentIds[ ArgId ] ] = pObjectIds[ ArgId ];
                    ArgId += PARALLEL_SCHEDULER_COMPILATION_SIZE_20;
                }
            }
#ifdef SCHEDULER_EMULATION
            //Synchronization needed for Emulation, ObjectIDS needs to be set by whole HW group, on GPU there is implicit synchronization in HW group
            barrier( CLK_GLOBAL_MEM_FENCE );
#endif
            //Single designated work item builds the binding table.
            if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 * 2 )
            {
                __global uint* pBindingTable = ( __global uint* ) &ssh[ btOffset ];

                //To properly set up binding table, point to parent's surface state heap
                for( uint ArgumentID = 0 ; ArgumentID < pCommandHeader->m_numGlobalArguments; ArgumentID++ )
                {
                    uint ArgId = pArgumentIds[ ArgumentID ];

                    //Locate proper Arg ID
                    //Get ssh offset, lookup table already provided
                    if( objectIds[ ArgId ] < MAX_SSH_PER_KERNEL_SIZE )
                    {
                        //Fast path: token table is indexed directly by argument id.
                        if( pSSHdata[ ArgId ].m_sourceOffset == ArgId )
                        {
                            pBindingTable[ pSSHdata[ ArgId ].m_patchOffset ] = objectIds[ ArgId ];
                        }
                        else
                        {
                            //Slow path: record the anomaly and linearly search the token table for the argument.
                            pQueue->m_controls.m_ErrorCode += 10;
                            uint CurrentArg = 0;
                            while( CurrentArg < pKernelAddressData->m_BTSize / 4 )
                            {
                                if( pSSHdata[ CurrentArg ].m_sourceOffset == ArgId )
                                {
                                    pBindingTable[ pSSHdata[ CurrentArg ].m_patchOffset ] = objectIds[ ArgId ];
                                    break;
                                }
                                CurrentArg++;
                            }
                        }
                    }
                }
            }
        }

        if( ( PatchMask & SCHEDULER_DATA_PARAMETER_SAMPLER_MASK ) )
        {
            //Same designated work item copies parent sampler states into the child heap.
            if( get_local_id( 0 ) == 2 * PARALLEL_SCHEDULER_COMPILATION_SIZE_20 )
            {
                for( uint ArgumentID = 0; ArgumentID < pCommandHeader->m_numGlobalArguments; ArgumentID++ )
                {
                    uint ArgId = pArgumentIds[ ArgumentID ];
                    //Object ids >= MAX_SSH_PER_KERNEL_SIZE encode sampler heap offsets ( not SSH surfaces ).
                    if( ( objectIds[ ArgId ] >= MAX_SSH_PER_KERNEL_SIZE ) )
                    {
                        uint SamplerCount = 0;
                        //Get pointer to Parent's samplers ( arguments ) data stored on KRS
                        __global IGIL_SamplerParams* pSamplerParamsOnKRS = ( __global IGIL_SamplerParams* )GetPtrToKernelReflectionOffset( pKernelAddressData->m_SamplerParamsOffset, pKernelReflection );

                        //Iterate through all samplers passed from parent and copy state to proper SSA offset
                        while( pKernelReflection->m_ParentSamplerCount > SamplerCount )
                        {
                            //Get offset in parent's SSA from ObjectID, offset to beginning of SSA is included ( before SSA is BorderColorPointer ) so this is relative to parent's DSH heap
                            PatchOffset = objectIds[ ArgId ] - MAX_SSH_PER_KERNEL_SIZE;

                            if( pSamplerParamsOnKRS->m_ArgID == ArgId )
                            {
                                IGILLOCAL_MEMCPY_GTOG( &pDsh[ pSamplerParamsOnKRS->m_SamplerStateOffset ], &dsh[ pQueue->m_controls.m_ParentDSHOffset + PatchOffset ], OCLRT_SIZEOF_SAMPLER_STATE );
                                break;
                            }
                            pSamplerParamsOnKRS = pSamplerParamsOnKRS + 1;
                            SamplerCount = SamplerCount + 1;
                        }
                    }
                }
            }
        }
        //Scalar ( by-value ) kernel arguments start right after the dependency events.
        __global char* pScalarData = ( __global char* ) ( &pCommandHeader->m_data[ NumberOfDepencies ] );
        //Restart the curbe token scan for this walker.
        CurrentIndex = 0;
        uint TotalLocalSize = pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.x *
                              pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.y *
                              pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.z;

        //Round up so a partial SIMD group still gets a HW thread.
        uint NumberOfHWThreads = TotalLocalSize / pBlockData->m_SIMDSize;

        if( TotalLocalSize % pBlockData->m_SIMDSize != 0 )
        {
            NumberOfHWThreads++;
        }

        uint ThreadPayloadSize = NumberOfHWThreads * pBlockData->m_PayloadSize;

        //Move pointer to Constant Buffer Offset ( skip the sampler heap at the start of the child DSH region ).
        pDsh = ( __global char* )&dsh[ DshOffset + SamplerHeapSize ];

        //Second HW group ( 6 work items of it ) performs the curbe token patching below.
        if( ( get_local_id( 0 ) >= PARALLEL_SCHEDULER_COMPILATION_SIZE_20 ) & ( get_local_id( 0 ) < PARALLEL_SCHEDULER_COMPILATION_SIZE_20 + 6 ) )
        {
            //Tokens are ordered: kernel arguments first, then the scheduler tokens below — each Patch* helper
            //advances CurrentIndex past the tokens it consumed.
            if( PatchMask & SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT_MASK )
            {
                while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT )
                {
                    CurbeSize = pKernelCurbeParams[ CurrentIndex ].m_parameterSize;
                    PatchOffset = pKernelCurbeParams[ CurrentIndex ].m_patchOffset;
                    IGILLOCAL_MEMCPY_GTOG( &pDsh[ PatchOffset ], pScalarData, CurbeSize );
                    pScalarData += CurbeSize;
                    CurrentIndex++;
                }
            }
            if( PatchMask & SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE_MASK )
            {
                CurrentIndex = PatchLocalWorkSizesParallel( CurrentIndex,
                                                            SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE,
                                                            pKernelCurbeParams,
                                                            pDsh,
                                                            LocalSizes.x,
                                                            LocalSizes.y,
                                                            LocalSizes.z,
                                                            pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.x,
                                                            pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.y,
                                                            pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.z );
            }
            if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE_MASK )
            {
                CurrentIndex = PatchDSH6TokensParallel20( CurrentIndex,
                                                          SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE,
                                                          pKernelCurbeParams,
                                                          pDsh,
                                                          GlobalSizes.x,
                                                          GlobalSizes.y,
                                                          GlobalSizes.z );
            }
            if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS_MASK )
            {
                CurrentIndex = PatchDSH6TokensParallel20( CurrentIndex,
                                                          SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS,
                                                          pKernelCurbeParams,
                                                          pDsh,
                                                          pWalkerEnum->TotalDimSize.x,
                                                          pWalkerEnum->TotalDimSize.y,
                                                          pWalkerEnum->TotalDimSize.z );
            }
            if( PatchMask & SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS_MASK )
            {
                CurrentIndex = PatchDSH1TokenParallel20( CurrentIndex,
                                                         SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS,
                                                         pKernelCurbeParams,
                                                         pDsh,
                                                         WorkDim );
            }
            if( PatchMask & SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES_MASK )
            {
                CurrentIndex = PatchLocalMemEntities( CurrentIndex,
                                                      SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES,
                                                      pKernelCurbeParams,
                                                      pDsh,
                                                      pCommandHeader );
            }
            if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET_MASK )
            {
                CurrentIndex = PatchDSH6TokensParallel20( CurrentIndex,
                                                          SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET,
                                                          pKernelCurbeParams,
                                                          pDsh,
                                                          GlobalOffset.x,
                                                          GlobalOffset.y,
                                                          GlobalOffset.z );
            }
            if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS_MASK )
            {
                CurrentIndex = PatchDSH1TokenParallel20( CurrentIndex,
                                                         SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS,
                                                         pKernelCurbeParams,
                                                         pDsh,
                                                         NumberOfHWThreads );
            }
            if( PatchMask & SCHEDULER_DATA_PARAMETER_PARENT_EVENT_MASK )
            {
                CurrentIndex = PatchDSH1TokenParallel20( CurrentIndex,
                                                         SCHEDULER_DATA_PARAMETER_PARENT_EVENT,
                                                         pKernelCurbeParams,
                                                         pDsh,
                                                         pCommandHeader->m_event );
            }
            if( PatchMask & SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE_MASK )
            {
                CurrentIndex = PatchDSH6TokensParallel20( CurrentIndex,
                                                          SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE,
                                                          pKernelCurbeParams,
                                                          pDsh,
                                                          LocalSizes.x,
                                                          LocalSizes.y,
                                                          LocalSizes.z );
            }
            if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER )
            {
                if( pCommandHeader->m_numGlobalCapturedBuffer > 0 )
                {
                    //Handle global pointers patching in stateless mode, info about layout in declaration of IGIL_CommandHeader
                    __global uint* pGlobalIndexes = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments ] );
                    __global uint* pGlobalPtrs = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments + pCommandHeader->m_numGlobalCapturedBuffer ] );
                    uint StartIndex = CurrentIndex;

                    //Arguments in command header are not in correct sequence, that's why the proper key needs to be located
                    for( uint glIdx = 0 ; glIdx < pCommandHeader->m_numGlobalCapturedBuffer; glIdx++)
                    {
                        //Reset CurrentIndex as we need to start from the beginning.
                        CurrentIndex = StartIndex;
                        while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == COMPILER_DATA_PARAMETER_GLOBAL_SURFACE )
                        {
                            //Patch only if exact match occurs
                            if( pKernelCurbeParams[ CurrentIndex ].m_sourceOffset == *pGlobalIndexes )
                            {
                                PatchOffset = pKernelCurbeParams[ CurrentIndex ].m_patchOffset;
                                //64 bit patching — pointers are stored as a ulong ( two dwords ) in the payload.
                                if( pKernelCurbeParams[ CurrentIndex ].m_parameterSize == 8 )
                                {
                                    __global uint* pDst = (__global uint *) &pDsh[PatchOffset];
                                    pDst[0] = pGlobalPtrs[0];
                                    pDst[1] = pGlobalPtrs[1];
                                }
                                else
                                {
                                    //32 bit patching — only the low dword of the captured pointer is written.
                                    __global uint* pDst = ( __global uint* ) &pDsh[ PatchOffset ];
                                    *pDst = ( uint ) *pGlobalPtrs;
                                }
                            }
                            CurrentIndex++;
                        }
                        pGlobalPtrs += 2;
                        pGlobalIndexes++;
                    }
                }
                //Skip any remaining global-surface tokens so CurrentIndex points past this token class.
                while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == COMPILER_DATA_PARAMETER_GLOBAL_SURFACE )
                {
                    CurrentIndex++;
                }
            }

            //Patch images curbe entries
            if( ( PatchMask & SCHEDULER_DATA_PARAMETER_IMAGE_CURBE_ENTRIES ) | ( PatchMask & SCHEDULER_DATA_PARAMETER_SAMPLER_MASK ) )
            {
                //pArgumentIds / pObjectIds were set up during the first walker's SSH pass; if they are
                //still NULL there were no global arguments and the token data cannot be resolved.
                if( ( pArgumentIds != NULL ) & ( pObjectIds != NULL ) )
                {
                    //pKernelReflectionChar is a global address pointer
                    __global IGIL_ImageParamters *pImageParams = ( __global IGIL_ImageParamters * ) &pKernelReflectionChar[ pKernelReflection->m_ParentImageDataOffset ];
                    __global IGIL_ParentSamplerParams *pParentSamplerParams = ( __global IGIL_ParentSamplerParams* ) &pKernelReflectionChar[ pKernelReflection->m_ParentSamplerParamsOffset ];
                    //First obtain argument ID; the 6 work items of this group stride the token list together.
                    uint WorkID = get_local_id( 0 ) - PARALLEL_SCHEDULER_COMPILATION_SIZE_20;
                    while( WorkID + CurrentIndex < pBlockData->m_numberOfCurbeTokens )
                    {
                        uint ArgId = pKernelCurbeParams[ CurrentIndex + WorkID ].m_sourceOffset;
                        uint ObjectID = objectIds[ ArgId ];
                        uint CurrentImage = 0;
                        uint CurrentSampler = 0;

                        uint PatchValue = 0;
                        uint TokenType = pKernelCurbeParams[ CurrentIndex + WorkID ].m_parameterType;
                        uint PatchOffset = pKernelCurbeParams[ CurrentIndex + WorkID ].m_patchOffset;
                        uint PatchValueInvalid = 0;

                        //If Images — object ids below OCLRT_IMAGE_MAX_OBJECT_ID refer to parent images.
                        if( ObjectID < OCLRT_IMAGE_MAX_OBJECT_ID )
                        {
                            //Locate proper parent Image
                            while( ( pImageParams[ CurrentImage ].m_ObjectID != ObjectID ) & ( CurrentImage < pKernelReflection->m_ParentKernelImageCount ) )
                            {
                                CurrentImage++;
                            }
                            //Proper image is located under CurrentImage, patch the token

                            if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_WIDTH )
                            {
                                PatchValue = pImageParams[ CurrentImage ].m_Width;
                            }
                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_HEIGHT )
                            {
                                PatchValue = pImageParams[ CurrentImage ].m_Height;
                            }
                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_DEPTH )
                            {
                                PatchValue = pImageParams[ CurrentImage ].m_Depth;
                            }
                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE )
                            {
                                PatchValue = pImageParams[ CurrentImage ].m_ChannelDataType;
                            }
                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_ORDER )
                            {
                                PatchValue = pImageParams[ CurrentImage ].m_ChannelOrder;
                            }
                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_SRGB_CHANNEL_ORDER )
                            {
                                //0 when the channel order is not an sRGB one ( see GetSRGBChannelOrder ).
                                PatchValue = GetSRGBChannelOrder( pImageParams[ CurrentImage ].m_ChannelOrder );
                            }
                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_ARRAY_SIZE )
                            {
                                PatchValue = pImageParams[ CurrentImage ].m_ArraySize;
                            }
                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_NUM_SAMPLES )
                            {
                                PatchValue = pImageParams[ CurrentImage ].m_NumSamples;
                            }
                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_NUM_MIP_LEVELS )
                            {
                                PatchValue = pImageParams[ CurrentImage ].m_NumMipLevels;
                            }
                            else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_OBJECT_ID )
                            {
                                PatchValue = ObjectID;
                            }
                            else
                            {
                                PatchValueInvalid = 1;
                            }
                        }
                        //If Sampler
                        else if( ObjectID >= OCLRT_SAMPLER_MIN_OBJECT_ID )
                        {
                            //Mark PatchValue invalid if SamplerParams will not be found
                            PatchValueInvalid = 1;
                            //Locate matching parent sampler params
                            while( CurrentSampler < pKernelReflection->m_ParentSamplerCount )
                            {
                                if( pParentSamplerParams[ CurrentSampler ].m_ObjectID == ObjectID )
                                {
                                    PatchValueInvalid = 0;
                                    if( TokenType == DATA_PARAMETER_SAMPLER_ADDRESS_MODE )
                                    {
                                        PatchValue = pParentSamplerParams[ CurrentSampler ].m_AddressingMode;
                                    }
                                    else if( TokenType == DATA_PARAMETER_SAMPLER_NORMALIZED_COORDS )
                                    {
                                        PatchValue = pParentSamplerParams[ CurrentSampler ].NormalizedCoords;
                                    }
                                    else if( TokenType == DATA_PARAMETER_SAMPLER_COORDINATE_SNAP_WA_REQUIRED )
                                    {
                                        PatchValue = pParentSamplerParams[ CurrentSampler ].CoordinateSnapRequired;
                                    }
                                    else if( TokenType == SCHEDULER_DATA_PARAMETER_SAMPLER_OBJECT_ID )
                                    {
                                        PatchValue = ObjectID;
                                    }
                                    else
                                    {
                                        PatchValueInvalid = 1;
                                    }
                                    //Terminate the search loop ( CurrentSampler++ below pushes past the count ).
                                    CurrentSampler = pKernelReflection->m_ParentSamplerCount;
                                }
                                CurrentSampler++;
                            }
                        }
                        else
                        {
                            //Object id is neither image nor sampler — nothing to patch.
                            PatchValueInvalid = 1;
                        }

                        if( PatchValueInvalid == 0 )
                        {
                            *( __global uint * ) ( &pDsh[ PatchOffset ] ) = PatchValue;
                        }
                        //6 work items cooperate, each handles every 6th token.
                        CurrentIndex += 6;
                    }

                }
                else
                {
                    //Token data could not be resolved — record the error for the host.
                    pQueue->m_controls.m_ErrorCode += 7;
                }
            }
        }
#ifdef SCHEDULER_EMULATION
        barrier( CLK_GLOBAL_MEM_FENCE );
#endif

        //First work item patches pipe controls ( profiling ) and preemption LRIs in the SLB.
        if( get_local_id( 0 ) == 0 )
        {
#if defined WA_LRI_COMMANDS_EXIST
            bool ShouldDisablePreemption = false;
#endif
            //Profiling support
            if( pQueue->m_controls.m_IsProfilingEnabled != 0 )
            {
                bool DisableTimeStampStart = true;
                bool DisableTimeStampEnd = true;
                if( ( ( uint )pCommandHeader->m_event != IGIL_EVENT_INVALID_HANDLE ) & ( ( WalkerID == 0 ) | ( WalkerID == WalkerCount - 1 ) ) )
                {
                    //Event is propagated to children as "parent event"; to avoid overwriting the same start value, only generate the timestamp write
                    //for the first command for this event — this means we look for an event with no children ( so compare to 1 ).
                    clk_event_t EventID = __builtin_astype( ( void* ) ( ( ulong ) pCommandHeader->m_event ), clk_event_t );
                    __global IGIL_DeviceEvent *events = TEMP_IGIL_GetDeviceEvents( eventsPool );

                    if( events[ ( uint )(size_t)__builtin_astype( EventID, void* ) ].m_numChildren == 1 )
                    {
#if defined WA_LRI_COMMANDS_EXIST && defined WA_PROFILING_PREEMPTION
                        //This is a case where profiling of block kernels occurs - presence of event in EM workload.
                        //In such case, disable preemption around all WALKERs for that block kernel and event.
                        ShouldDisablePreemption = true;
#endif
                        if( WalkerID == 0 )
                        {
                            //Emit pipecontrol with timestamp write
                            ulong Address = ( ulong )&( events[ ( uint )(size_t)__builtin_astype( EventID, void* ) ].m_profilingCmdStart );
                            //Timestamp start
                            PatchPipeControlProfilingAddres( secondaryBatchBuffer,
                                                            SLBOffset,
                                                            Address,
                                                            PIPE_CONTROL_FOR_TIMESTAMP_START_OFFSET );
                            DisableTimeStampStart = false;
                        }
                        if( WalkerID == WalkerCount - 1 )
                        {
                            ulong Address = ( ulong )&( events[ ( uint )(size_t)__builtin_astype( EventID, void* ) ].m_profilingCmdEnd );
                            //Timestamp end
                            PatchPipeControlProfilingAddres( secondaryBatchBuffer,
                                                            SLBOffset,
                                                            Address,
                                                            PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
                            DisableTimeStampEnd = false;
                        }
                    }
                }
                //Any timestamp slot not used by this walker gets its post-sync write disabled.
                if( DisableTimeStampStart )
                {
                    DisablePostSyncBitInPipeControl( secondaryBatchBuffer,
                                                     SLBOffset,
                                                     PIPE_CONTROL_FOR_TIMESTAMP_START_OFFSET );
                }
                if( DisableTimeStampEnd )
                {
                    DisablePostSyncBitInPipeControl( secondaryBatchBuffer,
                                                     SLBOffset,
                                                     PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
                }
            }
            else
            {
                //Optimized path: in case the block can be run concurrently, noop the pipe control after such block.
                uint DwordOffset = SLBOffset / DWORD_SIZE_IN_BYTES;

                if( pBlockData->m_CanRunConcurently != 0 )
                {
                    NOOPCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
                }
                else
                {
                    PutCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
                }
            }

#if defined WA_LRI_COMMANDS_EXIST
            bool NoopPreemptionDisabling = true;
            bool NoopPreemptionEnabling = true;

#if defined WA_KERNEL_PREEMPTION
            //This is the case where a block kernel should have preemption disabled because of its sampler usage, around all WALKERs of that block kernel.
            //Preemption should be disabled when EM event profiling is used OR kernel data indicates such behavior.
            ShouldDisablePreemption |= ( pBlockData->m_DisablePreemption != 0 );
#endif

#if defined WA_PROFILING_PREEMPTION
            //m_EventTimestampAddress != NULL means profiling of the whole workload is enabled (preemption around whole chained BB is disabled).
            //So disabling preemption should be permitted only when workload profiling is off; in other cases noop all LRI commands.
            //For m_EventTimestampAddress != NULL preemption is enabled before BB_END.
            ShouldDisablePreemption &= ( pQueue->m_controls.m_EventTimestampAddress == 0 );
#endif

            if( ShouldDisablePreemption != false )
            {
                //Disable before the first walker, re-enable after the last one.
                if( WalkerID == 0 )
                {
                    SetDisablePreemptionRegister( SLBOffset, secondaryBatchBuffer );
                    NoopPreemptionDisabling = false;
                }

                if( WalkerID == WalkerCount - 1 )
                {
                    SetEnablePreemptionRegister( SLBOffset, secondaryBatchBuffer );
                    NoopPreemptionEnabling = false;
                }
            }

            //Unused LRI slots must be noop'ed so stale commands are not executed.
            if( NoopPreemptionDisabling )
            {
                NoopPreemptionCommand( SLBOffset, IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
            }

            if( NoopPreemptionEnabling )
            {
                NoopPreemptionCommand( SLBOffset, IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
            }
#endif //WA_LRI_COMMANDS_EXIST
        }

        //Work items from 0 to 16 are responsible for local ids generation.
        if( ( get_local_id( 0 ) < 16 ) & ( pBlockData->m_NeedLocalIDS != 0 ) )
        {
            //Now generate local IDS
            generateLocalIDSParallel20( pLocalIdsOnDSH, pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize, NumberOfHWThreads, pBlockData->m_SIMDSize );
        }
        //3rd HW thread will take care of patching media curbe load and GPGPU_WALKER command
        if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 * 2 )
        {

            uint TotalSLMSize = pCommandHeader->m_totalLocalSize + pBlockData->m_InilineSLMSize;
            //Update Interface Descriptor Data with SLM size / number of HW threads.
            CopyAndPatchIDData20(dsh, blockId, NumberOfHWThreads, TotalSLMSize, IntefaceDescriptorOffset,
                                 pQueue->m_controls.m_StartBlockID, btOffset, DshOffset,
                                 pBlockData->m_numberOfSamplerStates
#ifdef ENABLE_DEBUG_BUFFER
                                 ,
                                 DebugQueue
#endif
                                 );

            patchGpGpuWalker( SLBOffset, secondaryBatchBuffer, IntefaceDescriptorOffset, pBlockData->m_SIMDSize,
                              TotalLocalSize, pWalkerEnum->WalkerArray[ WalkerID ].WalkerDimSize, pWalkerEnum->WalkerArray[ WalkerID ].WalkerStartPoint,
                              NumberOfHWThreads, pBlockData->m_sizeOfConstantBuffer + ThreadPayloadSize, SamplerHeapSize + DshOffset );

            PatchMediaStateFlush( SLBOffset, secondaryBatchBuffer, IntefaceDescriptorOffset, SCHEDULER_MSF_INITIAL );
            PatchMediaStateFlush( SLBOffset, secondaryBatchBuffer, IntefaceDescriptorOffset, SCHEDULER_MSF_SECOND );
        }
    }
}

//Returns 1 when any dependency event of the command is not yet CL_COMPLETE, 0 otherwise
//( including the no-dependencies case ).
uint CheckEventStatus( __global IGIL_CommandHeader* pCommand,
                       __global IGIL_EventPool* eventsPool )
{
    if( pCommand->m_numDependencies == 0 )
    {
        return 0;
    }
    else
    {
        __global IGIL_DeviceEvent* pDeviceEvent;
        //Dependency event handles are stored at the beginning of the command packet
dynamic payload 2580 for( uint i = 0; i < pCommand->m_numDependencies; i++ ) 2581 { 2582 pDeviceEvent = TEMP_IGIL_GetDeviceEvent( eventsPool, pCommand->m_data[ i ] ); 2583 if( pDeviceEvent->m_state != CL_COMPLETE ) 2584 { 2585 return 1; 2586 } 2587 } 2588 } 2589 return 0; 2590} 2591 2592void DecreaseEventDependenciesParallel( __global IGIL_CommandHeader* pCommand, 2593 __global IGIL_EventPool* eventsPool ) 2594{ 2595 __global IGIL_DeviceEvent* pDeviceEvent; 2596 2597 //Events are stored at the begining of command packet dynamic payload 2598 for( uint i = 0; i < pCommand->m_numDependencies; i++ ) 2599 { 2600 pDeviceEvent = TEMP_IGIL_GetDeviceEvent( eventsPool, pCommand->m_data[ i ] ); 2601 int OldDependants = atomic_dec( &pDeviceEvent->m_numDependents ); 2602 2603 if( ( pDeviceEvent->m_refCount <= 0 ) & 2604 ( ( OldDependants - 1 ) <= 0 ) & 2605 ( pDeviceEvent->m_numChildren <= 0 ) ) 2606 { 2607 TEMP_IGIL_FreeEvent( __builtin_astype( ( void* )( ( ulong )pCommand->m_data[ i ] ), clk_event_t ), eventsPool ); 2608 } 2609 } 2610} 2611 2612//Update status of the event and all events that are depending on this event 2613void UpdateEventsTreeStatusParallel( clk_event_t eventId, __global IGIL_EventPool* eventsPool, bool isProfilingEnabled ) 2614{ 2615 __global IGIL_DeviceEvent *events = TEMP_IGIL_GetDeviceEvents( eventsPool ); 2616 __global IGIL_DeviceEvent *pEvent; 2617 do 2618 { 2619 pEvent = &events[ (uint) (size_t)__builtin_astype( eventId, void* ) ]; 2620 2621 int OldNumChild = atomic_dec( &pEvent->m_numChildren ); 2622 if( ( OldNumChild - 1 ) <= 0 ) 2623 { 2624 pEvent->m_state = CL_COMPLETE; 2625 2626 if( ( pEvent->m_refCount <= 0 ) & 2627 ( pEvent->m_numDependents <= 0 ) & 2628 ( pEvent->m_numChildren <= 0 ) ) 2629 { 2630 TEMP_IGIL_FreeEvent( eventId, eventsPool ); 2631 } 2632 //This event transitions to CL_COMPLETE state, update it profiling informations. 
            if( isProfilingEnabled != 0 )
            {
                //CL COMPLETE time is before this scheduler starts
                pEvent->m_profilingCmdComplete = eventsPool->m_CLcompleteTimestamp;

                //Check if this event has profiling pointer, if so update profiling data, all times should be there atm
                if( pEvent->m_pProfiling != 0 )
                {
                    __global ulong* retValues = ( __global ulong * )pEvent->m_pProfiling;

                    ulong StartTime = pEvent->m_profilingCmdStart;
                    ulong EndTime = pEvent->m_profilingCmdEnd;
                    ulong CompleteTime = pEvent->m_profilingCmdComplete;
                    ulong CLEndTransitionTime = 0;
                    ulong CLCompleteTransitionTime = 0;
                    //Check if timer didn't reset by hitting max value
                    if( CompleteTime > StartTime )
                    {
                        CLEndTransitionTime = EndTime - StartTime;
                        CLCompleteTransitionTime = CompleteTime - StartTime;
                    }
                    //If we hit this else it means that GPU timer reset to 0, compute proper delta
                    else
                    {
                        //END may or may not have wrapped together with COMPLETE.
                        if( EndTime < StartTime )
                        {
                            CLEndTransitionTime = PROFILING_MAX_TIMER_VALUE - StartTime + EndTime;
                        }
                        else
                        {
                            CLEndTransitionTime = EndTime - StartTime;
                        }
                        CLCompleteTransitionTime = PROFILING_MAX_TIMER_VALUE - StartTime + CompleteTime;
                    }
                    //First value is END - START timestamp
                    retValues[ 0 ] = ( ulong )( ( float )CLEndTransitionTime * eventsPool->m_TimestampResolution );
                    //Second value is COMPLETE - START timestamp
                    retValues[ 1 ] = ( ulong )( ( float )CLCompleteTransitionTime * eventsPool->m_TimestampResolution );
                }
            }
            //Signal parent because we completed
            eventId = __builtin_astype( ( void* )( ( ulong )pEvent->m_parentEvent ), clk_event_t );
        }
    }
    //Continue up the chain while the parent handle is valid and this event has no children left.
    while ( ( ( uint )(size_t)__builtin_astype( eventId, void* ) != IGIL_EVENT_INVALID_HANDLE ) & ( pEvent->m_numChildren <= 0 ) );
}

//Software barrier across ALL work-groups of the dispatch, implemented with a flag
//per group on syncSurface plus spin-waiting. NOTE(review): correctness relies on all
//work-groups being resident on the device concurrently — TODO confirm dispatch guarantees.
void GlobalBarrier( __global volatile uint* syncSurface )
{
    //Make sure each WKG item hit the barrier.
    barrier( CLK_GLOBAL_MEM_FENCE );

    //Now first thread of each wkg writes to designated place on SyncSurface
    if ( get_local_id( 0 ) == 0 )
    {
        syncSurface[ get_group_id( 0 ) ] = 1;
    }
    //Higher wkg ids tend to not have work to do in all cases, therefore I choose last wkg to wait for the others, as it is most likely it will hit this code sooner.
    if( get_group_id( 0 ) == ( get_num_groups( 0 ) - 1 ) )
    {
        //24 -48 case
        uint Value;
        do
        {
            //AND together all group flags; result stays 0 until every group has signalled.
            Value = 1;
            for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
            {
                Value = Value & syncSurface[ i ];
            }

        }
        while( Value == 0 );
        barrier( CLK_GLOBAL_MEM_FENCE );

        //Release every waiting group by clearing its flag.
        for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
        {
            syncSurface[ i ] = 0;
        }
    }

    //Each group's leader spins until the last group clears this group's flag.
    if( get_local_id( 0 ) == 0 )
    {
        while( syncSurface[ get_group_id( 0 ) ] != 0 );
    }
    barrier( CLK_GLOBAL_MEM_FENCE );
}

//Same all-groups software barrier as GlobalBarrier, but the last work-group also
//snapshots m_CurrentIDToffset into m_IDTAfterFirstPhase while every other group is parked.
void GlobalBarrierUpdateQueue( __global volatile uint* syncSurface, __global IGIL_CommandQueue* pQueue )
{
    //Make sure each WKG item hit the barrier.
    barrier( CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
    //Now first thread of each wkg writes to designated place on SyncSurface
    if ( get_local_id(0) == 0 )
    {
        syncSurface[get_group_id(0)] = 1;
    }
    //Higher wkg ids tend to not have work to do in all cases, therefore I choose last wkg to wait for the others, as it is most likely it will hit this code sooner.
    if( get_group_id(0) == ( get_num_groups( 0 ) - 1 ) )
    {
        uint Value;
        do
        {
            //AND of all group flags — 0 until every group has signalled arrival.
            Value = 1;
            for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
            {
                Value = Value & syncSurface[ i ];
            }
        }
        while( Value == 0 );
        barrier( CLK_GLOBAL_MEM_FENCE );
        //All groups are parked here, so this single write is race-free.
        pQueue->m_controls.m_IDTAfterFirstPhase = pQueue->m_controls.m_CurrentIDToffset;

        barrier( CLK_GLOBAL_MEM_FENCE );

        //Release the other groups.
        for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
        {
            syncSurface[ i ] = 0;
        }
    }

    if( get_local_id(0) == 0 )
    {
        while( syncSurface[ get_group_id(0) ] != 0 );
    }
    barrier( CLK_GLOBAL_MEM_FENCE );
}


//In emulation the scheduler's per-group scratch lives at file scope; on device the same
//variables are declared __local inside the kernel (see SchedulerParallel20 body).
#ifdef SCHEDULER_EMULATION
__local int IDTOffset;
__local int DSHOffset;
__local int SLBOffset;
__local int StackOffset;
__local int QStorageOffset;
__local int MarkerOffset;
__local int BTSoffset;
__local IGIL_WalkerEnumeration WalkerEnum;
__local uint ObjectIDS[ MAX_GLOBAL_ARGS ];
#endif

//Workaround cap on interface-descriptor slots usable per scheduler pass.
#define WA_INT_DESC_MAX 62
//Device-side enqueue scheduler: drains new command packets from queue_t, schedules
//ready blocks into the second-level batch buffer, retries stalled packets from the
//stack storage, then re-enqueues itself (or returns to host when idle).
__kernel __attribute__((intel_reqd_sub_group_size(PARALLEL_SCHEDULER_COMPILATION_SIZE_20)))
void SchedulerParallel20(
    __global IGIL_CommandQueue* pQueue,
    __global uint* commandsStack,
    __global IGIL_EventPool* eventsPool,
    __global uint* secondaryBatchBuffer,        //SLB that will be used to put commands in.
    __global char* dsh,                         //Pointer to the start of Dynamic State Heap
    __global IGIL_KernelDataHeader* kernelData, //This is kernel reflection surface
    __global volatile uint* queueStorageBuffer,
    __global char* ssh,                         //Pointer to Surface state heap with BT and SS
    __global DebugDataBuffer* debugQueue )
{
    EMULATION_ENTER_FUNCTION( );
#ifdef WA_DISABLE_SCHEDULERS
    return;
#endif

#ifdef DEBUG
    //Early return enabled when m_SchedulerEarlyReturn is > 0,
    if( pQueue->m_controls.m_SchedulerEarlyReturn > 0 )
    {
        if( pQueue->m_controls.m_SchedulerEarlyReturn == 1 )
        {
            return;
        }

        if( get_global_id( 0 ) == 0 )
        {
            pQueue->m_controls.m_SchedulerEarlyReturnCounter++;
        }
        GlobalBarrier( queueStorageBuffer );

        if( pQueue->m_controls.m_SchedulerEarlyReturnCounter == pQueue->m_controls.m_SchedulerEarlyReturn )
        {
            //group 1 when multiple groups exist, group 0 otherwise — single designated leader.
            if( ( ( get_group_id( 0 ) == 1) == ( get_num_groups( 0 ) > 1 ) ) & ( get_local_id( 0 ) == 0 ) )
            {
#ifdef ENABLE_DEBUG_BUFFER
                //Set START time of current (last) scheduler
                //NOTE(review): 'DebugQueue' / 'EventsPool' do not match the kernel parameters
                //'debugQueue' / 'eventsPool' — this #ifdef DEBUG+ENABLE_DEBUG_BUFFER path likely
                //fails to compile when enabled; confirm and fix capitalization.
                if( ( pQueue->m_controls.m_IsProfilingEnabled != 0 ) & ( DebugQueue != 0 ) & ( DebugQueue->m_flags == DDB_SCHEDULER_PROFILING ) )
                {
                    *( ( __global ulong * ) ( &DebugQueue->m_data[ atomic_add( &DebugQueue->m_offset, 2 ) ] ) ) = EventsPool->m_CLcompleteTimestamp;
                }
#endif
                pQueue->m_controls.Temporary[ 2 ]++;

                //SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
                __private uint DwordOffset = ( pQueue->m_controls.m_SecondLevelBatchOffset % MAX_SLB_OFFSET ) / DWORD_SIZE_IN_BYTES;
                //BB_START 1st DWORD
                secondaryBatchBuffer[ DwordOffset ] = OCLRT_BATCH_BUFFER_BEGIN_CMD_DWORD0;
                DwordOffset++;
                //BB_START 2nd DWORD - Address, 3rd DWORD Address high
                secondaryBatchBuffer[ DwordOffset++ ] = (uint)(pQueue->m_controls.m_CleanupSectionAddress & 0xFFFFFFFF);
                secondaryBatchBuffer[ DwordOffset ]
                    = (uint)((pQueue->m_controls.m_CleanupSectionAddress >> 32) & 0xFFFFFFFF);
            }
            return;
        }
    }
#endif

    //First check if there are any new command packets on queue_t
    __global IGIL_CommandHeader* pCommand = 0;
    uint GroupID = get_group_id( 0 );

#ifndef SCHEDULER_EMULATION
    //Per-work-group scratch shared by the scheduling phases below
    //(file-scope __local equivalents exist for SCHEDULER_EMULATION builds).
    __local int IDTOffset;
    __local int DSHOffset;
    __local int SLBOffset;
    __local int StackOffset;
    __local int QStorageOffset;
    __local int MarkerOffset;
    __local int BTSoffset;
    __local IGIL_WalkerEnumeration WalkerEnum;
    __local uint ObjectIDS[ MAX_GLOBAL_ARGS ];

#endif

    //Phase 1: complete events recorded by the previous scheduler pass.
    if( pQueue->m_controls.m_LastScheduleEventNumber > 0 )
    {
        //Check if there are any events that needs updating, each wkg uses all hw threads in wkg to update events
        if( GroupID * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 < pQueue->m_controls.m_LastScheduleEventNumber )
        {
            clk_event_t EventID;
            //One lane per HW thread (lane 0 of each sub-group) walks the event list.
            if( get_local_id( 0 ) % PARALLEL_SCHEDULER_COMPILATION_SIZE_20 == 0 )
            {
                uint ID = ( GroupID * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 ) + ( get_local_id( 0 ) / PARALLEL_SCHEDULER_COMPILATION_SIZE_20 );
                while( ID < pQueue->m_controls.m_LastScheduleEventNumber )
                {
                    EventID = __builtin_astype( ( void* )( ( ulong )pQueue->m_controls.m_EventDependencies[ ID ] ), clk_event_t );
                    UpdateEventsTreeStatusParallel( EventID, eventsPool, ( pQueue->m_controls.m_IsProfilingEnabled != 0 ) );
                    ID += get_num_groups( 0 ) * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20;
                }
            }
        }
        //All groups must see the updated event states before parsing the queue.
        GlobalBarrier( queueStorageBuffer );
    }
    //Queue parsing section
    //Phase 2: walk packets enqueued since the previous pass, groups striding round-robin.
    uint NumberOfEnqueues = pQueue->m_controls.m_TotalNumberOfQueues - pQueue->m_controls.m_PreviousNumberOfQueues;
    if( NumberOfEnqueues > 0 )
    {
        uint InitialOffset = pQueue->m_controls.m_PreviousHead;
        bool PacketScheduled = true;
        uint offset = 0;

        for( uint CurrentPacket = GroupID; CurrentPacket < NumberOfEnqueues; CurrentPacket += get_num_groups( 0 ) )
        {
            //First iteration seeks from the head; later ones advance from the previous offset.
            if( CurrentPacket == GroupID )
            {
                offset = TEMP_IGIL_GetNthCommandHeaderOffset( pQueue, InitialOffset, CurrentPacket );
            }
            else
            {
                offset = TEMP_IGIL_GetNthCommandHeaderOffset( pQueue, offset, get_num_groups( 0 ) );
            }
            pCommand = TEMP_IGIL_GetCommandHeader( pQueue, offset );

            //Initialize command packet with proper lws
            if( get_local_id( 0 ) == 0 )
            {
                InitializeCommandPacket( pCommand );
            }

            //Can I run this command ?
            if( CheckEventStatus( pCommand, eventsPool ) == 0 )
            {
                //Is it marker command ?
                if( pCommand->m_kernelId != IGIL_KERNEL_ID_ENQUEUE_MARKER )
                {
                    //Is there enough IDT space for me ?
                    if( get_local_id( 0 ) == 0 )
                    {
                        int WalkerNeeded = GetWalkerCount( pCommand );
                        //Optimization - check if IDT has free space for me
                        if( pQueue->m_controls.m_CurrentIDToffset + WalkerNeeded <= WA_INT_DESC_MAX )
                        {
                            //Reserve IDT/DSH/SLB/SSH space atomically; re-check after the
                            //reservation since another group may have raced past the pre-check.
                            uint Temp = atomic_add( &pQueue->m_controls.m_CurrentIDToffset, WalkerNeeded );
                            if( Temp + WalkerNeeded <= WA_INT_DESC_MAX )
                            {
                                IDTOffset = Temp;
                                DSHOffset = atomic_add( &pQueue->m_controls.m_CurrentDSHoffset, ( MAX_DSH_SIZE_PER_ENQUEUE * WalkerNeeded ) );
                                SLBOffset = ( ( atomic_add( &pQueue->m_controls.m_SecondLevelBatchOffset, ( SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE * WalkerNeeded ) ) ) % MAX_SLB_OFFSET );
                                BTSoffset = atomic_add( &pQueue->m_controls.m_CurrentSSHoffset, pQueue->m_controls.m_BTmaxSize );
                            }
                            else
                            {
                                //Lost the race — signal failure to the whole work-group.
                                IDTOffset = -1;
                            }
                        }
                        else
                        {
                            IDTOffset = -1;
                        }
                    }
                    //Now barrier and check if we can go with scheduling
                    barrier( CLK_LOCAL_MEM_FENCE );

                    if( IDTOffset != -1 )
                    {
                        //This packet is all set, schedule it and we are done with it.
                        //Patch DSH has media curbe load and patch gpgpu walker inside
                        PatchDSHParallelWithDynamicDSH20( SLBOffset,
                                                          DSHOffset,
                                                          IDTOffset,
                                                          kernelData,
                                                          dsh,
                                                          pCommand->m_kernelId,
                                                          pCommand,
                                                          secondaryBatchBuffer,
                                                          pQueue,
                                                          eventsPool,
                                                          ssh,
                                                          BTSoffset,
                                                          &WalkerEnum,
                                                          ObjectIDS
#ifdef ENABLE_DEBUG_BUFFER
                                                          //NOTE(review): 'DebugQueue' does not match the
                                                          //kernel parameter 'debugQueue' — confirm this
                                                          //compiles when ENABLE_DEBUG_BUFFER is defined.
                                                          , DebugQueue
#endif
                                                          );
                        PacketScheduled = true;
                    }
                    else
                    {
                        PacketScheduled = false;
                    }
                }
                else //For marker we need to update returned event status
                {
                    //Check if there is space to track event
                    if( get_local_id( 0 ) == 0 )
                    {
                        uint Temp = atomic_inc( &pQueue->m_controls.m_EnqueueMarkerScheduled );
                        if( Temp < MAX_NUMBER_OF_ENQUEUE_MARKER )
                        {
                            MarkerOffset = Temp;
                        }
                        else
                        {
                            MarkerOffset = -1;
                        }
                    }
                    barrier( CLK_LOCAL_MEM_FENCE );
                    if( MarkerOffset != -1 )
                    {
                        PacketScheduled = true;
                    }
                    else
                    {
                        PacketScheduled = false;
                    }
                }
                //Update event dependencies if any, if there are event waiting for status change, put them on the list.
                if( PacketScheduled == true )
                {
                    if( get_local_id( 0 ) == 0 )
                    {
                        if( ( uint )pCommand->m_event != IGIL_EVENT_INVALID_HANDLE )
                        {
                            pQueue->m_controls.m_EventDependencies[ atomic_inc( &pQueue->m_controls.m_CurrentScheduleEventNumber ) ] = pCommand->m_event;
                        }
                        //Remove event dependencies setting.
                        if( pCommand->m_numDependencies > 0 )
                        {
                            DecreaseEventDependenciesParallel( pCommand, eventsPool );
                        }
                    }
                }
            }
            //Can't schedule it right now, move to storage.
            else
            {
                //Simulation runs in lock-step; keep barrier counts identical on all paths.
                if( pQueue->m_controls.m_IsSimulation )
                {
                    barrier( CLK_LOCAL_MEM_FENCE );
                }
                PacketScheduled = false;
            }

            //Allocation failure, move command to stack storage and update stack pointers
            if( PacketScheduled == false )
            {
                if( get_local_id( 0 ) == 0 )
                {
                    //Stack grows downward; Qstorage is carved top-down as well.
                    StackOffset = atomic_dec( &pQueue->m_controls.m_StackTop ) - 1;
                    QStorageOffset = atomic_sub( &pQueue->m_controls.m_QstorageTop, pCommand->m_commandSize ) - pCommand->m_commandSize;
                    commandsStack[ StackOffset ] = QStorageOffset;
                }
                barrier( CLK_LOCAL_MEM_FENCE );
                //Whole work-group cooperates in copying the packet into storage.
                __global char* ptrQueue = ( __global char* )pQueue;
                GLOBAL_MEMCPY( ( __global void* )&queueStorageBuffer[ QStorageOffset / 4 ], (__global void * )&ptrQueue[ offset ] , pCommand->m_commandSize );
            }
            else if( pQueue->m_controls.m_IsSimulation )
            {
                barrier( CLK_LOCAL_MEM_FENCE );
            }
        }

        //In case there were new enqueues on queue_t, all work items must hit the global barrier before they can start taking items from the stack.
        GlobalBarrierUpdateQueue( queueStorageBuffer, pQueue );
    }

    //Check stack only when there are free IDTS
    //Phase 3: retry previously-stalled packets parked on the stack.
    if( ( pQueue->m_controls.m_IDTAfterFirstPhase < WA_INT_DESC_MAX ) &
        ( pQueue->m_controls.m_PreviousStackTop != pQueue->m_controls.m_StackSize ) )
    {
        //Start stack browsing
        uint MyID = get_group_id( 0 );
        //Start browsing from the begining of the previous stack top
        uint CurrentOffset = pQueue->m_controls.m_PreviousStackTop + MyID;
        uint CommandOffset = 0;

        while( CurrentOffset < pQueue->m_controls.m_StackSize )
        {
            CommandOffset = commandsStack[ CurrentOffset ];

            //0 marks an already-reclaimed slot.
            if( CommandOffset != 0 )
            {
                pCommand = GetCommandHeaderFromStorage( ( __global uint* )queueStorageBuffer, CommandOffset );

                //Can I run this command ?
                if( CheckEventStatus( pCommand, eventsPool ) == 0 )
                {
                    //Is it marker command ?
                    if( pCommand->m_kernelId != IGIL_KERNEL_ID_ENQUEUE_MARKER )
                    {
                        //Is there enough IDT space for me ?
                        if( get_local_id( 0 ) == 0 )
                        {
                            int WalkerNeeded = GetWalkerCount( pCommand );
                            //Optimization - check if IDT has free space for me
                            if( pQueue->m_controls.m_CurrentIDToffset + WalkerNeeded <= WA_INT_DESC_MAX )
                            {
                                //Same reserve-then-recheck pattern as the queue-parsing phase.
                                uint Temp = atomic_add( &pQueue->m_controls.m_CurrentIDToffset, WalkerNeeded );
                                if( Temp + WalkerNeeded <= WA_INT_DESC_MAX )
                                {
                                    IDTOffset = Temp;
                                    DSHOffset = atomic_add( &pQueue->m_controls.m_CurrentDSHoffset, ( MAX_DSH_SIZE_PER_ENQUEUE * WalkerNeeded ) );
                                    SLBOffset = ( ( atomic_add( &pQueue->m_controls.m_SecondLevelBatchOffset, ( SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE * WalkerNeeded ) ) ) % MAX_SLB_OFFSET );
                                    BTSoffset = atomic_add( &pQueue->m_controls.m_CurrentSSHoffset, pQueue->m_controls.m_BTmaxSize );
                                }
                                else
                                {
                                    IDTOffset = -1;
                                }
                            }
                            else
                            {
                                IDTOffset = -1;
                            }
                        }
                        //Now barrier and check if we can go with scheduling
                        barrier( CLK_LOCAL_MEM_FENCE );
                        if( IDTOffset != -1 )
                        {
                            //This packet is all set, schedule it and we are done with it.
                            //Patch DSH has media curbe load and patch gpgpu walker inside
                            PatchDSHParallelWithDynamicDSH20( SLBOffset,
                                                              DSHOffset,
                                                              IDTOffset,
                                                              kernelData,
                                                              dsh,
                                                              pCommand->m_kernelId,
                                                              pCommand,
                                                              secondaryBatchBuffer,
                                                              pQueue,
                                                              eventsPool,
                                                              ssh,
                                                              BTSoffset,
                                                              &WalkerEnum,
                                                              ObjectIDS
#ifdef ENABLE_DEBUG_BUFFER
                                                              //NOTE(review): 'DebugQueue' vs parameter
                                                              //'debugQueue' — confirm when enabled.
                                                              , DebugQueue
#endif
                                                              );
                            //Storage for this packet can be reclaimed by the compaction phase.
                            pCommand->m_commandState = CAN_BE_RECLAIMED;
                            //Reset stack offset
                            commandsStack[ CurrentOffset ] = 0;

                            //Update event status
                            if( get_local_id( 0 ) == 0 )
                            {
                                //Add events dependant on this command to list of events neeeded to be updated.
                                if( ( uint )pCommand->m_event != IGIL_EVENT_INVALID_HANDLE )
                                {
                                    pQueue->m_controls.m_EventDependencies[ atomic_inc( &pQueue->m_controls.m_CurrentScheduleEventNumber ) ] = pCommand->m_event;
                                }

                                //Remove event dependencies setting.
                                if( pCommand->m_numDependencies > 0 )
                                {
                                    DecreaseEventDependenciesParallel( pCommand, eventsPool );
                                }
                            }
                        }
                    }
                    else // For marker we need to update returned event status
                    {
                        barrier( CLK_GLOBAL_MEM_FENCE );
                        //Check if there is space to track event
                        if( get_local_id( 0 ) == 0 )
                        {
                            uint Temp = atomic_inc( &pQueue->m_controls.m_EnqueueMarkerScheduled );
                            if( Temp < MAX_NUMBER_OF_ENQUEUE_MARKER )
                            {
                                pCommand->m_commandState = CAN_BE_RECLAIMED;
                                commandsStack[ CurrentOffset ] = 0;
                                //Add events dependant on this command to list of events neeeded to be updated.
                                if( ( uint )pCommand->m_event != IGIL_EVENT_INVALID_HANDLE )
                                {
                                    pQueue->m_controls.m_EventDependencies[ atomic_inc( &pQueue->m_controls.m_CurrentScheduleEventNumber ) ] = pCommand->m_event;
                                }

                                //Remove event dependencies setting.
                                if( pCommand->m_numDependencies > 0 )
                                {
                                    DecreaseEventDependenciesParallel( pCommand, eventsPool );
                                }
                            }
                        }
                    }
                }
            }
            //Groups stride over the stack round-robin.
            CurrentOffset += get_num_groups( 0 );
            if( pQueue->m_controls.m_IsSimulation )
            {
                barrier( CLK_LOCAL_MEM_FENCE );
            }
        }
    }

    //Finish execution and check end conditons
    //Execute this global barrier only when needed, i.e.
    //stack browsing was executed or new item were added on the stack
    if( ( pQueue->m_controls.m_PreviousStackTop != pQueue->m_controls.m_StackSize ) |
        ( pQueue->m_controls.m_StackTop != pQueue->m_controls.m_PreviousStackTop ) )
    {
        GlobalBarrier( queueStorageBuffer );
    }

    //Cleanup & resource reclamation section
    //We are after global sync section, we can do anything to globals right now.
    if( ( get_local_id( 0 ) == 0 ) & ( get_group_id( 0 ) == 0 ) )
    {
        {
            //Reset per-pass allocators and roll the event list over for the next pass.
            pQueue->m_controls.m_CurrentDSHoffset = pQueue->m_controls.m_DynamicHeapStart;
            pQueue->m_controls.m_IDTAfterFirstPhase = 1;
            pQueue->m_controls.m_PreviousNumberOfQueues = pQueue->m_controls.m_TotalNumberOfQueues;
            pQueue->m_controls.m_LastScheduleEventNumber = pQueue->m_controls.m_CurrentScheduleEventNumber;
            pQueue->m_controls.m_CurrentScheduleEventNumber = 0;
        }
    }

    //Schedule scheduler
    //Leader is group 1 when multiple groups exist, group 0 otherwise.
    if( ( (get_group_id( 0 ) == 1) == (get_num_groups( 0 ) > 1) ) & ( get_local_id(0) == 0 ) )
    {
        pQueue->m_controls.m_SecondLevelBatchOffset = ( pQueue->m_controls.m_SecondLevelBatchOffset % MAX_SLB_OFFSET );

        //If we scheduled any blocks, put scheduler right after
        if( ( pQueue->m_controls.m_CurrentIDToffset > 1 ) | ( pQueue->m_controls.m_EnqueueMarkerScheduled > 0 ) )
        {
            AddCmdsInSLBforScheduler20Parallel( pQueue->m_controls.m_SecondLevelBatchOffset,
                                                pQueue,
                                                secondaryBatchBuffer,
                                                dsh );

            //If we have profiling enabled, we need CL_COMPLETE time, which is before next scheduler starts
            if( pQueue->m_controls.m_IsProfilingEnabled != 0 )
            {
                ulong Address = ( ulong ) &( eventsPool->m_CLcompleteTimestamp );
                //Emit pipecontrol with timestamp write
                PatchPipeControlProfilingAddres( secondaryBatchBuffer,
                                                 pQueue->m_controls.m_SecondLevelBatchOffset,
                                                 Address,
                                                 PIPE_CONTROL_FOR_TIMESTAMP_START_OFFSET );
                //Bit after scheduler may be set by some other command, reset it to 0
                DisablePostSyncBitInPipeControl( secondaryBatchBuffer,
                                                 pQueue->m_controls.m_SecondLevelBatchOffset,
                                                 PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET_TO_PATCH );
#ifdef ENABLE_DEBUG_BUFFER
                //NOTE(review): 'DebugQueue' / 'EventsPool' do not match the kernel parameters
                //'debugQueue' / 'eventsPool' — confirm this compiles when ENABLE_DEBUG_BUFFER is set.
                if( ( DebugQueue != 0 ) & ( DebugQueue->m_flags == DDB_SCHEDULER_PROFILING ) )
                {
                    //Store Current scheduler START time
                    *((__global ulong * ) (&DebugQueue->m_data[ atomic_add( &DebugQueue->m_offset, 2 ) ] )) = EventsPool->m_CLcompleteTimestamp;

                    //Set address to store next scheduler's END time
                    PatchPipeControlProfilingAddres( secondaryBatchBuffer,
                                                     pQueue->m_controls.m_SecondLevelBatchOffset,
                                                     ( ulong )(&DebugQueue->m_data[ atomic_add( &DebugQueue->m_offset, 2 ) ]),
                                                     PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
                }
#endif
            }

            //Program pipe controls around scheduler to make sure it is not executed concurently to blocks
            else
            {
                //Locate previous pipe control
                int PreviousOffset = pQueue->m_controls.m_SecondLevelBatchOffset - SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE;
                //If offset is negative it means we are first command after chaining
                if( PreviousOffset < 0 )
                {
                    PreviousOffset += MAX_SLB_OFFSET;
                }
                //Tighten previous pipecontrol
                uint DwordOffset = PreviousOffset / DWORD_SIZE_IN_BYTES;

                PutCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );

                //Now put pipe control after scheduler
                DwordOffset = pQueue->m_controls.m_SecondLevelBatchOffset / DWORD_SIZE_IN_BYTES;

                PutCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
            }

            //Advance the SLB cursor past the just-emitted scheduler and reset per-pass counters.
            pQueue->m_controls.m_SecondLevelBatchOffset += SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE;
            pQueue->m_controls.m_CurrentIDToffset = 1;
            pQueue->m_controls.m_EnqueueMarkerScheduled = 0;
            pQueue->m_controls.Temporary[1]++;
            pQueue->m_controls.m_CurrentSSHoffset = pQueue->m_controls.m_BTbaseOffset;
        }
        //Nothing to schedule, return to the host
        else
        {
#ifdef ENABLE_DEBUG_BUFFER
            //Set START time of current (last) scheduler
            //NOTE(review): 'DebugQueue' / 'EventsPool' vs parameters 'debugQueue' / 'eventsPool' —
            //confirm this compiles when ENABLE_DEBUG_BUFFER is defined.
            if( ( pQueue->m_controls.m_IsProfilingEnabled != 0 ) & ( DebugQueue != 0 ) & ( DebugQueue->m_flags == DDB_SCHEDULER_PROFILING ) )
            {
                *((__global ulong * ) (&DebugQueue->m_data[ atomic_add( &DebugQueue->m_offset, 2 ) ] )) = EventsPool->m_CLcompleteTimestamp;
            }
#endif
            pQueue->m_controls.Temporary[2]++;
            pQueue->m_controls.m_SLBENDoffsetInBytes = ( int ) pQueue->m_controls.m_SecondLevelBatchOffset;
            //SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
            __private uint DwordOffset = pQueue->m_controls.m_SecondLevelBatchOffset / DWORD_SIZE_IN_BYTES;
            //BB_START 1st DWORD — chain back to the host-side cleanup section.
            secondaryBatchBuffer[ DwordOffset ] = OCLRT_BATCH_BUFFER_BEGIN_CMD_DWORD0;
            DwordOffset++;
            //BB_START 2nd DWORD - Address, 3rd DWORD Address high
            secondaryBatchBuffer[ DwordOffset++ ] = (uint)( pQueue->m_controls.m_CleanupSectionAddress & 0xFFFFFFFF );
            secondaryBatchBuffer[ DwordOffset++ ] = (uint)( ( pQueue->m_controls.m_CleanupSectionAddress >> 32 ) & 0xFFFFFFFF );
        }
    }

    //Parallel stack compaction
    //Group 2 (or group 0 when <3 groups) raises StackTop past leading reclaimed (zero) slots.
    //NOTE(review): only local id 0 runs this, so the get_local_id(0)/get_local_size(0) strides
    //reduce to a simple linear scan — presumably a remnant of a multi-lane version.
    if( ( ( get_group_id( 0 ) == 2 ) == ( get_num_groups( 0 ) > 2 ) ) & ( get_local_id( 0 ) == 0 ) )
    {
        uint Current = pQueue->m_controls.m_StackTop + get_local_id( 0 );
        uint StackSize = pQueue->m_controls.m_StackSize;
        uint Found = 0;

        while( ( Current < StackSize ) && ( Found == 0 ) )
        {
            __global uint * pCmdStackBlock = (__global uint *)( commandsStack + Current );
            //We have found an element
            if( *pCmdStackBlock != 0 )
            {
                Found = 1;
            }
            else
            {
                Current += get_local_size( 0 );
            }
        }

        if ( Found == 1 )
        {
            atomic_min( &pQueue->m_controls.m_StackTop, Current );
            atomic_min( &pQueue->m_controls.m_PreviousStackTop, Current );
        }
    }

    //Qstorage compaction
    //Group 3 (or group 0 when <4 groups) reclaims contiguous finished packets from the top.
    if( ( ( get_group_id( 0 ) == 3 ) == ( get_num_groups( 0 ) > 3 ) ) & ( get_local_id( 0 ) == 0 ) )
    {
        uint ReclaimFurhter = 1;
        while( ( pQueue->m_controls.m_QstorageTop < pQueue->m_controls.m_QstorageSize ) & ( ReclaimFurhter == 1 ) )
        {
            pCommand = GetCommandHeaderFromStorage( ( __global uint* ) queueStorageBuffer, pQueue->m_controls.m_QstorageTop );
            if( pCommand->m_commandState == CAN_BE_RECLAIMED )
            {
                pQueue->m_controls.m_QstorageTop += pCommand->m_commandSize;
            }
            else
            {
                //Stop at the first still-live packet; storage is reclaimed strictly in order.
                ReclaimFurhter = 0;
            }
        }

        pQueue->m_controls.m_PreviousStorageTop = pQueue->m_controls.m_QstorageTop;

#ifndef DISABLE_RESOURCE_RECLAMATION
        //Reclaim space on queue_t, do this only if there is enough space
        //1 KB is used for global barrier, make sure this space will never be used.
        //NOTE(review): if m_QstorageTop < 1024 this unsigned subtraction wraps around and the
        //comparison succeeds spuriously — presumably m_QstorageTop is always >= 1024; confirm.
        if( pQueue->m_controls.m_QstorageTop - 1024 > pQueue->m_size )
        {
            //In this case we can take full queue_t next time we enter scheduler, so reclaim full space on queue_t
            pQueue->m_head = IGIL_DEVICE_QUEUE_HEAD_INIT;
        }
#endif
        pQueue->m_controls.m_PreviousHead = pQueue->m_head;
    }
}