1 /*
2 * Copyright (c) 2007-2017, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 //!
23 //! \file cm_kernel_rt.cpp
24 //! \brief Contains CmKernelRT definitions.
25 //!
26
27 #include "cm_kernel_rt.h"
28
29 #include "cm_program.h"
30 #include "cm_device_rt.h"
31 #include "cm_surface_manager.h"
32 #include "cm_surface_2d_up_rt.h"
33 #include "cm_surface_3d_rt.h"
34 #include "cm_buffer_rt.h"
35 #include "cm_mov_inst.h"
36 #include "cm_kernel_data.h"
37 #include "cm_thread_space_rt.h"
38 #include "cm_state_buffer.h"
39 #include "cm_surface_vme.h"
40 #include "cm_debug.h"
41 #include "cm_surface_sampler8x8.h"
42 #include "cm_surface_sampler.h"
43 #include "cm_group_space.h"
44 #include "cm_surface_2d_rt.h"
45 #include "cm_sampler8x8_state_rt.h"
46 #include "cm_visa.h"
47 #include "cm_extension_creator.h"
48 #include "cm_execution_adv.h"
49
50 #define GENERATE_GLOBAL_SURFACE_INDEX
51
52 #define READ_FIELD_FROM_BUF( dst, type ) \
53 dst = *((type *) &buf[bytePosition]); \
54 bytePosition += sizeof(type);
55
56 #define PER_ARG_SIZE_IN_DWORD 3
57 #define KERNEL_INFO_SIZE_IN_DWORD 4
58
59 #define DW_ALIGNMENT( byte_address ) \
60 if( byte_address % 4 ) \
61 byte_address = ( byte_address / 4 + 1 ) * 4;
62
63 #define GRF_ALIGNMENT( byte_address ) \
64 if( byte_address % 32 ) \
65 byte_address = ( byte_address / 32 + 1 ) * 32;
66
67 // To check if surface type nType is equal to the surface type list in argument ...
68 #define CHECK_SURFACE_TYPE( nType, ... ) ( _CheckSurfaceType( nType, __VA_ARGS__, -1 ) )
69
70 #define IsKernelArg(arg) ((arg).unitCount == 1)
71
72 // Warning : x must be uint32_t
73 #define SET_MEMORY_OBJECT_CONTROL(x, memCtl) \
74 x = ((uint16_t)(memCtl.mem_ctrl<< 8 | memCtl.mem_type << 4 | memCtl.age)) << 16 | (x);
75
// Append 'value' to the local VME index array; relies on vmeIndexArray and
// vmeIndexArrayPosition being in scope at the expansion site.
#define ADD_INTO_VME_INDEX_ARRAY(value) \
    vmeIndexArray[vmeIndexArrayPosition] = value ; \
    vmeIndexArrayPosition ++;

// Append 'value' to the local VME CM index array; relies on vmeCmIndexArray
// and vmeCmIndexArrayPosition being in scope at the expansion site.
// NOTE: a stray leading ';' has been removed from the expansion -- it injected
// an empty statement that would prematurely terminate an unbraced if/else at
// the call site.
#define ADD_INTO_VME_CM_INDEX_ARRAY(value) \
    vmeCmIndexArray[vmeCmIndexArrayPosition] = value ; \
    vmeCmIndexArrayPosition ++;
83
84 typedef CM_ARG* PCM_ARG;
85
86 #define CM_KERNEL_DATA_CLEAN 0 // kernel data clean
87 #define CM_KERNEL_DATA_KERNEL_ARG_DIRTY 1 // per kernel arg dirty
88 #define CM_KERNEL_DATA_THREAD_ARG_DIRTY (1 << 1) // per thread arg dirty
89 #define CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY (1 << 2) // indirect payload data dirty
90 #define CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY (1 << 3) // indirect payload data size changes
91 #define CM_KERNEL_DATA_GLOBAL_SURFACE_DIRTY (1 << 4) // global surface dirty
92 #define CM_KERNEL_DATA_THREAD_COUNT_DIRTY (1 << 5) // thread count dirty, reset() be called
93 #define cMKERNELDATASAMPLERBTIDIRTY (1 << 6) // sampler bti dirty
94 #define CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY (1 << 7) // threadgroupspace dirty
95
Partition(PCM_ARG * args,int32_t p,int32_t r)96 int32_t Partition( PCM_ARG* args, int32_t p, int32_t r )
97 {
98 uint16_t x = args[p]->unitOffsetInPayload;
99 int32_t i = p - 1;
100 int32_t j = r + 1;
101 while( 1 )
102 {
103 do {
104 j --;
105 } while( args[j]->unitOffsetInPayload > x );
106
107 do {
108 i ++;
109 } while( args[i]->unitOffsetInPayload < x );
110
111 if( i < j )
112 {
113 PCM_ARG tmpP = args[i];
114 args[i] = args[j];
115 args[j] = tmpP;
116 }
117 else
118 {
119 return j;
120 }
121 }
122 }
123
// Cannot be called directly! Use macro CHECK_SURFACE_TYPE, which appends the
// required -1 terminator to the variadic candidate list.
// Returns true when nType equals one of the non-negative candidates.
bool _CheckSurfaceType( int nType, ... )
{
    bool found = false;
    va_list argList;
    va_start( argList, nType );

    // Walk the candidate list until the -1 sentinel (any negative value stops).
    for ( int candidate = va_arg( argList, int );
          candidate >= 0;
          candidate = va_arg( argList, int ) )
    {
        if ( candidate == nType )
        {
            found = true;
            break;
        }
    }

    va_end( argList );
    return found;
}
144
QuickSort(PCM_ARG * args,int32_t p,int32_t r)145 void QuickSort( PCM_ARG* args, int32_t p, int32_t r )
146 {
147 if( p < r )
148 {
149 int32_t q = Partition( args, p, r );
150 QuickSort( args, p, q );
151 QuickSort( args, q + 1, r );
152 }
153 }
154
155 namespace CMRT_UMD
156 {
// File-scope registration executed at static-init time: registers
// CmMovInstConstructor with the extension factory so that
// CmExtensionCreator<CmMovInstConstructor>::CreateClass() (called from
// CmKernelRT::Initialize) can instantiate it.
static bool bCmMovInstRegistered = CmExtensionCreator<CmMovInstConstructor>::RegisterClass<CmMovInstConstructor>();
158 //*-----------------------------------------------------------------------------
159 //| Purpose: Create object for mov instructions
160 //| instructions will be copied into DstMem
161 //*-----------------------------------------------------------------------------
ConstructObjMovs(uint32_t dstOffset,uint32_t srcOffset,uint32_t size,CmDynamicArray & movInsts,uint32_t index,bool isBdw,bool isHwDebug)162 uint32_t CmMovInstConstructor::ConstructObjMovs(uint32_t dstOffset, uint32_t srcOffset, uint32_t size, CmDynamicArray &movInsts, uint32_t index, bool isBdw, bool isHwDebug)
163 {
164 return MovInst_RT::CreateMoves(dstOffset, srcOffset, size, movInsts, index, isBdw, isHwDebug);
165 }
166
167 //*-----------------------------------------------------------------------------
168 //| Purpose: Create CM Kernel
169 //| Arguments :
170 //| device [in] Pointer to device
171 //| program [in] Pointer to cm Program
172 //| kernelName [in] Name of kernel
173 //| kernelId [in] Kernel's ID
174 //| kernel [in/out] Reference Pointer to CM Kernel
175 //| options [in] jitter, or non-jitter
176 //| Returns: Result of the operation.
177 //*-----------------------------------------------------------------------------
Create(CmDeviceRT * device,CmProgramRT * program,const char * kernelName,uint32_t kernelIndex,uint32_t kernelSeqNum,CmKernelRT * & kernel,const char * options)178 int32_t CmKernelRT::Create(CmDeviceRT *device,
179 CmProgramRT *program,
180 const char *kernelName,
181 uint32_t kernelIndex,
182 uint32_t kernelSeqNum,
183 CmKernelRT* &kernel,
184 const char *options)
185 {
186 int32_t result = CM_SUCCESS;
187 CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)device->GetAccelData())->cmHalState;
188
189 if (state && state->advExecutor)
190 {
191 kernel = state->advExecutor->CreateKernelRT(device, program, kernelIndex, kernelSeqNum);
192 }
193 else
194 {
195 kernel = new (std::nothrow) CmKernelRT(device, program, kernelIndex, kernelSeqNum);
196 }
197
198 if( kernel )
199 {
200 kernel->Acquire();
201 result = kernel->Initialize( kernelName, options );
202 if( result != CM_SUCCESS )
203 {
204 CmKernelRT::Destroy( kernel, program);
205 return result;
206 }
207 }
208 else
209 {
210 CM_ASSERTMESSAGE("Error: Failed to create CmKernel due to out of system memory.");
211 return CM_OUT_OF_HOST_MEMORY;
212 }
213 if (options)
214 {
215 if (strcmp(options, "PredefinedGPUCopyKernel") == 0)
216 {
217 kernel->m_blCreatingGPUCopyKernel = true;
218 }
219 else
220 {
221 kernel->m_blCreatingGPUCopyKernel = false;
222 }
223 }
224
225 #if USE_EXTENSION_CODE
226 result = kernel->InitForGTPin(device, program, kernel);
227 #endif
228
229 return result;
230 }
231
232 //*-----------------------------------------------------------------------------
233 //| Purpose: Destory Kernel
234 //| Returns: Result of the operation.
235 //*-----------------------------------------------------------------------------
Destroy(CmKernelRT * & kernel,CmProgramRT * & program)236 int32_t CmKernelRT::Destroy( CmKernelRT* &kernel, CmProgramRT *&program )
237 {
238 uint32_t refCount = kernel->SafeRelease();
239 if (refCount == 0)
240 {
241 kernel = nullptr;
242 }
243
244 refCount = program->SafeRelease();
245 if (refCount == 0)
246 {
247 program = nullptr;
248 }
249 return CM_SUCCESS;
250 }
251
252 //*-----------------------------------------------------------------------------
253 //| Purpose: Acuqire Kernel: increment refcount
254 //| Returns: Result of the operation.
255 //*-----------------------------------------------------------------------------
Acquire(void)256 int32_t CmKernelRT::Acquire( void)
257 {
258 m_refcount ++;
259 return m_refcount;
260 }
261
262 //*-----------------------------------------------------------------------------
263 //| Purpose: SafeRelease Kernel: Delete the instance
264 //| Returns: Result of the operation.
265 //*-----------------------------------------------------------------------------
SafeRelease(void)266 int32_t CmKernelRT::SafeRelease( void)
267 {
268 --m_refcount;
269 if (m_refcount == 0)
270 {
271 PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
272 PCM_HAL_STATE state = cmData->cmHalState;
273 if (state->dshEnabled)
274 {
275 state->pfnDSHUnregisterKernel(state, m_id);
276 }
277 delete this;
278 return 0;
279 }
280 return m_refcount;
281 }
282
283 //*-----------------------------------------------------------------------------
284 //| Purpose: Kernel constructor
285 //| Returns: Result of the operation.
286 //*-----------------------------------------------------------------------------
CmKernelRT(CmDeviceRT * device,CmProgramRT * program,uint32_t kernelIndex,uint32_t kernelSeqNum)287 CmKernelRT::CmKernelRT(CmDeviceRT *device,
288 CmProgramRT *program,
289 uint32_t kernelIndex,
290 uint32_t kernelSeqNum):
291 m_device( device ),
292 m_surfaceMgr( nullptr ),
293 m_program( program ),
294 m_options( nullptr ),
295 m_binary( nullptr ),
296 m_binaryOrig(nullptr),
297 m_binarySize(0),
298 m_binarySizeOrig(0),
299 m_threadCount( 0 ),
300 m_lastThreadCount( 0 ),
301 m_sizeInCurbe( 0 ),
302 m_sizeInPayload( 0 ),
303 m_argCount( 0 ),
304 m_args( nullptr ),
305 m_kernelInfo(nullptr),
306 m_kernelIndexInProgram( CM_INVALID_KERNEL_INDEX ),
307 m_curbeEnabled( true ),
308 m_nonstallingScoreboardEnabled(false),
309 m_dirty( CM_KERNEL_DATA_CLEAN ),
310 m_lastKernelData( nullptr ),
311 m_lastKernelDataSize( 0 ),
312 m_indexInTask(0),
313 m_threadSpaceAssociated(false),
314 m_perThreadArgExists(false),
315 m_perKernelArgExists( false ),
316 m_threadSpace( nullptr ),
317 m_adjustScoreboardY( 0 ),
318 m_lastAdjustScoreboardY( 0 ),
319 m_blCreatingGPUCopyKernel( false),
320 m_usKernelPayloadDataSize( 0 ),
321 m_kernelPayloadData( nullptr ),
322 m_usKernelPayloadSurfaceCount( 0 ),
323 m_samplerBtiCount( 0 ),
324 m_refcount(0),
325 m_halMaxValues( nullptr ),
326 m_halMaxValuesEx( nullptr ),
327 m_surfaceArray(nullptr),
328 m_threadGroupSpace( nullptr ),
329 m_vmeSurfaceCount( 0 ),
330 m_maxSurfaceIndexAllocated(0),
331 m_barrierMode(CM_LOCAL_BARRIER),
332 m_isClonedKernel(false),
333 m_cloneKernelID(0),
334 m_hasClones( false ),
335 m_stateBufferBounded( CM_STATE_BUFFER_NONE ),
336 m_movInstConstructor(nullptr)
337 {
338 program->Acquire();
339 m_program = program;
340
341 device->GetSurfaceManager(m_surfaceMgr);
342
343 m_id = kernelSeqNum; // Unique number for each kernel. This ID is used in Batch buffer.
344 m_id <<= 32;
345 m_kernelIndex = kernelIndex;
346
347 for (int i = 0; i < CM_GLOBAL_SURFACE_NUMBER; i++)
348 {
349 m_globalSurfaces[i] = nullptr;
350 m_globalCmIndex[i] = 0;
351 }
352
353 m_blhwDebugEnable = program->IsHwDebugEnabled();
354
355 CmSafeMemSet(m_pKernelPayloadSurfaceArray, 0, sizeof(m_pKernelPayloadSurfaceArray));
356 CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, sizeof(m_IndirectSurfaceInfoArray));
357 CmSafeMemSet( m_samplerBtiEntry, 0, sizeof( m_samplerBtiEntry ) );
358
359 if (m_samplerBtiCount > 0)
360 {
361 CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
362 m_samplerBtiCount = 0;
363 }
364
365 ResetKernelSurfaces();
366 }
367
368 //*-----------------------------------------------------------------------------
369 //| Purpose: Destructor of Class CmKernel
370 //| Returns: None.
371 //*-----------------------------------------------------------------------------
~CmKernelRT(void)372 CmKernelRT::~CmKernelRT( void )
373 {
374 MosSafeDeleteArray(m_options);
375
376 DestroyArgs();
377
378 if(m_lastKernelData)
379 {
380 CmKernelData::Destroy( m_lastKernelData );
381 }
382
383 if( m_device->CheckGTPinEnabled() && !m_blCreatingGPUCopyKernel)
384 {
385 MosSafeDeleteArray(m_binary);
386 }
387
388 if( CM_INVALID_KERNEL_INDEX != m_kernelIndexInProgram )
389 {
390 m_program->ReleaseKernelInfo(m_kernelIndexInProgram);
391 }
392
393 for(int i=0; i< CM_GLOBAL_SURFACE_NUMBER; i++)
394 {
395 SurfaceIndex *surfIndex = m_globalSurfaces[i];
396 MosSafeDelete(surfIndex);
397 }
398
399 MosSafeDeleteArray(m_kernelPayloadData);
400 MosSafeDeleteArray(m_surfaceArray);
401 MosSafeDelete(m_movInstConstructor);
402 }
403
404 //*-----------------------------------------------------------------------------
405 //| Purpose: Initialize CM kernel
406 //| Returns: Result of the operation.
407 //*-----------------------------------------------------------------------------
Initialize(const char * kernelName,const char * options)408 int32_t CmKernelRT::Initialize( const char* kernelName, const char* options )
409 {
410 if( kernelName == nullptr )
411 {
412 CM_ASSERTMESSAGE("Error: Kernel name is null.");
413 return CM_NULL_POINTER;
414 }
415
416 size_t length = strnlen( kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE );
417 if( length >= CM_MAX_KERNEL_NAME_SIZE_IN_BYTE )
418 {
419 CM_ASSERTMESSAGE("Error: Kernel name size is too long.");
420 return CM_FAILURE;
421 }
422
423 uint32_t kernelCount = 0;
424 m_program->GetKernelCount( kernelCount );
425
426 CM_KERNEL_INFO* kernelInfo = nullptr;
427 uint32_t i = 0;
428 for( i = 0; i < kernelCount; i ++ )
429 {
430 m_program->GetKernelInfo( i, kernelInfo );
431 if( !kernelInfo )
432 {
433 CM_ASSERTMESSAGE("Error: Invalid kernel info.");
434 return CM_NULL_POINTER;
435 }
436 if( strcmp( kernelInfo->kernelName, kernelName ) == 0 )
437 {
438 break;
439 }
440 }
441
442 if( i == kernelCount )
443 {
444 CM_ASSERTMESSAGE("Error: Invalid kernel count.");
445 return CM_FAILURE;
446 }
447
448 m_device->GetHalMaxValues( m_halMaxValues, m_halMaxValuesEx);
449
450 m_program->AcquireKernelInfo(i);
451 m_kernelInfo = kernelInfo;
452 m_kernelIndexInProgram = i;
453
454 if( options )
455 {
456 size_t length = strnlen( options, CM_MAX_OPTION_SIZE_IN_BYTE );
457 if(length >= CM_MAX_OPTION_SIZE_IN_BYTE)
458 {
459 CM_ASSERTMESSAGE("Error: Option string is too long.");
460 return CM_INVALID_ARG_VALUE;
461 }
462 else
463 {
464 m_options = MOS_NewArray(char, (length+1));
465 if( !m_options )
466 {
467 CM_ASSERTMESSAGE("Error: Out of system memory.");
468 return CM_OUT_OF_HOST_MEMORY;
469
470 }
471 CmSafeMemCopy( m_options, options, length);
472 m_options[ length ] = '\0';
473
474 char* tmp = strstr( m_options, "nocurbe" );
475 if( tmp )
476 {
477 m_curbeEnabled = false;
478 }
479 }
480 }
481
482 m_nonstallingScoreboardEnabled = true;
483
484 void* commonISACode = nullptr;
485 uint32_t commonISACodeSize = 0;
486 m_program->GetCommonISACode(commonISACode, commonISACodeSize);
487 if ((commonISACode == nullptr) || (commonISACodeSize <= 0))
488 {
489 CM_ASSERTMESSAGE("Error: Invalid VISA.");
490 return CM_INVALID_COMMON_ISA;
491 }
492
493 bool useVisaApi = true;
494 vISA::ISAfile *isaFile = nullptr;
495 vISA::KernelBody *kernelBody = nullptr;
496
497 auto getVersionAsInt = [](int major, int minor) { return major * 100 + minor; };
498 if (getVersionAsInt(m_program->m_cisaMajorVersion, m_program->m_cisaMinorVersion) < getVersionAsInt(3, 2))
499 {
500 useVisaApi = false;
501 }
502 else
503 {
504 isaFile = m_program->getISAfile();
505 if (!isaFile)
506 {
507 CM_ASSERTMESSAGE("Error: Invalid VISA.");
508 return CM_INVALID_COMMON_ISA;
509 }
510 kernelBody = isaFile->getKernelsData().at(m_kernelIndexInProgram);
511 }
512
513 uint8_t *buf = (uint8_t*)commonISACode;
514 uint32_t bytePosition = m_kernelInfo->kernelIsaOffset;
515
516 uint32_t kernelInfoRefCount = 0;
517 m_program->GetKernelInfoRefCount(m_kernelIndexInProgram, kernelInfoRefCount);
518 if (kernelInfoRefCount <= 2) //Only read for 1st time Kernel creation, later we reuse them
519 {
520 if (useVisaApi)
521 {
522 m_kernelInfo->globalStringCount = kernelBody->getStringCount();
523 }
524 {
525 READ_FIELD_FROM_BUF(m_kernelInfo->globalStringCount, unsigned short);
526 }
527
528 m_kernelInfo->globalStrings = (const char**) malloc( m_kernelInfo->globalStringCount * sizeof(char*) );
529 if(m_kernelInfo->globalStrings == nullptr)
530 {
531 CM_ASSERTMESSAGE("Error: Out of system memory.");
532 return CM_OUT_OF_HOST_MEMORY;
533 }
534 CmSafeMemSet(m_kernelInfo->globalStrings, 0, m_kernelInfo->globalStringCount * sizeof(char*) );
535
536 if (useVisaApi)
537 {
538 int i = 0;
539 for (vISA::StringPool *globalString : kernelBody->getStringPool())
540 {
541 size_t stringLength = std::strlen(globalString->getString());
542 char *string = (char*)malloc(stringLength + 1);
543 if (string == nullptr)
544 {
545 CM_ASSERTMESSAGE("Error: Out of system memory.");
546 return CM_OUT_OF_HOST_MEMORY;
547 }
548 CmSafeMemCopy(string, globalString->getString(), stringLength);
549 string[stringLength] = '\0';
550 m_kernelInfo->globalStrings[i] = string;
551 i++;
552 }
553 }
554 else
555 {
556 for (int i = 0; i < (int)m_kernelInfo->globalStringCount; i++)
557 {
558 char* string = (char*)malloc(CM_MAX_KERNEL_STRING_IN_BYTE + 1);
559 if (string == nullptr)
560 {
561 CM_ASSERTMESSAGE("Error: Out of system memory.");
562 return CM_OUT_OF_HOST_MEMORY;
563 }
564 int j = 0;
565 while (buf[bytePosition] != '\0' && j < CM_MAX_KERNEL_STRING_IN_BYTE) {
566 string[j++] = buf[bytePosition++];
567 }
568 string[j] = '\0';
569 bytePosition++;
570 m_kernelInfo->globalStrings[i] = string;
571 }
572 }
573 }
574
575 uint32_t count = 0;
576 if (useVisaApi)
577 {
578 count = kernelBody->getNumInputs();
579 }
580 else
581 {
582 bytePosition = m_kernelInfo->inputCountOffset;
583
584 uint8_t countTemp = 0;
585 READ_FIELD_FROM_BUF(countTemp, uint8_t);
586 count = countTemp;
587 }
588
589 if( count > m_halMaxValues->maxArgsPerKernel )
590 {
591 CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
592 return CM_EXCEED_KERNEL_ARG_AMOUNT;
593 }
594
595 m_args = MOS_NewArray(CM_ARG, count);
596 if( (!m_args) && (count != 0) )
597 {
598 CM_ASSERTMESSAGE("Error: Out of system memory.");
599 MosSafeDeleteArray(m_options);
600 return CM_OUT_OF_HOST_MEMORY;
601 }
602 CmSafeMemSet(m_args, 0, sizeof(CM_ARG) * count);
603 m_argCount = count;
604
605 uint32_t preDefinedSurfNum;
606 if ( (m_program->m_cisaMajorVersion > 3) || ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion >=1)) ) //CISA 3.1 +
607 {
608 preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_3_1;
609 }
610 else if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion == 0))
611 {
612 preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_2_1;
613 }
614 else //CISA 2.0
615 {
616 preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_2;
617 }
618
619 uint32_t argSize = 0;
620
621 for (uint32_t i = 0; i < m_argCount; i++)
622 {
623 vISA::InputInfo *inputInfo = nullptr;
624 uint8_t kind = 0;
625
626 if (useVisaApi)
627 {
628 inputInfo = kernelBody->getInputInfo()[i];
629 kind = inputInfo->getKind();
630 }
631 else
632 {
633 READ_FIELD_FROM_BUF(kind, uint8_t);
634 }
635
636 if (kind == 0x2) // compiler value for surface
637 {
638 kind = ARG_KIND_SURFACE;
639 // runtime value for surface. surface will be further classified to 1D/2D/3D
640 }
641 else if (kind == 0x3) // compiler value for vme index
642 {
643 kind = ARG_KIND_VME_INDEX;
644 }
645 else if (kind == 0x8)
646 {
647 kind = ARG_KIND_IMPLICT_LOCALSIZE;
648 m_args[i].isSet = true;
649 m_args[i].unitCount = 1;
650 }
651 else if (kind == 0x10) {
652 kind = ARG_KIND_IMPLICT_GROUPSIZE;
653 m_args[i].isSet = true;
654 m_args[i].unitCount = 1;
655 }
656 else if (kind == 0x18) {
657 kind = ARG_KIND_IMPLICIT_LOCALID;
658 m_args[i].isSet = true;
659 m_args[i].unitCount = 1;
660 m_perKernelArgExists = true; //only VISA3.3+, can come here, so, no matter it is there any explicit arg, implicit arg exits
661 }
662 else if (kind == 0x2A) {
663 kind = ARG_KIND_SURFACE_2D_SCOREBOARD;
664 }
665 else if (kind == 0x20) {
666 kind = ARG_KIND_GENERAL_DEPVEC;
667 }
668 else if (kind == 0x30) {
669 kind = ARG_KIND_GENERAL_DEPCNT;
670 }
671 else if (kind == 0x80) {
672 // IMP_PSEUDO_INPUT = 0x80 is pseudo input. All inputs after this
673 // will be ignored by CMRT without checking and payload copied.
674 // This resizes the argument count to achieve this.
675 m_argCount = i;
676 break;
677 }
678
679 m_args[i].unitKind = kind;
680 m_args[i].unitKindOrig = kind;
681
682 if (kind == ARG_KIND_SURFACE && m_kernelInfo->surfaceCount)
683 {
684 m_args[i].surfaceKind = DATA_PORT_SURF;
685 }
686
687 if (useVisaApi)
688 {
689 m_args[i].unitOffsetInPayload = inputInfo->getOffset();
690 m_args[i].unitOffsetInPayloadOrig = inputInfo->getOffset();
691
692 m_args[i].unitSize = inputInfo->getSize();
693 m_args[i].unitSizeOrig = inputInfo->getSize();
694 }
695 else
696 {
697 uint32_t varID;
698 READ_FIELD_FROM_BUF(varID, uint16_t);
699
700 uint16_t tmpW;
701 READ_FIELD_FROM_BUF(tmpW, uint16_t);
702 m_args[i].unitOffsetInPayload = tmpW;
703 m_args[i].unitOffsetInPayloadOrig = tmpW;
704
705 READ_FIELD_FROM_BUF(tmpW, uint16_t);
706 m_args[i].unitSize = tmpW;
707 m_args[i].unitSizeOrig = tmpW;
708 }
709
710 argSize += m_args[i].unitSize;
711 }
712 //////////////////////////////////////////////////////////////////////////
713
714 if (kernelInfoRefCount <= 2) //Only read for 1st time Kernel creation, later we reuse them
715 {
716 uint16_t attributeCount = 0;
717 if (useVisaApi)
718 {
719 attributeCount = kernelBody->getAttributeCount();
720 }
721 else
722 {
723 /////////////////////////////////////////////////////////////////////////
724 // Get pre-defined kernel attributes, Start
725 //skipping size and entry
726 bytePosition += 8;
727
728 READ_FIELD_FROM_BUF(attributeCount, uint16_t);
729 }
730
731 for (int i = 0; i < attributeCount; i++)
732 {
733 vISA::AttributeInfo *attribute = nullptr;
734 uint32_t nameIndex = 0;
735 uint8_t size = 0;
736
737 if (useVisaApi)
738 {
739 attribute = kernelBody->getAttributeInfo()[i];
740 nameIndex = attribute->getName();
741 size = attribute->getSize();
742 }
743 else
744 {
745 READ_FIELD_FROM_BUF(nameIndex, uint16_t);
746 READ_FIELD_FROM_BUF(size, uint8_t);
747 }
748
749 if( strcmp( m_kernelInfo->globalStrings[nameIndex], "AsmName" ) == 0 )
750 {
751 if (useVisaApi)
752 {
753 CmSafeMemCopy(m_kernelInfo->kernelASMName, attribute->getValue(), size);
754 }
755 else
756 {
757 CmSafeMemCopy(m_kernelInfo->kernelASMName, &buf[bytePosition], size);
758 bytePosition += size;
759 }
760 }
761 else if (strcmp( m_kernelInfo->globalStrings[nameIndex], "SLMSize" ) == 0)
762 {
763 if (useVisaApi)
764 {
765 m_kernelInfo->kernelSLMSize = attribute->getValue()[0];
766 }
767 else
768 {
769 READ_FIELD_FROM_BUF(m_kernelInfo->kernelSLMSize, uint8_t);
770 }
771
772 /* Notes by Stony@2014-04-09
773 * <=CISA3.1: the size is number of 4KB
774 * > CISA3.1: the size is number of 1KB
775 * Here convert it to the number of 1KB if <=CISA 3.1
776 */
777 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion <= 1))
778 {
779 m_kernelInfo->kernelSLMSize = m_kernelInfo->kernelSLMSize * 4;
780 }
781
782 // align to power of 2
783 uint32_t v = m_kernelInfo->kernelSLMSize;
784 v--;
785 v |= v >> 1;
786 v |= v >> 2;
787 v |= v >> 4;
788 v |= v >> 8;
789 v |= v >> 16;
790 v++;
791 m_kernelInfo->kernelSLMSize = ( uint8_t )v;
792 }
793 else if (strcmp(m_kernelInfo->globalStrings[nameIndex], "NoBarrier") == 0)
794 {
795 m_kernelInfo->blNoBarrier = true;
796 if (!useVisaApi)
797 {
798 bytePosition += size;
799 }
800 }
801 else
802 {
803 if (!useVisaApi)
804 {
805 bytePosition += size;
806 }
807 }
808 }
809 }
810
811 if(argSize > m_halMaxValues->maxArgByteSizePerKernel)
812 {
813 CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
814 return CM_EXCEED_KERNEL_ARG_SIZE_IN_BYTE;
815 }
816
817 buf = (uint8_t*)commonISACode;
818
819 if(m_program->IsJitterEnabled())
820 {
821 //m_JitterEnable = true;
822 char *programOptions;
823 m_program->GetKernelOptions(programOptions);
824 //if no options or same options, copy load program's binary. else re-jitter
825 {
826 m_binary = (char *)m_kernelInfo->jitBinaryCode;
827 m_binarySize = m_kernelInfo->jitBinarySize;
828 m_kernelInfo->origBinary = m_kernelInfo->jitBinaryCode;
829 m_kernelInfo->origBinarySize = m_kernelInfo->jitBinarySize;
830 }
831 }
832 else
833 {
834 char* binary = (char*)(buf + m_kernelInfo->genxBinaryOffset );
835
836 //No copy, point to the binary offset in CISA code.
837 m_binary = binary;
838 m_binarySize = m_kernelInfo->genxBinarySize;
839
840 m_kernelInfo->origBinary = binary;
841 m_kernelInfo->origBinarySize = m_kernelInfo->genxBinarySize;
842 }
843
844 if (m_kernelInfo->blNoBarrier)
845 {
846 m_barrierMode = CM_NO_BARRIER;
847 }
848
849 m_movInstConstructor = CmExtensionCreator<CmMovInstConstructor>::CreateClass();
850 if (m_movInstConstructor == nullptr)
851 {
852 CM_ASSERTMESSAGE("Error: Failed to allocate movInstConstructor due to out of system memory.");
853 return CM_OUT_OF_HOST_MEMORY;
854 }
855
856 CmNotifierGroup *notifiers = m_device->GetNotifiers();
857 if (notifiers != nullptr)
858 {
859 notifiers->NotifyKernelCreated(this);
860 }
861
862 return CM_SUCCESS;
863 }
864
865 //*-----------------------------------------------------------------------------
866 //! A CmKernel can run in multiple threads concurrently. This
867 //! fucntion is to set the number of threads.
868 //! INPUT:
869 //! number of threads
870 //! OUTPUT:
871 //! CM_SUCCESS or
872 //! CM_INVALID_ARG_VALUE if the number is larger than CmKernel's capacity
873 //*-----------------------------------------------------------------------------
SetThreadCount(uint32_t count)874 CM_RT_API int32_t CmKernelRT::SetThreadCount(uint32_t count )
875 {
876 INSERT_API_CALL_LOG();
877 // Check per kernel, per task check will be at enqueue time
878 if ((int)count <= 0)
879 return CM_INVALID_ARG_VALUE;
880
881 if (m_threadSpace == nullptr)
882 {
883 if (m_threadCount)
884 {
885 // Setting threadCount twice with different values will cause reset of kernels
886 if (m_threadCount != count)
887 {
888 Reset();
889 m_threadCount = count;
890 m_dirty |= CM_KERNEL_DATA_THREAD_COUNT_DIRTY;
891 }
892 }
893 else // first time
894 {
895 m_threadCount = count;
896 }
897 }
898 return CM_SUCCESS;
899 }
900
GetThreadCount(uint32_t & count)901 int32_t CmKernelRT::GetThreadCount(uint32_t& count )
902 {
903 count = m_threadCount;
904 return CM_SUCCESS;
905 }
906
GetKernelSurfaces(bool * & surfArray)907 int32_t CmKernelRT::GetKernelSurfaces(bool *&surfArray)
908 {
909 surfArray = m_surfaceArray;
910 return CM_SUCCESS;
911 }
912
ResetKernelSurfaces()913 int32_t CmKernelRT::ResetKernelSurfaces()
914 {
915 uint32_t surfacePoolSize = m_surfaceMgr->GetSurfacePoolSize();
916 if (!m_surfaceArray)
917 {
918 m_surfaceArray = MOS_NewArray(bool, surfacePoolSize);
919 if (!m_surfaceArray)
920 {
921 CM_ASSERTMESSAGE("Error: Failed to rest kernel surfaces due to out of system memory.");
922 return CM_OUT_OF_HOST_MEMORY;
923 }
924 }
925 CmSafeMemSet( m_surfaceArray, 0, surfacePoolSize * sizeof( bool ) );
926
927 return CM_SUCCESS;
928 }
929
930 //*-----------------------------------------------------------------------------
931 //| Purpose: Get CmSurface from surface manager.
932 //| Use "value + indexSurfaceArray" to locate its surfaceIndex
933 //| Returns: CmSurface. Null if not found
934 //*-----------------------------------------------------------------------------
GetSurfaceFromSurfaceArray(SurfaceIndex * value,uint32_t indexSurfaceArray)935 CmSurface* CmKernelRT::GetSurfaceFromSurfaceArray( SurfaceIndex* value, uint32_t indexSurfaceArray)
936 {
937 int32_t hr = CM_SUCCESS;
938 CmSurface *surface = nullptr;
939 SurfaceIndex* surfaceIndex = nullptr;
940
941 surfaceIndex = value + indexSurfaceArray;
942 CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceIndex);
943
944 if (surfaceIndex->get_data() == CM_NULL_SURFACE
945 || surfaceIndex->get_data() == 0)
946 {
947 surface = (CmSurface *)CM_NULL_SURFACE;
948 goto finish;
949 }
950
951 m_surfaceMgr->GetSurface(surfaceIndex->get_data(), surface);
952
953 finish:
954 if(hr != CM_SUCCESS)
955 {
956 surface = nullptr;
957 }
958
959 return surface;
960 }
961
962 //*-----------------------------------------------------------------------------
963 //| Purpose: Set kernel arg for single vme surface or multiple vme surfaces
964 //| in surface array. So far, don't support vme surface array in thread arg.
965 //| Returns: Result of the operation.
966 //*-----------------------------------------------------------------------------
SetArgsVme(CM_KERNEL_INTERNAL_ARG_TYPE nArgType,uint32_t argIndex,const void * value,uint32_t nThreadID)967 int32_t CmKernelRT::SetArgsVme(CM_KERNEL_INTERNAL_ARG_TYPE nArgType, uint32_t argIndex, const void *value, uint32_t nThreadID)
968 {
969 uint32_t elementNum = 0;
970 CM_ARG& arg = m_args[ argIndex ];
971 uint32_t totalVmeArgValueSize = 0;
972 uint32_t totalSurfacesInVme = 0;
973 uint32_t tempVmeArgValueSize = 0;
974 uint32_t vmeArgValueOffset = 0;
975 uint32_t lastVmeSurfCount = 0;
976 CmSurfaceVme* surfVme = nullptr;
977 uint8_t *vmeArgValueArray = nullptr;
978 uint16_t *vmeCmIndexArray = nullptr;
979 int32_t hr = CM_SUCCESS;
980
981 //Get Number of elements in surface array
982 if (arg.unitVmeArraySize == 0)
983 { //First Time
984 elementNum = arg.unitSize / sizeof(uint32_t);
985 }
986 else
987 {
988 elementNum = arg.unitVmeArraySize;
989 }
990
991 //Get Size of vmeIndexArray and vmeCmIndexArray.
992 for(uint32_t i=0; i< elementNum; i++)
993 {
994 if (((SurfaceIndex*)(value)+i)->get_data() == 0 || ((SurfaceIndex*)(value)+i)->get_data() == CM_NULL_SURFACE)
995 {
996 tempVmeArgValueSize = sizeof(CM_HAL_VME_ARG_VALUE);
997 totalVmeArgValueSize += tempVmeArgValueSize;
998 totalSurfacesInVme++;
999 }
1000 else
1001 {
1002 surfVme = static_cast<CmSurfaceVme*>(GetSurfaceFromSurfaceArray((SurfaceIndex*)value, i));
1003 CM_CHK_NULL_GOTOFINISH_CMERROR(surfVme);
1004 tempVmeArgValueSize = surfVme->GetVmeCmArgSize();
1005 totalVmeArgValueSize += tempVmeArgValueSize;
1006 totalSurfacesInVme += surfVme->GetTotalSurfacesCount();
1007 }
1008 }
1009
1010 // Allocate and Zero Memory for arg.pValue and arg.surfIndex
1011 // arg.pValue : an array of CM_HAL_VME_ARG_VALUE structure followed by an array of reference surfaces
1012 // arg.surfIndex : an array listing all the Cm surface indexes, in the order of current, fw surfaces, bw surfaces
1013
1014 if (arg.unitSize < totalVmeArgValueSize) // need to re-allocate larger area)
1015 {
1016 if (arg.value)
1017 {
1018 MosSafeDeleteArray(arg.value);
1019 }
1020 arg.value = MOS_NewArray(uint8_t, totalVmeArgValueSize);
1021
1022 if (arg.surfIndex)
1023 {
1024 MosSafeDeleteArray(arg.surfIndex);
1025 }
1026 arg.surfIndex = MOS_NewArray(uint16_t, totalSurfacesInVme);
1027 }
1028
1029 CM_CHK_NULL_GOTOFINISH_CMERROR(arg.value);
1030 CmSafeMemSet(arg.value, 0, totalVmeArgValueSize);
1031 CM_CHK_NULL_GOTOFINISH_CMERROR(arg.surfIndex);
1032 CmSafeMemSet(arg.surfIndex, 0, totalSurfacesInVme * sizeof(uint16_t));
1033
1034 //Set each Vme Surface
1035 for (uint32_t i = 0; i< elementNum; i++)
1036 {
1037 if (((SurfaceIndex*)(value)+i)->get_data() == 0 || ((SurfaceIndex*)(value)+i)->get_data() == CM_NULL_SURFACE)
1038 {
1039 PCM_HAL_VME_ARG_VALUE vmeArg = (PCM_HAL_VME_ARG_VALUE)(arg.value + vmeArgValueOffset);
1040 vmeArg->fwRefNum = 0;
1041 vmeArg->bwRefNum = 0;
1042 vmeArg->curSurface = CM_NULL_SURFACE;
1043 tempVmeArgValueSize = sizeof(CM_HAL_VME_ARG_VALUE);
1044 vmeArgValueOffset += tempVmeArgValueSize;
1045 arg.surfIndex[lastVmeSurfCount] = CM_NULL_SURFACE;
1046 lastVmeSurfCount++;
1047 }
1048 else
1049 {
1050 surfVme = static_cast<CmSurfaceVme*>(GetSurfaceFromSurfaceArray((SurfaceIndex*)value, i));
1051 CM_CHK_NULL_GOTOFINISH_CMERROR(surfVme);
1052 SetArgsSingleVme(surfVme, arg.value + vmeArgValueOffset, arg.surfIndex + lastVmeSurfCount);
1053 tempVmeArgValueSize = surfVme->GetVmeCmArgSize();
1054 vmeArgValueOffset += tempVmeArgValueSize;
1055 lastVmeSurfCount += surfVme->GetTotalSurfacesCount();
1056 }
1057 }
1058
1059 if ( nArgType == CM_KERNEL_INTERNEL_ARG_PERKERNEL ) // per kernel arg
1060 {
1061 // First time set
1062 if( !arg.value )
1063 { // Increment size kernel arguments will take up in CURBE
1064 m_sizeInCurbe += CM_ARGUMENT_SURFACE_SIZE * elementNum;
1065 }
1066
1067 arg.unitCount = 1;
1068 arg.isDirty = true;
1069 arg.isSet = true;
1070 arg.unitKind = ARG_KIND_SURFACE_VME;
1071 arg.unitSize = (uint16_t)totalVmeArgValueSize; // the unitSize can't represent surfaces count here
1072 arg.unitVmeArraySize = elementNum;
1073
1074 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1075 m_perKernelArgExists = true;
1076 }
1077 else
1078 {
1079 // Thread arg doesn't support VME surfaces as it is rarely used and it is complex to implement,
1080 // since each thread may has different surface number in its vme surface argment.
1081 hr = CM_THREAD_ARG_NOT_ALLOWED;
1082 }
1083
1084 finish:
1085 if(hr != CM_SUCCESS)
1086 {
1087 MosSafeDeleteArray(arg.value);
1088 MosSafeDeleteArray(arg.surfIndex);
1089 }
1090 return hr;
1091
1092 }
1093
1094 //*-----------------------------------------------------------------------------
1095 //| Purpose: Fill arg for a single vme surface.
1096 //| vmeIndexArray points to arg.pValue
1097 //| vmeCmIndexArray points to arg.surfIndex
1098 //| Returns: Result of the operation.
1099 //*-----------------------------------------------------------------------------
SetArgsSingleVme(CmSurfaceVme * vmeSurface,uint8_t * vmeArgValueArray,uint16_t * cmSufacesArray)1100 int32_t CmKernelRT::SetArgsSingleVme(CmSurfaceVme* vmeSurface, uint8_t *vmeArgValueArray, uint16_t *cmSufacesArray)
1101 {
1102
1103 int32_t hr = CM_SUCCESS;
1104 CM_SURFACE_MEM_OBJ_CTRL memCtl;
1105 uint32_t vmeBackwardSurfaceCount = 0;
1106 uint32_t vmeForwardSurfaceCount = 0;
1107 uint32_t vmeCurrentSurfaceIndex = 0;
1108 uint16_t vmeCurrentCmIndex = 0;
1109 int32_t vmeIndexArrayPosition = 0; // Offset for vmeIndexArray
1110 int32_t vmeCmIndexArrayPosition = 0; // Offset for vmeCmIndexArray
1111 uint32_t tempOutput = 0;
1112 uint32_t cmSurfArrayIdx = 0;
1113 uint32_t surfStateWidth = 0;
1114 uint32_t surfStateHeight = 0;
1115
1116 uint32_t *fArray = nullptr;
1117 uint32_t *bArray = nullptr;
1118 uint32_t *fCmIndex = nullptr;
1119 uint32_t *bCmIndex = nullptr;
1120
1121 uint32_t *fwSurfInArg = nullptr;
1122 uint32_t *bwSurfInArg = nullptr;
1123
1124 CmSurface *surface = nullptr;
1125 PCM_HAL_VME_ARG_VALUE vmeArg = (PCM_HAL_VME_ARG_VALUE)vmeArgValueArray;
1126
1127 CM_CHK_NULL_GOTOFINISH_CMERROR(vmeSurface);
1128 CM_CHK_NULL_GOTOFINISH_CMERROR(vmeArg);
1129 CM_CHK_NULL_GOTOFINISH_CMERROR(cmSufacesArray);
1130
1131 if(vmeSurface == (CmSurfaceVme *)CM_NULL_SURFACE)
1132 {
1133 vmeArg->fwRefNum = 0;
1134 vmeArg->bwRefNum = 0;
1135 vmeArg->curSurface = CM_NULL_SURFACE;
1136 cmSufacesArray[cmSurfArrayIdx] = CM_NULL_SURFACE;
1137 return hr;
1138 }
1139
1140 // Get Vme Backward Forward Surface Count
1141 vmeSurface->GetIndexBackwardCount(vmeBackwardSurfaceCount);
1142 vmeSurface->GetIndexForwardCount(vmeForwardSurfaceCount);
1143
1144 vmeArg->fwRefNum = vmeForwardSurfaceCount;
1145 vmeArg->bwRefNum = vmeBackwardSurfaceCount; // these two numbers must be set before any other operations
1146
1147 vmeSurface->GetSurfaceStateResolution(vmeArg->surfStateParam.surfaceStateWidth, vmeArg->surfStateParam.surfaceStateHeight);
1148
1149 vmeSurface->GetIndexForwardArray(fArray);
1150 vmeSurface->GetIndexBackwardArray(bArray);
1151 vmeSurface->GetIndexCurrent(vmeCurrentSurfaceIndex);
1152
1153 vmeSurface->GetCmIndexCurrent(vmeCurrentCmIndex);
1154 vmeSurface->GetCmIndexForwardArray(fCmIndex);
1155 vmeSurface->GetCmIndexBackwardArray(bCmIndex);
1156
1157 cmSufacesArray[cmSurfArrayIdx++] = vmeCurrentCmIndex;
1158
1159 // Set Current Vme Surface
1160 m_surfaceMgr->GetSurface(vmeCurrentCmIndex, surface);
1161 CM_CHK_NULL_GOTOFINISH_CMERROR(surface);
1162
1163 vmeArg->curSurface = vmeCurrentSurfaceIndex;
1164
1165 //Set Forward Vme Surfaces
1166 fwSurfInArg = findFwRefInVmeArg(vmeArg);
1167 for (uint32_t i = 0; i < vmeForwardSurfaceCount; i++)
1168 {
1169 GetVmeSurfaceIndex( fArray, fCmIndex, i, &tempOutput);
1170 fwSurfInArg[i] = tempOutput;
1171 cmSufacesArray[cmSurfArrayIdx++] = (uint16_t)fCmIndex[i];
1172 }
1173
1174 //Set Backward Vme Surfaces
1175 bwSurfInArg = findBwRefInVmeArg(vmeArg);
1176 for (uint32_t i = 0; i < vmeBackwardSurfaceCount; i++)
1177 {
1178 GetVmeSurfaceIndex( bArray, bCmIndex, i, &tempOutput);
1179 bwSurfInArg[i] = tempOutput;
1180 cmSufacesArray[cmSurfArrayIdx++] = (uint16_t)bCmIndex[i];
1181 }
1182
1183 finish:
1184 return hr;
1185 }
1186
1187 //*-----------------------------------------------------------------------------
1188 //| Purpose: Get Vme Surface Index with memory object setting .
1189 //| Output value will be filled into arg.pValue
1190 //| Returns: Result of the operation.
1191 //*-----------------------------------------------------------------------------
GetVmeSurfaceIndex(uint32_t * vmeIndexArray,uint32_t * vmeCmIndexArray,uint32_t index,uint32_t * outputValue)1192 int32_t CmKernelRT::GetVmeSurfaceIndex(
1193 uint32_t *vmeIndexArray,
1194 uint32_t *vmeCmIndexArray,
1195 uint32_t index,
1196 uint32_t *outputValue)
1197 {
1198 int32_t hr = CM_SUCCESS;
1199 uint32_t value = vmeIndexArray[index];
1200
1201 if (vmeIndexArray[index] == CM_INVALID_VME_SURFACE)
1202 {
1203 value = CM_NULL_SURFACE;
1204 }
1205
1206 *outputValue = value;
1207
1208 return hr;
1209 }
1210
1211 //*-----------------------------------------------------------------------------
1212 //| Purpose: Set arguments for function SetKernelArg().
1213 //| Kernel argument is surface array.
1214 //! INPUT:
1215 //! 1) Current index in surface array
1216 //! 2) Index of kernel argument
1217 //! 3) Surface count in surface array
1218 //! 4) Pointer to current surface in surface array.
1219 //! 5) Current surface index
1220 //! 6) Pointer to argument value
1221 //! 7) value of surface handle combined with memory object control
1222 //! 8) Original surface index for each surface in array
1223 //| Returns: Result of the operation.
1224 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::SetArgsInternalSurfArray(
    int32_t offset,uint32_t kernelArgIndex,
    int32_t surfCount, CmSurface* currentSurface,
    uint32_t currentSurfIndex, SurfaceIndex* value,
    uint32_t surfValue[], uint16_t origSurfIndex[])
{
    // Walks a kernel-arg surface array starting at element `offset`, and for
    // each element records: the device handle/register index in surfValue[],
    // the original Cm surface index in origSurfIndex[], and the per-element
    // surface kind in m_args[kernelArgIndex].surfArrayArg[].
    // Returns CM_SUCCESS or a CM error code on allocation failure, a destroyed
    // surface, or an unsupported surface type.
    // NOTE(review): memCtl is declared but not used in this function.
    CM_SURFACE_MEM_OBJ_CTRL memCtl;
    uint32_t surfRegTableIndex = 0;  // surface registration table index (2D path)
    uint32_t handle = 0;             // driver handle (1D / 2D-UP / 3D / state buffer paths)
    uint32_t samplerIndex;
    uint16_t samplerCmIndex;
    uint32_t surfaceArraySize = 0;   // size of the device surface registration table

    m_surfaceMgr->GetSurfaceArraySize(surfaceArraySize);
    // (Re)allocate the per-element bookkeeping for this kernel argument.
    MosSafeDeleteArray(m_args[kernelArgIndex].surfArrayArg); // delete it if it was allocated
    m_args[kernelArgIndex].surfArrayArg = MOS_NewArray(SURFACE_ARRAY_ARG, surfCount);
    if (!m_args[kernelArgIndex].surfArrayArg)
    {
        CM_ASSERTMESSAGE("Error: Out of system memory.");
        return CM_OUT_OF_HOST_MEMORY;
    }
    CmSafeMemSet((void *)m_args[kernelArgIndex].surfArrayArg, 0, sizeof(SURFACE_ARRAY_ARG) * surfCount);
    while (offset < surfCount)
    {
        // Dispatch on the concrete runtime type of the current surface; each
        // case records handle/index, original index, and the arg kind.
        switch (currentSurface->Type())
        {
        case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
        {
            CmSurface2DRT* surf2D = static_cast<CmSurface2DRT*>(currentSurface);

            // An aliased 2D surface must be flagged so kernel data is rebuilt.
            uint32_t numAliases = 0;
            surf2D->GetNumAliases(numAliases);
            if (numAliases)
            {
                m_args[kernelArgIndex].aliasCreated = true;
            }
            else
            {
                m_args[kernelArgIndex].aliasCreated = false;
            }

            //set memory object control
            surf2D->GetIndexFor2D(surfRegTableIndex);

            surfValue[offset] = surfRegTableIndex;
            origSurfIndex[offset] = (uint16_t)currentSurfIndex;

            m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_2D;
            m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_2D;

            break;
        }
        case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
        {
            CmBuffer_RT* surf1D = static_cast<CmBuffer_RT*>(currentSurface);

            // Alias flag: same handling as the 2D case above.
            uint32_t numAliases = 0;
            surf1D->GetNumAliases(numAliases);
            if (numAliases)
            {
                m_args[kernelArgIndex].aliasCreated = true;
            }
            else
            {
                m_args[kernelArgIndex].aliasCreated = false;
            }

            //set memory object control
            surf1D->GetHandle(handle);

            surfValue[offset] = handle;
            origSurfIndex[offset] = (uint16_t)currentSurfIndex;

            m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_1D;
            m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_1D;
            break;
        }
        case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
        {
            CmSurface2DUPRT* surf2DUP = static_cast<CmSurface2DUPRT*>(currentSurface);

            //set memory object
            surf2DUP->GetHandle(handle);

            surfValue[offset] = handle;
            origSurfIndex[offset] = (uint16_t)currentSurfIndex;

            m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_2D_UP;
            m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_2D_UP;
            break;
        }
        case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
        {
            CmSurface3DRT* surf3D = static_cast<CmSurface3DRT*>(currentSurface);

            surf3D->GetHandle(handle);

            surfValue[offset] = handle;
            origSurfIndex[offset] = (uint16_t)currentSurfIndex;

            m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_3D;
            m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_3D;

            break;
        }

        case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER:
        {
            CmStateBuffer* stateBuffer = static_cast< CmStateBuffer* >( currentSurface );
            stateBuffer->GetHandle( handle );

            surfValue[ offset ] = handle;
            origSurfIndex[ offset ] = ( uint16_t )currentSurfIndex;

            m_args[ kernelArgIndex ].surfArrayArg[ offset ].argKindForArray = ARG_KIND_STATE_BUFFER;
            m_args[ kernelArgIndex ].unitKind = ARG_KIND_STATE_BUFFER;

            break;
        }

        //sampler surface
        case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
        {
            CmSurfaceSampler* surfSampler = static_cast <CmSurfaceSampler *> (currentSurface);
            surfSampler->GetHandle(samplerIndex);
            surfSampler->GetCmIndexCurrent(samplerCmIndex);

            // Reseat currentSurface from the sampler's backing Cm index;
            // a null result is treated as an error (presumably the backing
            // surface was destroyed — verify against CmSurfaceSampler).
            m_surfaceMgr->GetSurface(samplerCmIndex, currentSurface);
            if (!currentSurface)
            {
                CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
                return CM_NULL_POINTER;
            }

            surfValue[offset] = samplerIndex;
            origSurfIndex[offset] = (uint16_t)samplerCmIndex;

            // Arg kind depends on the sampler surface's underlying type.
            SAMPLER_SURFACE_TYPE type;
            surfSampler->GetSurfaceType(type);
            if (type == SAMPLER_SURFACE_TYPE_2D)
            {
                m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER;
                m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER;
            }
            else if (type == SAMPLER_SURFACE_TYPE_2DUP)
            {
                m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE2DUP_SAMPLER;
                m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE2DUP_SAMPLER;
            }
            else if(type == SAMPLER_SURFACE_TYPE_3D)
            {
                m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_3D;
                m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_3D;
            }
            else
            {
                CM_ASSERTMESSAGE("Error: Assign a Sampler surface to the arg which is previously 2D/3D surface.");
                return CM_FAILURE;
            }
            break;
        }
        //sampler8x8surface
        case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
        {
            CmSurfaceSampler8x8* surfSampler8x8 = static_cast <CmSurfaceSampler8x8 *> (currentSurface);
            surfSampler8x8->GetIndexCurrent(samplerIndex);
            surfSampler8x8->GetCmIndex(samplerCmIndex);

            // Reseat currentSurface from the sampler8x8's backing Cm index,
            // as in the sampler case above (note: this path returns
            // CM_FAILURE rather than CM_NULL_POINTER on a null surface).
            m_surfaceMgr->GetSurface(samplerCmIndex, currentSurface);
            if (!currentSurface)
            {
                CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
                return CM_FAILURE;
            }

            surfValue[offset] = samplerIndex;
            origSurfIndex[offset] = (uint16_t)samplerCmIndex;

            CM_SAMPLER8x8_SURFACE type;
            type = surfSampler8x8->GetSampler8x8SurfaceType();
            if (type == CM_VA_SURFACE)
            {
                // VA surfaces additionally carry an address-control mode.
                m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
                m_args[kernelArgIndex].surfArrayArg[offset].addressModeForArray = surfSampler8x8->GetAddressControlMode();
                m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER8X8_VA;
            }
            else if(type == CM_AVS_SURFACE)
            {
                m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
                m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
            }
            else
            {
                CM_ASSERTMESSAGE("Error: Assign a Sampler8x8 surface to the arg which is previously 2D surface.");
                return CM_FAILURE;
            }
            break;
        }
        default:
        {
            CM_ASSERTMESSAGE("Error: No matched surface for surface array");
            return CM_INVALID_ARG_VALUE;
        }
        }
        offset++;
        if (offset < surfCount)
        {
            currentSurfIndex = value[offset].get_data();

            // Skip null entries (raw index 0 or CM_NULL_SURFACE), recording each
            // one as a null surface in the output arrays.
            while ((!currentSurfIndex && (offset < surfCount))||(currentSurfIndex == CM_NULL_SURFACE))
            {
                surfValue[offset] = CM_NULL_SURFACE;
                origSurfIndex[offset] = 0;
                offset++;
                if (offset >= surfCount)
                    break;
                currentSurfIndex = value[offset].get_data();
            }

            if(surfaceArraySize == 0)
            {
                CM_ASSERTMESSAGE("Error: No surface in surface array");
                return CM_NO_AVAILABLE_SURFACE;
            }
            // Indexes beyond the registration table size denote surface aliases;
            // wrap back to the underlying table slot.
            if (currentSurfIndex > surfaceArraySize)
            {
                currentSurfIndex = currentSurfIndex % surfaceArraySize;
            }
        }
        if (offset < surfCount)
        {
            // Fetch the next surface object for the following loop iteration.
            m_surfaceMgr->GetSurface(currentSurfIndex, currentSurface);
            if (nullptr == currentSurface)
            {
                CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
                return CM_FAILURE;
            }
        }
    }
    return CM_SUCCESS;
}
//*-----------------------------------------------------------------------------
// Set arguments for function SetKernelArg() and SetThreadArg()
// Set parameter nArgType to CM_KERNEL_INTERNEL_ARG_PERKERNEL to set a kernel
// (per-kernel) argument; set it to CM_KERNEL_INTERNEL_ARG_PERTHREAD to set
// a per-thread argument
//*-----------------------------------------------------------------------------
SetArgsInternal(CM_KERNEL_INTERNAL_ARG_TYPE nArgType,uint32_t index,size_t size,const void * value,uint32_t nThreadID)1472 int32_t CmKernelRT::SetArgsInternal( CM_KERNEL_INTERNAL_ARG_TYPE nArgType, uint32_t index, size_t size, const void *value, uint32_t nThreadID )
1473 {
1474 uint32_t surfRegTableIndex = 0; // for 2D surf
1475 uint32_t handle = 0; // for 1D surf
1476
1477 uint32_t samplerIndex;
1478 uint16_t samplerCmIndex;
1479 uint32_t samplerIdx = 0;
1480 uint32_t vmeIdx = 0;
1481 uint16_t *surfIndexValue = nullptr;
1482 uint32_t surfaces[CM_MAX_ARGS_PER_KERNEL];
1483 uint16_t surfIndexArray[CM_MAX_ARGS_PER_KERNEL];
1484 std::vector< int > sampler_index_array;
1485
1486 //Clear "set" flag in case user call API to set the same one argument multiple times.
1487 m_args[index].isSet = false;
1488 if( m_args[ index ].unitKind == ARG_KIND_GENERAL || (m_args[index].unitKind == ARG_KIND_GENERAL_DEPVEC) || (m_args[index].unitKind == ARG_KIND_GENERAL_DEPCNT))
1489 {
1490 if( size != m_args[ index ].unitSize )
1491 {
1492 CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
1493 return CM_INVALID_ARG_SIZE;
1494 }
1495 }
1496 //For surface type
1497 else if (CHECK_SURFACE_TYPE(m_args[index].unitKind,
1498 ARG_KIND_SURFACE,
1499 ARG_KIND_SURFACE_1D,
1500 ARG_KIND_SURFACE_2D,
1501 ARG_KIND_SURFACE_2D_UP,
1502 ARG_KIND_SURFACE_3D,
1503 ARG_KIND_SURFACE_SAMPLER,
1504 ARG_KIND_SURFACE2DUP_SAMPLER,
1505 ARG_KIND_SURFACE_VME,
1506 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
1507 ARG_KIND_SURFACE_SAMPLER8X8_VA,
1508 ARG_KIND_SURFACE_2D_SCOREBOARD,
1509 ARG_KIND_STATE_BUFFER
1510 ))
1511 {
1512
1513 // this code is to convert SurfaceIndex object to index of type uint32_t,
1514 // which is expected by commonISA/genBinary
1515 // index is the index of the surface in surface registration table of CM device
1516 // in driver
1517
1518 int signatureSize = m_args[index].unitSize;
1519 int numSurfaces = signatureSize / sizeof(int);
1520 SurfaceIndex* surfIndex = (SurfaceIndex*)value;
1521 if (surfIndex == (SurfaceIndex*)CM_NULL_SURFACE)
1522 {
1523 m_args[index].isSet = true;
1524 m_args[index].unitCount = 1; // per kernel arg
1525 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1526 m_perKernelArgExists = true;
1527 m_args[index].isDirty = true;
1528 m_args[index].isNull = true;
1529 return CM_SUCCESS;
1530 }
1531 else
1532 {
1533 // In case that CM_NULL_SURFACE was set at last time and will
1534 // set a read surface index this time. So need set isDirty as
1535 // well to indicate update kernel data.
1536 if (m_args[index].isNull == true)
1537 {
1538 m_args[index].isDirty = true;
1539 m_args[index].isNull = false;
1540 }
1541 }
1542
1543 m_args[index].isNull = false;
1544 CM_SURFACE_MEM_OBJ_CTRL memCtl;
1545
1546 if (m_args[index].unitKind != ARG_KIND_SURFACE_VME)
1547 {
1548 if (size != sizeof(SurfaceIndex)* numSurfaces)
1549 {
1550 CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
1551 return CM_INVALID_ARG_SIZE;
1552 }
1553 }
1554
1555 uint32_t surfIndexData = surfIndex->get_data();
1556 int i = 0;
1557 uint32_t surfaceArraySize = 0;
1558 m_surfaceMgr->GetSurfaceArraySize(surfaceArraySize);
1559
1560 if (surfIndexData > surfaceArraySize)
1561 {
1562 if (m_args[index].aliasIndex != surfIndexData)
1563 {
1564 m_args[index].isDirty = true;
1565 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1566 m_args[index].aliasIndex = surfIndexData;
1567 }
1568
1569 surfIndexData = surfIndexData % surfaceArraySize;
1570 }
1571 else
1572 {
1573 m_args[index].aliasIndex = 0;
1574 }
1575
1576 while (!surfIndexData && (i < numSurfaces))
1577 {
1578 surfaces[i] = CM_NULL_SURFACE;
1579 surfIndexArray[i] = 0;
1580 i++;
1581 if (i >= numSurfaces)
1582 break;
1583 surfIndexData = surfIndex[i].get_data();
1584 }
1585
1586 if (i >= numSurfaces)
1587 {
1588 m_args[index].unitKind = ARG_KIND_SURFACE_2D;
1589 value = surfaces;
1590 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1591 m_args[index].unitSize = (uint16_t)size;
1592 goto finish;
1593 }
1594 CmSurface* surface = nullptr;
1595 m_surfaceMgr->GetSurface(surfIndexData, surface);
1596 if (nullptr == surface)
1597 {
1598 CM_ASSERTMESSAGE("Error: Invalid surface.");
1599 return CM_FAILURE;
1600 }
1601
1602 if (SurfTypeToArgKind(surface->Type()) != m_args[index].unitKind)
1603 { // if surface type changes i.e 2D <-> 2DUP Need to set bIsDrity as true
1604 m_args[index].isDirty = true;
1605 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1606 }
1607
1608 uint32_t cisaMajorVersion, cisaMinorVersion;
1609 m_program->GetCISAVersion(cisaMajorVersion, cisaMinorVersion);
1610
1611 //This path is for surface array, including 1D, 2D, 3D,samplersurface, samplersurface8x8
1612 if ((numSurfaces > 1) && (surface->Type() != CM_ENUM_CLASS_TYPE_CMSURFACEVME))
1613 {
1614 int32_t hr = SetArgsInternalSurfArray(i,index, numSurfaces, surface, surfIndexData, surfIndex,surfaces, surfIndexArray);
1615 if (hr != CM_SUCCESS)
1616 {
1617 CM_ASSERTMESSAGE("Error: SetArgsInternal for surface array failed!\n");
1618 return CM_INVALID_ARG_VALUE;
1619 }
1620 value = surfaces;
1621 surfIndexValue = surfIndexArray;
1622 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1623 m_args[index].unitSize = (uint16_t)size;
1624 }
1625 else
1626 { //This is for single surface and surface array for VME surface
1627 switch (surface->Type())
1628 {
1629 case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
1630 {
1631 CmSurface2DRT* surf2D = static_cast<CmSurface2DRT*>(surface);
1632
1633 uint32_t numAliases = 0;
1634 surf2D->GetNumAliases(numAliases);
1635 if (numAliases)
1636 {
1637 m_args[index].aliasCreated = true;
1638 }
1639 else
1640 {
1641 m_args[index].aliasCreated = false;
1642 }
1643
1644 //set memory object control
1645 surf2D->GetIndexFor2D(surfRegTableIndex);
1646
1647 surfaces[i] = surfRegTableIndex;
1648 surfIndexArray[i] = (uint16_t)surfIndexData;
1649
1650 value = surfaces;
1651 surfIndexValue = surfIndexArray;
1652
1653 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1654 m_args[index].unitSize = (uint16_t)size;
1655
1656 if ((m_args[index].unitKind == ARG_KIND_SURFACE) || (m_args[index].unitKind == ARG_KIND_SURFACE_2D_UP)) // first time or last time is set to 2DUP
1657 {
1658 m_args[index].unitKind = ARG_KIND_SURFACE_2D;
1659 if (m_args[index].surfaceKind == SAMPLER_SURF)
1660 m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER;
1661 }
1662 else if (m_args[index].unitKind != ARG_KIND_SURFACE_2D &&
1663 m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER &&
1664 m_args[index].unitKind != ARG_KIND_SURFACE2DUP_SAMPLER &&
1665 m_args[index].unitKind != ARG_KIND_SURFACE_2D_SCOREBOARD)
1666 {
1667 CM_ASSERTMESSAGE("Error: Assign a 2D surface to the arg which is previously assigned 1D surface, 3D surface, or VME surface.");
1668 return CM_INVALID_ARG_VALUE;
1669 }
1670 break;
1671 }
1672 case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
1673 {
1674 CmBuffer_RT* surf1D = static_cast<CmBuffer_RT*>(surface);
1675
1676 uint32_t numAliases = 0;
1677 surf1D->GetNumAliases(numAliases);
1678 if (numAliases)
1679 {
1680 m_args[index].aliasCreated = true;
1681 }
1682 else
1683 {
1684 m_args[index].aliasCreated = false;
1685 }
1686
1687 //set memory object control
1688 surf1D->GetHandle(handle);
1689
1690 surfaces[i] = handle;
1691 surfIndexArray[i] = (uint16_t)surfIndexData;
1692
1693 value = surfaces;
1694 surfIndexValue = surfIndexArray;
1695
1696 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1697 m_args[index].unitSize = (uint16_t)size;
1698
1699 if (m_args[index].unitKind == ARG_KIND_SURFACE)
1700 {
1701 m_args[index].unitKind = ARG_KIND_SURFACE_1D;
1702 }
1703 else if (m_args[index].unitKind != ARG_KIND_SURFACE_1D)
1704 {
1705 CM_ASSERTMESSAGE("Error: Assign a 1D surface to the arg which is previously assigned 2D surface, 3D surface, or VME surface.");
1706 return CM_INVALID_ARG_VALUE;
1707 }
1708 break;
1709 }
1710 case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
1711 {
1712 CmSurface2DUPRT* surf2DUP = static_cast<CmSurface2DUPRT*>(surface);
1713
1714 //set memory object
1715 surf2DUP->GetHandle(handle);
1716
1717 surfaces[i] = handle;
1718 surfIndexArray[i] = (uint16_t)surfIndexData;
1719
1720 value = surfaces;
1721 surfIndexValue = surfIndexArray;
1722
1723 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1724 m_args[index].unitSize = (uint16_t)size;
1725
1726 if ((m_args[index].unitKind == ARG_KIND_SURFACE) || (m_args[index].unitKind == ARG_KIND_SURFACE_2D)) // first time or last time is set to 2D
1727 {
1728 m_args[index].unitKind = ARG_KIND_SURFACE_2D_UP;
1729 }
1730 else if (m_args[index].unitKind != ARG_KIND_SURFACE_2D_UP)
1731 {
1732 CM_ASSERTMESSAGE("Error: Assign a 2D surface UP to the arg which is previously assigned other surfaces.");
1733 return CM_INVALID_ARG_VALUE;
1734 }
1735
1736 break;
1737 }
1738 case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
1739 {
1740 CmSurface3DRT* surf3D = static_cast<CmSurface3DRT*>(surface);
1741
1742 surf3D->GetHandle(handle);
1743
1744 surfaces[i] = handle;
1745 surfIndexArray[i] = (uint16_t)surfIndexData;
1746
1747 value = surfaces;
1748 surfIndexValue = surfIndexArray;
1749
1750 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1751 m_args[index].unitSize = (uint16_t)size;
1752
1753 if (m_args[index].unitKind == ARG_KIND_SURFACE) // first time
1754 {
1755 m_args[index].unitKind = ARG_KIND_SURFACE_3D;
1756 }
1757 else if (m_args[index].unitKind != ARG_KIND_SURFACE_3D)
1758 {
1759 CM_ASSERTMESSAGE("Error: Assign a 3D surface to the arg which is previously assigned 1D surface, 2D surface or VME surface");
1760 return CM_INVALID_ARG_VALUE;
1761 }
1762 break;
1763 }
1764
1765 case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER:
1766 {
1767 CmStateBuffer* stateBuffer = static_cast< CmStateBuffer* >( surface );
1768 stateBuffer->GetHandle( handle );
1769
1770 surfaces[ i ] = handle;
1771 surfIndexArray[ i ] = ( uint16_t )surfIndexData;
1772
1773 value = surfaces;
1774 surfIndexValue = surfIndexArray;
1775
1776 size = ( size / sizeof( SurfaceIndex ) ) * sizeof( uint32_t );
1777 m_args[ index ].unitSize = ( uint16_t )size;
1778
1779 if ( m_args[ index ].unitKind == ARG_KIND_SURFACE ) // first time
1780 {
1781 m_args[ index ].unitKind = ARG_KIND_STATE_BUFFER;
1782 }
1783 else if ( m_args[ index ].unitKind != ARG_KIND_STATE_BUFFER )
1784 {
1785 CM_ASSERTMESSAGE( "Error: Assign a state buffer to the arg which is previously assigned 1D surface, 2D surface, 3D surface or VME surface" );
1786 return CM_INVALID_ARG_VALUE;
1787 }
1788 break;
1789 }
1790
1791 case CM_ENUM_CLASS_TYPE_CMSURFACEVME:
1792 {
1793 return SetArgsVme(nArgType, index, value, nThreadID);
1794 }
1795 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
1796 {
1797 CmSurfaceSampler8x8* surfSampler8x8 = static_cast <CmSurfaceSampler8x8 *> (surface);
1798 surfSampler8x8->GetIndexCurrent(samplerIndex);
1799 surfSampler8x8->GetCmIndex(samplerCmIndex);
1800 if (samplerCmIndex > surfaceArraySize)
1801 {
1802 m_args[index].aliasIndex = samplerCmIndex;
1803 m_args[index].aliasCreated = true;
1804 samplerCmIndex %= surfaceArraySize;
1805 }
1806
1807 m_surfaceMgr->GetSurface(samplerCmIndex, surface);
1808 if (!surface)
1809 {
1810 CM_ASSERTMESSAGE("Error: Invalid sampler8x8 surface.");
1811 return CM_FAILURE;
1812 }
1813
1814 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1815 m_args[index].unitSize = (uint16_t)size;
1816
1817 value = &samplerIndex;
1818 surfIndexValue = &samplerCmIndex;
1819
1820 if (m_args[index].unitKind == ARG_KIND_SURFACE)
1821 {
1822 if (surfSampler8x8->GetSampler8x8SurfaceType() == CM_VA_SURFACE)
1823 {
1824 m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
1825 m_args[index].nCustomValue = surfSampler8x8->GetAddressControlMode();
1826 }
1827 else
1828 {
1829 m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
1830 }
1831 }
1832 else if (m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER8X8_AVS &&
1833 m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER8X8_VA)
1834 {
1835 CM_ASSERTMESSAGE("Error: Assign a Sampler8x8 surface to the arg which is previously 2D surface.");
1836 return CM_FAILURE;
1837 }
1838 break;
1839 }
1840 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
1841 {
1842 CmSurfaceSampler* surfSampler = static_cast <CmSurfaceSampler *> (surface);
1843 surfSampler->GetHandle(samplerIndex);
1844 surfSampler->GetCmIndexCurrent(samplerCmIndex);
1845
1846 m_surfaceMgr->GetSurface(samplerCmIndex, surface);
1847 if (!surface)
1848 {
1849 CM_ASSERTMESSAGE("Error: Invalid sampler surface.");
1850 return CM_FAILURE;
1851 }
1852
1853 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1854 m_args[index].unitSize = (uint16_t)size;
1855
1856 value = &samplerIndex;
1857 surfIndexValue = &samplerCmIndex;
1858
1859 if (m_args[index].unitKind == ARG_KIND_SURFACE)
1860 { // first time
1861 SAMPLER_SURFACE_TYPE type;
1862 surfSampler->GetSurfaceType(type);
1863 if (type == SAMPLER_SURFACE_TYPE_2D)
1864 {
1865 m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER;
1866 }
1867 else if (type == SAMPLER_SURFACE_TYPE_2DUP)
1868 {
1869 m_args[index].unitKind = ARG_KIND_SURFACE2DUP_SAMPLER;
1870 }
1871 else
1872 {
1873 m_args[index].unitKind = ARG_KIND_SURFACE_3D;
1874 }
1875
1876 }
1877 else if ((m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER) &&
1878 m_args[index].unitKind != ARG_KIND_SURFACE2DUP_SAMPLER &&
1879 (m_args[index].unitKind != ARG_KIND_SURFACE_3D))
1880 {
1881 CM_ASSERTMESSAGE("Error: Assign a Sampler surface to the arg which is previously 2D/3D surface.");
1882 return CM_FAILURE;
1883 }
1884 break;
1885 }
1886 default:
1887 {
1888 CM_ASSERTMESSAGE("Error: Invalid surface type.");
1889 return CM_INVALID_ARG_VALUE;
1890 }
1891 }
1892 }
1893 }
1894 else if (m_args[index].unitKind == ARG_KIND_SAMPLER)
1895 {
1896 unsigned int numSamplers = m_args[index].unitSize / sizeof(int);
1897
1898 if (numSamplers > 1)
1899 {
1900 size = numSamplers * sizeof(unsigned int);
1901
1902 for (unsigned int i = 0; i < numSamplers; i++)
1903 {
1904 SamplerIndex* samplerIndex = (SamplerIndex*)value + i;
1905 samplerIdx = samplerIndex->get_data();
1906 sampler_index_array.push_back(samplerIdx);
1907 }
1908 }
1909 else
1910 {
1911 SamplerIndex* samplerIndex = (SamplerIndex*)value;
1912 samplerIdx = ((SamplerIndex*)value)->get_data();
1913 size = sizeof(unsigned int);
1914 m_args[index].unitSize = (uint16_t)size;
1915 value = &samplerIdx;
1916 }
1917 }
1918
1919 finish:
1920 if ( nArgType == CM_KERNEL_INTERNEL_ARG_PERKERNEL ) // per kernel arg
1921 {
1922 CM_ARG& arg = m_args[ index ];
1923
1924 // Assume from now on, size is valid, i.e. confirmed with function signature
1925 if( !arg.value )
1926 {
1927 //Increment size kernel arguments will take up in CURBE
1928 uint32_t tempUnitSize = m_args[ index ].unitSize;
1929 if( (m_args[index].unitKind == ARG_KIND_SURFACE_VME ) ||
1930 (m_args[index].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
1931 (m_args[index].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ))
1932 {
1933 tempUnitSize = CM_ARGUMENT_SURFACE_SIZE;
1934 }
1935
1936 // first setKernelArg or first setKernelArg after each enqueue
1937 arg.value = MOS_NewArray(uint8_t,size);
1938 if( !arg.value )
1939 {
1940 CM_ASSERTMESSAGE("Error: Out of system memory.");
1941 return CM_OUT_OF_HOST_MEMORY;
1942 }
1943
1944 arg.unitCount = 1;
1945
1946 CmSafeMemCopy((void *)arg.value, value, size);
1947
1948 if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
1949 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
1950 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
1951 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
1952 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
1953 ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
1954 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
1955 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
1956 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
1957 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
1958 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
1959 ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
1960 {
1961 arg.surfIndex = MOS_NewArray(uint16_t, (size / sizeof(int32_t)));
1962 if (!arg.surfIndex)
1963 {
1964 CM_ASSERTMESSAGE("Error: Out of system memory.");
1965 MosSafeDeleteArray(arg.value);
1966 return CM_OUT_OF_HOST_MEMORY;
1967 }
1968 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(int32_t) * sizeof(uint16_t));
1969 if( surfIndexValue == nullptr )
1970 {
1971 CM_ASSERTMESSAGE("Error: Pointer to surface index value is null.");
1972 return CM_NULL_POINTER;
1973 }
1974 CmSafeMemCopy((void *)arg.surfIndex, surfIndexValue, size / sizeof(int32_t) * sizeof(uint16_t));
1975 }
1976
1977 if (m_args[index].unitKind == ARG_KIND_SAMPLER)
1978 {
1979 for (unsigned int samplerIndex = 0; samplerIndex < sampler_index_array.size(); samplerIndex++)
1980 {
1981 *( (int *)arg.value + samplerIndex) = sampler_index_array[samplerIndex];
1982 }
1983 }
1984
1985 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1986 arg.isDirty = true;
1987 }
1988 else
1989 {
1990 if( arg.unitCount != 1 )
1991 {
1992 CM_ASSERTMESSAGE("Error: Invalid arg count.");
1993 return CM_FAILURE;
1994 }
1995 if( memcmp( (void *)arg.value, value, size ) != 0 )
1996 {
1997 CmSafeMemCopy((void *)arg.value, value, size);
1998 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1999 arg.isDirty = true;
2000 }
2001 if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2002 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2003 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2004 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2005 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2006 ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2007 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2008 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2009 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2010 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2011 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2012 ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2013 {
2014 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(int32_t) * sizeof(uint16_t));
2015 if( surfIndexValue == nullptr )
2016 {
2017 CM_ASSERTMESSAGE("Error: Pointer to surface index value is null.");
2018 return CM_NULL_POINTER;
2019 }
2020 CmSafeMemCopy((void *)arg.surfIndex, surfIndexValue, size/sizeof(int32_t) * sizeof(uint16_t));
2021 }
2022
2023 if (m_args[index].unitKind == ARG_KIND_SAMPLER)
2024 {
2025 for (unsigned int samplerIndex = 0; samplerIndex < sampler_index_array.size(); samplerIndex++)
2026 {
2027 *((int *)arg.value + samplerIndex) = sampler_index_array[samplerIndex];
2028 }
2029 }
2030 }
2031
2032 m_perKernelArgExists = true;
2033 }
2034 else //per thread arg
2035 {
2036 CM_ARG& arg = m_args[ index ];
2037
2038 // Assume from now on, size is valid, i.e. confirmed with function signature
2039 if( !arg.value )
2040 {
2041 //Increment size per-thread arguments will take up in payload of media object or media object walker commands
2042 m_sizeInPayload += arg.unitSize;
2043 DW_ALIGNMENT(m_sizeInPayload);
2044
2045 // first setThreadArg or first setThreadArg after each enqueue
2046 arg.value = MOS_NewArray(uint8_t, (size * m_threadCount));
2047 if( !arg.value )
2048 {
2049 CM_ASSERTMESSAGE("Error: Out of system memory.");
2050 return CM_OUT_OF_HOST_MEMORY;
2051
2052 }
2053 arg.unitCount = m_threadCount;
2054
2055 uint32_t offset = size * nThreadID;
2056 uint8_t *threadValue = ( uint8_t *)arg.value;
2057 threadValue += offset;
2058
2059 CmSafeMemCopy(threadValue, value, size);
2060 if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2061 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2062 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2063 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2064 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2065 ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2066 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2067 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2068 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2069 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2070 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2071 ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2072 {
2073 arg.surfIndex = MOS_NewArray(uint16_t, (size / sizeof(uint32_t) * m_threadCount));
2074 if( !arg.surfIndex )
2075 {
2076 CM_ASSERTMESSAGE("Error: Out of system memory.");
2077 MosSafeDeleteArray(arg.value);
2078 return CM_OUT_OF_HOST_MEMORY;
2079 }
2080 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(uint32_t) * sizeof(uint16_t) * m_threadCount);
2081 if( surfIndexValue == nullptr )
2082 {
2083 CM_ASSERTMESSAGE("Error: Pointer to surface index value is null.");
2084 return CM_NULL_POINTER;
2085 }
2086 CmSafeMemCopy((void *)(arg.surfIndex + size/sizeof(uint32_t) * nThreadID), surfIndexValue, size/sizeof(uint32_t) * sizeof(uint16_t));
2087 }
2088 m_perThreadArgExists = true;
2089 }
2090 else
2091 {
2092 if( arg.unitCount != m_threadCount )
2093 {
2094 CM_ASSERTMESSAGE("Error: arg count is not matched with thread count.");
2095 return CM_FAILURE;
2096
2097 }
2098 uint32_t offset = size * nThreadID;
2099 uint8_t *threadValue = ( uint8_t *)arg.value;
2100 threadValue += offset;
2101
2102 if( memcmp( threadValue, value, size ) != 0 )
2103 {
2104 CmSafeMemCopy(threadValue, value, size);
2105 m_dirty |= CM_KERNEL_DATA_THREAD_ARG_DIRTY;
2106 arg.isDirty = true;
2107 }
2108 if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2109 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2110 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2111 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2112 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2113 ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2114 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2115 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2116 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2117 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2118 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2119 ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2120 {
2121 if( surfIndexValue == nullptr )
2122 {
2123 CM_ASSERTMESSAGE("Error: Pointer to surface index value is null.");
2124 return CM_NULL_POINTER;
2125 }
2126 CmSafeMemCopy((void *)(arg.surfIndex + size/sizeof(uint32_t) * nThreadID), surfIndexValue, size/sizeof(uint32_t) * sizeof(uint16_t));
2127 }
2128 }
2129 }
2130
2131 m_args[index].isSet = true;
2132
2133 return CM_SUCCESS;
2134 }
2135
2136 //*-----------------------------------------------------------------------------
2137 //! Set per kernel arguments. The total size of all per kernel arguments and per thread
2138 //! arguments should be less than or equal to 256 Bytes (CM_MAX_ARG_SIZE_IN_BYTE).
//! The life time of all per kernel arguments and per thread arguments lasts until the next enqueue,
2140 //! i.e. after enqueue, ALL arguments need to be reset.
2141 //! INPUT:
2142 //! 1) Index of argument in CM kernel function (genx_main). The index is
2143 //! global for per kernel arguments and per thread arguments.
2144 //! 2) Size of the argument.
2145 //! 3) Pointer to argument value.
2146 //! OUTPUT:
2147 //! CM_SUCCESS or
2148 //! CM_INVALID_ARG_INDEX if index is invalid;
2149 //! CM_INVALID_ARG_SIZE if size is invalid;
2150 //! CM_INVALID_ARG_VALUE if value is NULL.
2151 //*-----------------------------------------------------------------------------
SetKernelArg(uint32_t index,size_t size,const void * value)2152 CM_RT_API int32_t CmKernelRT::SetKernelArg(uint32_t index, size_t size, const void * value )
2153 {
2154 INSERT_API_CALL_LOG();
2155 //It should be mutual exclusive with Indirect Data
2156 if(m_kernelPayloadData)
2157 {
2158 CM_ASSERTMESSAGE("Error: SetKernelArg should be mutual exclusive with indirect data.");
2159 return CM_KERNELPAYLOAD_PERKERNELARG_MUTEX_FAIL;
2160 }
2161
2162 if( index >= m_argCount )
2163 {
2164 CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
2165 return CM_INVALID_ARG_INDEX;
2166
2167 }
2168
2169 if( !value)
2170 {
2171 CM_ASSERTMESSAGE("Error: Invalid kernel arg value.");
2172 return CM_INVALID_ARG_VALUE;
2173 }
2174
2175 if( size == 0)
2176 {
2177 CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
2178 return CM_INVALID_ARG_SIZE;
2179 }
2180
2181 int32_t nRetVal = 0;
2182 if ( ( nRetVal = SetArgsInternal( CM_KERNEL_INTERNEL_ARG_PERKERNEL, index, size, value ) ) != CM_SUCCESS )
2183 {
2184 return nRetVal;
2185 }
2186
2187 return CM_SUCCESS;
2188 }
2189
SetKernelArgPointer(uint32_t index,size_t size,const void * value)2190 CM_RT_API int32_t CmKernelRT::SetKernelArgPointer(uint32_t index, size_t size, const void *value)
2191 {
2192 INSERT_API_CALL_LOG();
2193
2194 //It should be mutual exclusive with Indirect Data
2195 if (m_kernelPayloadData)
2196 {
2197 CM_ASSERTMESSAGE("Error: SetKernelArg should be mutual exclusive with indirect data.");
2198 return CM_KERNELPAYLOAD_PERKERNELARG_MUTEX_FAIL;
2199 }
2200
2201 if (index >= m_argCount)
2202 {
2203 CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
2204 return CM_INVALID_ARG_INDEX;
2205 }
2206
2207 if (!value)
2208 {
2209 CM_ASSERTMESSAGE("Error: Invalid kernel arg value.");
2210 return CM_INVALID_ARG_VALUE;
2211 }
2212
2213 uint64_t *argValue = MOS_NewArray(uint64_t, 1);
2214 if (!argValue)
2215 {
2216 CM_ASSERTMESSAGE("Error: Out of system memory.");
2217 return CM_OUT_OF_HOST_MEMORY;
2218 }
2219 CmSafeMemSet(argValue, 0, sizeof(uint64_t));
2220 CmSafeMemCopy(argValue, value, size);
2221
2222 // Get the gfx start address of SVM/stateless buffer.
2223 uint64_t gfxAddress = *(argValue);
2224 MosSafeDeleteArray(argValue);
2225
2226 // Check the gfx start address is valid or not
2227 std::set<CmSurface *> statelessSurfArray = m_surfaceMgr->GetStatelessSurfaceArray();
2228 bool valid = false;
2229 for(auto surface : statelessSurfArray)
2230 {
2231 CmBuffer_RT *buffer = static_cast<CmBuffer_RT *>(surface);
2232 uint64_t startAddress = 0;
2233 buffer->GetGfxAddress(startAddress);
2234 size_t size = buffer->GetSize();
2235
2236 if (gfxAddress >= startAddress
2237 && gfxAddress < (startAddress + size))
2238 {
2239 SurfaceIndex *surfIndex = nullptr;
2240 buffer->GetIndex(surfIndex);
2241 uint32_t surfIndexData = surfIndex->get_data();
2242 m_surfaceArray[surfIndexData] = true;
2243
2244 m_args[index].isStatelessBuffer = true;
2245 m_args[index].index = (uint16_t)surfIndexData;
2246
2247 valid = true;
2248 break;
2249 }
2250 }
2251 if (!valid)
2252 {
2253 CM_ASSERTMESSAGE("Error: the kernel arg pointer is not valid.");
2254 return CM_INVALID_KERNEL_ARG_POINTER;
2255 }
2256
2257 int32_t nRetVal = SetArgsInternal(CM_KERNEL_INTERNEL_ARG_PERKERNEL,
2258 index,
2259 size,
2260 value);
2261 if (nRetVal != CM_SUCCESS)
2262 {
2263 return nRetVal;
2264 }
2265
2266 return CM_SUCCESS;
2267 }
2268
2269 //*-----------------------------------------------------------------------------
2270 //| Purpose: Set Static Buffer
2271 //| Return : The result of operation
2272 //*-----------------------------------------------------------------------------
SetStaticBuffer(uint32_t index,const void * value)2273 CM_RT_API int32_t CmKernelRT::SetStaticBuffer(uint32_t index, const void * value )
2274 {
2275 INSERT_API_CALL_LOG();
2276 if(index >= CM_GLOBAL_SURFACE_NUMBER)
2277 {
2278 CM_ASSERTMESSAGE("Error: Surface Index exceeds max global surface number.");
2279 return CM_INVALID_GLOBAL_BUFFER_INDEX;
2280 }
2281
2282 if(!value)
2283 {
2284 CM_ASSERTMESSAGE("Error: Invalid StaticBuffer arg value.");
2285 return CM_INVALID_BUFFER_HANDLER;
2286 }
2287
2288 SurfaceIndex* surfIndex = (SurfaceIndex* )value;
2289 uint32_t surfIndexData = surfIndex->get_data();
2290 if (surfIndexData >= m_surfaceMgr->GetSurfacePoolSize())
2291 {
2292 CM_ASSERTMESSAGE("Error: StaticBuffer doesn't allow alias index.");
2293 return CM_INVALID_ARG_INDEX;
2294 }
2295
2296 CmSurface* surface = nullptr;
2297 m_surfaceMgr->GetSurface( surfIndexData, surface );
2298 if(surface == nullptr)
2299 {
2300 CM_ASSERTMESSAGE("Error: Invalid surface.");
2301 return CM_INVALID_BUFFER_HANDLER;
2302 }
2303
2304 CmBuffer_RT* surf1D = nullptr;
2305 if ( surface->Type() == CM_ENUM_CLASS_TYPE_CMBUFFER_RT )
2306 {
2307 uint32_t handle = 0; // for 1D surf
2308
2309 surf1D = static_cast< CmBuffer_RT* >( surface );
2310 surf1D->GetHandle( handle );
2311
2312 if (m_globalSurfaces[index] == nullptr)
2313 {
2314 m_globalSurfaces[index] = MOS_New(SurfaceIndex,0);
2315 if( !m_globalSurfaces[index] )
2316 {
2317 CM_ASSERTMESSAGE("Error: Out of system memory.");
2318 return CM_OUT_OF_HOST_MEMORY;
2319 }
2320 }
2321 *m_globalSurfaces[index] = handle;
2322 m_globalCmIndex[index] = surfIndexData;
2323 m_dirty |= CM_KERNEL_DATA_GLOBAL_SURFACE_DIRTY;
2324 }
2325 else
2326 {
2327 CM_ASSERTMESSAGE("Error: StaticBuffer only supports CmBuffer type.");
2328 return CM_INVALID_BUFFER_HANDLER;
2329 }
2330 return CM_SUCCESS;
2331 }
2332
2333 //*-----------------------------------------------------------------------------
2334 //! Set per thread arguments. The total size of all per kernel arguments and per thread
2335 //! arguments should be less than or equal to 256 Bytes
//! The life time of all per kernel arguments and per thread arguments lasts until the next enqueue,
2337 //! i.e. after enqueue, ALL arguments need to be reset.
2338 //! INPUT:
2339 //! 1) Thread index.
2340 //! 2) Index of argument in CM kernel function (genx_main). The index is
2341 //! global for per kernel arguments and per thread arguments.
2342 //! 3) Size of the argument.
2343 //! 4) Pointer to argument .
2344 //! OUTPUT:
2345 //! CM_SUCCESS or
2346 //! CM_INVALID_ARG_INDEX if index is invalid
2347 //! CM_INVALID_ARG_SIZE if size is invalid
2348 //! CM_INVALID_ARG_VALUE if value is nullptr
2349 //*-----------------------------------------------------------------------------
SetThreadArg(uint32_t threadId,uint32_t index,size_t size,const void * value)2350 CM_RT_API int32_t CmKernelRT::SetThreadArg(uint32_t threadId, uint32_t index, size_t size, const void * value )
2351 {
2352 INSERT_API_CALL_LOG();
2353
2354 //It should be mutual exclusive with Indirect Data
2355 if(m_kernelPayloadData)
2356 {
2357 CM_ASSERTMESSAGE("Error: SetThredArg should be mutual exclusive with indirect data.");
2358 return CM_KERNELPAYLOAD_PERTHREADARG_MUTEX_FAIL;
2359 }
2360
2361 if(m_threadCount > m_halMaxValues->maxUserThreadsPerTask || m_threadCount <=0)
2362 {
2363 CM_ASSERTMESSAGE("Error: Minimum or Maximum number of threads exceeded.");
2364 return CM_FAILURE;
2365 }
2366
2367 if( index >= m_argCount )
2368 {
2369 CM_ASSERTMESSAGE("Error: Invalid thread arg count.");
2370 return CM_INVALID_ARG_INDEX;
2371
2372 }
2373
2374 if( threadId >= m_threadCount )
2375 {
2376 CM_ASSERTMESSAGE("Error: thread id exceeds the threadcount.");
2377 return CM_INVALID_THREAD_INDEX;
2378
2379 }
2380
2381 if( !value)
2382 {
2383 CM_ASSERTMESSAGE("Error: Invalid thread arg value.");
2384 return CM_INVALID_ARG_VALUE;
2385 }
2386
2387 if( size == 0)
2388 {
2389 CM_ASSERTMESSAGE("Error: Invalid thread arg size.");
2390 return CM_INVALID_ARG_SIZE;
2391 }
2392
2393 int32_t nRetVal = 0;
2394 if ( ( nRetVal = SetArgsInternal( CM_KERNEL_INTERNEL_ARG_PERTHREAD, index, size, value, threadId ) ) != CM_SUCCESS )
2395 {
2396 return nRetVal;
2397 }
2398
2399 return CM_SUCCESS;
2400 }
2401
2402 //*-----------------------------------------------------------------------------
2403 //| Purpose: Calculate the total size of kernel data
2404 //*-----------------------------------------------------------------------------
CalcKernelDataSize(uint32_t movInstNum,uint32_t numArgs,uint32_t argSize,uint32_t & totalKernelDataSize)2405 int32_t CmKernelRT::CalcKernelDataSize(
2406 uint32_t movInstNum, // [in] the number of move instructions
2407 uint32_t numArgs, // [in] number of args , surface array count
2408 uint32_t argSize, // [in] Size of arguments
2409 uint32_t & totalKernelDataSize) // [out] total size of kernel data
2410 {
2411 int32_t hr = CM_SUCCESS;
2412
2413 uint32_t headSize = ( KERNEL_INFO_SIZE_IN_DWORD + numArgs * PER_ARG_SIZE_IN_DWORD ) * sizeof( uint32_t );
2414 uint32_t totalSize = headSize + movInstNum * CM_MOVE_INSTRUCTION_SIZE + m_binarySize + argSize;
2415
2416 totalSize += 4; // one dword for flag. the first bit is curbe on/off
2417 totalSize += 8; //sizeof( uint64_t ) for id
2418
2419 totalSize += 16; // static buffer indices
2420 totalSize += 12; // GT Pin buffer indices
2421
2422 ////////////////////////////////////////////////////////////////////////////
2423 // Calculate indirect data size (start)
2424 ////////////////////////////////////////////////////////////////////////////
2425 // Memory layout for indirect data:
2426 // Indirect Data Size -------------------- 2 bytes (must present)
2427 // Below area is present only if above value is not ZERO
2428 // Indirect Data Buffer ------------------ Size indicated above
2429 totalSize += sizeof(uint16_t); //field for indirect data size
2430 if(m_usKernelPayloadDataSize)
2431 {
2432 totalSize += m_usKernelPayloadDataSize;
2433 }
2434 // Memory layout for indirect surface:
2435 // Indirect Surface Count ----------------- 2 bytes (must present)
2436 // Below are present only if the above value is not ZERO
2437 // Kind of Indirect Surface 0 ------------- 2 Bytes
2438 // Handle of Indirect Surface 0 ----------- 2 Bytes
2439 // Surface Index of Indirect Surface 0 ---- 2 Bytes
2440 // ..........
2441 // Kind of Indirect Surface n-1 ----------- 2 Bytes
2442 // Handle of Indirect Surface n-1---------- 2 Bytes
2443 // Surface Index of Indirect Surface n-1 -- 2 Bytes
2444 totalSize += sizeof(uint16_t); //field for indirect surface count
2445 if(m_usKernelPayloadSurfaceCount)
2446 {
2447 totalSize += m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO);
2448 }
2449
2450 totalKernelDataSize = totalSize;
2451
2452 return hr;
2453 }
2454
2455 //*-----------------------------------------------------------------------------
2456 //| Purpose: Create mov instructions
2457 //| instructions will be copied into DstMem
2458 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::CreateMovInstructions( uint32_t &movInstNum, uint8_t *&codeDst, CM_ARG* tempArgs, uint32_t numArgs)
{
    // Builds the mov-instruction prolog that repacks CURBE arguments at
    // kernel entry, plus an optional scoreboard-Y adjust instruction.
    //   movInstNum [out] number of instructions emitted (incl. the adjust).
    //   codeDst    [out] newly allocated buffer with the instruction bytes;
    //                    left untouched when no instruction is needed.
    //   tempArgs   [in]  flattened argument list (surface arrays broken down).
    //   numArgs    [in]  number of entries in tempArgs.
    //Create Mov Instruction
    CmDynamicArray movInsts( numArgs );
    // Instruction spacing depends on the render core generation;
    // CM_NOT_IMPLEMENTED means the constructor keeps its default config.
    uint32_t renderGen = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState->platform.eRenderCoreFamily;
    CM_RETURN_CODE ret = m_movInstConstructor->SetInstDistanceConfig(movInsts.GetMaxSize(), renderGen);
    if (ret != CM_SUCCESS && ret != CM_NOT_IMPLEMENTED)
    {
        return ret;
    }

    movInstNum = 0;

    //Note: if no thread arg and no per kernel arg, no need for move instructions at all.
    if( m_curbeEnabled && (m_perThreadArgExists || m_perKernelArgExists))
    {
        if( ( m_argCount > 0 ) && ( m_threadCount > 1) )
        {
            PCM_ARG* sortedArgs = MOS_NewArray(PCM_ARG,numArgs);
            if( !sortedArgs )
            {
                CM_ASSERTMESSAGE("Error: Out of system memory.");
                return CM_OUT_OF_HOST_MEMORY;
            }
            for( uint32_t j = 0; j < numArgs; j++ )
            {
                sortedArgs[ j ] = tempArgs + j;
            }
            // sort args into sortedArgs according to unitOffsetInPayload
            QuickSort( sortedArgs, 0, numArgs - 1 );

            // record compiler generated offset, used as move dst later
            uint16_t *unitOffsetInPayloadSorted = MOS_NewArray(uint16_t, numArgs);
            if( !unitOffsetInPayloadSorted )
            {
                CM_ASSERTMESSAGE("Error: Out of system memory.");
                MosSafeDeleteArray(sortedArgs);
                return CM_OUT_OF_HOST_MEMORY;
            }
            for( uint32_t j = 0; j < numArgs; j++ )
            {
                unitOffsetInPayloadSorted[j] = sortedArgs[j]->unitOffsetInPayload;
            }

            // Pass 1: repack per-kernel args (unitCount == 1) tightly after
            // R0 (payload offset 32); args that follow the first per-thread
            // arg get fresh dword-aligned offsets.
            uint16_t kernelArgEnd = 32;
            bool beforeFirstThreadArg = true;
            for( uint32_t j = 0; j < numArgs; j++ )
            {
                if( sortedArgs[j]->unitCount == 1 )
                    // consider m_threadCount = 1 case later, where all args are treated as per thread arg
                {
                    if( beforeFirstThreadArg )
                    {
                        kernelArgEnd = sortedArgs[j]->unitOffsetInPayload + sortedArgs[j]->unitSize;
                    }
                    else
                    {
                        DW_ALIGNMENT( kernelArgEnd ); // necessary ?
                        sortedArgs[j]->unitOffsetInPayload = kernelArgEnd;
                        kernelArgEnd += sortedArgs[j]->unitSize;
                    }
                }
                else // per thread
                {
                    if( beforeFirstThreadArg )
                    {
                        beforeFirstThreadArg = false;
                    }
                }
            }

            GRF_ALIGNMENT(kernelArgEnd); // offset of thread arg start related to R0
            uint32_t threadArgStart = kernelArgEnd;

            // Pass 2: place every per-thread arg after the kernel args,
            // each at a dword-aligned offset.
            for (uint32_t j = 0; j < numArgs; j++)
            {
                if (sortedArgs[j]->unitCount > 1) // per thread
                {
                    sortedArgs[j]->unitOffsetInPayload = (uint16_t)threadArgStart;
                    threadArgStart += sortedArgs[j]->unitSize;
                    DW_ALIGNMENT(threadArgStart);
                }
            }

            // Moves are only needed if repacking actually changed an offset.
            bool needMovInstructions = false;
            for( uint32_t j = 0; j < numArgs; j++ )
            {
                if ( unitOffsetInPayloadSorted[j] != sortedArgs[j]->unitOffsetInPayload )
                {
                    needMovInstructions = true;
                    break;
                }
            }

            if (needMovInstructions)
            {
                // Add move
                GRF_ALIGNMENT(threadArgStart);
                uint32_t threadArgEnd = threadArgStart;
                uint32_t size = threadArgEnd - 32;
                CM_ASSERT((size % 32) == 0);

                // move all arguments starting from R1 (32) through threadArgEnd to R64 (R0 reserved for media dispatch)
                uint32_t nextIndex = 0;
                nextIndex += m_movInstConstructor->ConstructObjMovs(R64_OFFSET, 32, size, movInsts, nextIndex, true, m_blhwDebugEnable);

                // Then move each arg from its staging slot in R64+ back to
                // the offset the compiler originally assigned it.
                beforeFirstThreadArg = true;
                for (uint32_t j = 0; j < numArgs; j++)
                {
                    if (sortedArgs[j]->unitCount == 1)
                        // consider m_threadCount = 1 case later, where all args are treated as per thread arg
                    {
                        if (beforeFirstThreadArg == false)
                        {
                            // add move inst to move from sortedArgs[j]->unitOffsetInPayload + R64 to unitOffsetInPayloadSorted[j]
                            nextIndex += m_movInstConstructor->ConstructObjMovs(unitOffsetInPayloadSorted[j],
                                R64_OFFSET + sortedArgs[j]->unitOffsetInPayload - 32,
                                sortedArgs[j]->unitSize, movInsts, nextIndex, true, m_blhwDebugEnable);
                        }
                    }
                    else // per thread
                    {
                        if (beforeFirstThreadArg)
                        {
                            beforeFirstThreadArg = false;
                        }

                        // add move inst to move from sortedArgs[j]->unitOffsetInPayload + R64 to unitOffsetInPayloadSorted[j]
                        nextIndex += m_movInstConstructor->ConstructObjMovs(unitOffsetInPayloadSorted[j],
                            R64_OFFSET + sortedArgs[j]->unitOffsetInPayload - CM_PAYLOAD_OFFSET,
                            sortedArgs[j]->unitSize, movInsts, nextIndex, true, m_blhwDebugEnable);
                    }
                }

                movInstNum = nextIndex;
            }

            MosSafeDeleteArray(sortedArgs);
            MosSafeDeleteArray(unitOffsetInPayloadSorted);
        }
    }// End of if( m_curbeEnabled && m_ThreadArgExists)

    // Optional extra instruction: adjust the scoreboard Y offset (BDW).
    uint32_t addInstDW[4];
    MOS_ZeroMemory(addInstDW, CM_MOVE_INSTRUCTION_SIZE);
    uint32_t addInstNum =0;

    if(m_threadSpace && m_adjustScoreboardY)
    {
        addInstNum = 1;

        addInstDW[0] = CM_BDW_ADJUST_Y_SCOREBOARD_DW0;
        addInstDW[1] = CM_BDW_ADJUST_Y_SCOREBOARD_DW1;
        addInstDW[2] = CM_BDW_ADJUST_Y_SCOREBOARD_DW2;

        // constant word needs high 16 bits to be same as low 16 bits
        uint16_t tmp = - (int32_t)(m_adjustScoreboardY);
        addInstDW[3] = (tmp << 16) + tmp;

    }

    // Allocate the destination buffer only when something will be emitted.
    if (movInstNum || addInstNum)
    {
        codeDst = MOS_NewArray(uint8_t, ((movInstNum + addInstNum) * CM_MOVE_INSTRUCTION_SIZE));
        if (!codeDst)
        {
            return CM_OUT_OF_HOST_MEMORY;
        }
    }

    // Serialize each constructed mov into codeDst, freeing it as we go.
    for( uint32_t j = 0; j < movInstNum; j ++ )
    {
        MovInst_RT* movInst = (MovInst_RT*)movInsts.GetElement( j );
        if (!movInst)
        {
            CM_ASSERTMESSAGE("Error: Invalid move instructions.");
            MosSafeDeleteArray(codeDst);
            return CM_FAILURE;
        }
        if (j != 0)
        {
            // only the first instruction keeps its debug marking
            movInst->ClearDebug();
        }
        CmSafeMemCopy(codeDst + j * CM_MOVE_INSTRUCTION_SIZE, movInst->GetBinary(), CM_MOVE_INSTRUCTION_SIZE);
        CmSafeDelete(movInst); // delete each element in movInsts
    }
    movInsts.Delete();

    // Append the scoreboard-Y adjust instruction after the movs, if any.
    if(addInstNum != 0)
    {
        CmSafeMemCopy(codeDst + movInstNum * CM_MOVE_INSTRUCTION_SIZE, addInstDW, CM_MOVE_INSTRUCTION_SIZE);

        movInstNum += addInstNum; // take add Y instruction into consideration
    }

    return CM_SUCCESS;
}
2655
CreateKernelArgDataGroup(uint8_t * & data,uint32_t value)2656 int32_t CmKernelRT::CreateKernelArgDataGroup(
2657 uint8_t *&data,
2658 uint32_t value)
2659 {
2660 if (data == nullptr)
2661 {
2662 data = MOS_NewArray(uint8_t, sizeof(uint32_t));
2663 if(!data)
2664 {
2665 return CM_OUT_OF_HOST_MEMORY;
2666 }
2667 }
2668 *(uint32_t *)data = value;
2669 return CM_SUCCESS;
2670 }
2671
CreateKernelImplicitArgDataGroup(uint8_t * & data,uint32_t size)2672 int32_t CmKernelRT::CreateKernelImplicitArgDataGroup(
2673 uint8_t *&data,
2674 uint32_t size)
2675 {
2676 data = MOS_NewArray(uint8_t, (size * sizeof(uint32_t)));
2677 if (!data)
2678 {
2679 return CM_OUT_OF_HOST_MEMORY;
2680 }
2681 *(uint32_t *)data = 0;
2682 return CM_SUCCESS;
2683 }
2684
2685 //*-----------------------------------------------------------------------------
//| Purpose:    Create the per-thread argument data for one kernel argument
//|             (reordered into board order when a dependency pattern is set)
2688 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::CreateThreadArgData(
    PCM_HAL_KERNEL_ARG_PARAM kernelArg,      // [in/out] HAL arg whose firstValue buffer is (re)filled
    uint32_t threadArgIndex,                 // [in] index into cmArgs of the argument to copy
    CmThreadSpaceRT* threadSpace,            // [in] optional thread space; drives board-order reordering
    CM_ARG* cmArgs )                         // [in] runtime-side argument array (source of the data)
{
    // Copies one argument's data into kernelArg->firstValue. For per-thread
    // args with a dependency pattern, values are reordered into board
    // (wavefront) order; otherwise they are copied through unchanged.
    int32_t hr = CM_SUCCESS;
    uint32_t threadArgCount = cmArgs[ threadArgIndex].unitCount;
    uint32_t threadArgSize = cmArgs[ threadArgIndex ].unitSize;

    // NOTE(review): this tests cmArgs[0].unitKind, not
    // cmArgs[threadArgIndex].unitKind — correct only if a VME arg is always
    // at index 0; confirm, otherwise VME reallocation is missed here.
    if (CHECK_SURFACE_TYPE(cmArgs->unitKind, ARG_KIND_SURFACE_VME))
    {
        // reallocate the memory since the number of surfaces in a vme surface could vary
        MosSafeDeleteArray(kernelArg->firstValue);
    }

    if( kernelArg->firstValue == nullptr)
    {
        // if firstValue = nullptr, then create a new one; otherwise, update the existing one
        kernelArg->firstValue = MOS_NewArray(uint8_t, (cmArgs[threadArgIndex].unitCount * cmArgs[threadArgIndex].unitSize));
        if( !kernelArg->firstValue )
        {
            hr = CM_OUT_OF_HOST_MEMORY;
            goto finish;
        }
    }

    if(kernelArg->unitCount == 1 ) // kernel arg
    {
        // Per-kernel arg: a single flat copy is sufficient (skip if the
        // runtime never received a value for it).
        if (cmArgs[threadArgIndex].value)
        {
            CmSafeMemCopy(kernelArg->firstValue, cmArgs[threadArgIndex].value, threadArgCount * threadArgSize);
        }
        goto finish;
    }

    if( threadSpace != nullptr )
    {
        CM_DEPENDENCY_PATTERN dependencyPatternType = CM_NONE_DEPENDENCY;
        threadSpace->GetDependencyPatternType(dependencyPatternType);

        if ((m_threadSpaceAssociated == true) && (dependencyPatternType != CM_NONE_DEPENDENCY))
        {
            // Dependency present: gather values in board order so the
            // dispatch order and the argument order line up.
            CM_THREAD_SPACE_UNIT *threadSpaceUnit = nullptr;
            threadSpace->GetThreadSpaceUnit(threadSpaceUnit);

            uint32_t *boardOrder = nullptr;
            threadSpace->GetBoardOrder(boardOrder);

            for (uint32_t index = 0; index < threadArgCount; index++)
            {
                // boardOrder maps dispatch position -> thread-space unit;
                // that unit's threadId selects the source element.
                uint32_t offset = threadSpaceUnit[boardOrder[index]].threadId;
                uint8_t *argSrc = (uint8_t*)cmArgs[threadArgIndex].value + offset * threadArgSize;
                uint8_t *argDst = kernelArg->firstValue + index * threadArgSize;
                CmSafeMemCopy(argDst, argSrc, threadArgSize);
            }
        }
        else
        {
            // No reordering needed: copy all per-thread values straight through.
            CmSafeMemCopy(kernelArg->firstValue, cmArgs[ threadArgIndex ].value, threadArgCount * threadArgSize);
        }
    }
    else
    {
        // No thread space at all: plain flat copy.
        CmSafeMemCopy(kernelArg->firstValue, cmArgs[ threadArgIndex ].value, threadArgCount * threadArgSize);
    }

finish:
    return hr;
}
2759
2760 //*-----------------------------------------------------------------------------
//| Purpose:    Sort thread space for scoreboarding
2762 //*-----------------------------------------------------------------------------
SortThreadSpace(CmThreadSpaceRT * threadSpace)2763 int32_t CmKernelRT::SortThreadSpace( CmThreadSpaceRT* threadSpace )
2764 {
2765 int32_t hr = CM_SUCCESS;
2766 CM_DEPENDENCY_PATTERN dependencyPatternType = CM_NONE_DEPENDENCY;
2767
2768 CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpace);
2769
2770 threadSpace->GetDependencyPatternType(dependencyPatternType);
2771
2772 if(!threadSpace->IsThreadAssociated())
2773 {//Skip Sort if it is media walker
2774 return CM_SUCCESS;
2775 }
2776
2777 if (threadSpace->CheckDependencyVectorsSet())
2778 {
2779 threadSpace->WavefrontDependencyVectors();
2780 }
2781 else
2782 {
2783 switch (dependencyPatternType)
2784 {
2785 case CM_WAVEFRONT:
2786 threadSpace->Wavefront45Sequence();
2787 break;
2788
2789 case CM_WAVEFRONT26:
2790 threadSpace->Wavefront26Sequence();
2791 break;
2792
2793 case CM_WAVEFRONT26Z:
2794 threadSpace->Wavefront26ZSequence();
2795 break;
2796
2797 case CM_WAVEFRONT26ZI:
2798 CM_26ZI_DISPATCH_PATTERN dispatchPattern;
2799 threadSpace->Get26ZIDispatchPattern(dispatchPattern);
2800 switch (dispatchPattern)
2801 {
2802 case VVERTICAL_HVERTICAL_26:
2803 threadSpace->Wavefront26ZISeqVVHV26();
2804 break;
2805 case VVERTICAL_HHORIZONTAL_26:
2806 threadSpace->Wavefront26ZISeqVVHH26();
2807 break;
2808 case VVERTICAL26_HHORIZONTAL26:
2809 threadSpace->Wavefront26ZISeqVV26HH26();
2810 break;
2811 case VVERTICAL1X26_HHORIZONTAL1X26:
2812 threadSpace->Wavefront26ZISeqVV1x26HH1x26();
2813 break;
2814 default:
2815 threadSpace->Wavefront26ZISeqVVHV26();
2816 break;
2817 }
2818 break;
2819
2820 case CM_HORIZONTAL_WAVE:
2821 threadSpace->HorizentalSequence();
2822 break;
2823
2824 case CM_VERTICAL_WAVE:
2825 threadSpace->VerticalSequence();
2826 break;
2827
2828 case CM_NONE_DEPENDENCY:
2829 case CM_WAVEFRONT26X:
2830 case CM_WAVEFRONT26ZIG:
2831 break;
2832
2833 default:
2834 CM_ASSERTMESSAGE("Error: Invalid thread dependency type.");
2835 hr = CM_FAILURE;
2836 break;
2837 }
2838 }
2839
2840 finish:
2841 return hr;
2842 }
2843
2844 //*-----------------------------------------------------------------------------
2845 //| Purpose: Create temp args array with surface array broken down
//|             (each element of a surface array becomes its own argument)
2847 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::CreateTempArgs(
    uint32_t numArgs,
    CM_ARG* &tempArgs)
{
    // Expands the kernel's argument list into a flat array of numArgs CM_ARG
    // entries: surface-array, VME-array and sampler-array arguments become one
    // CM_ARG per element, everything else is copied through unchanged.
    // numArgs must come from GetArgCountPlusSurfArray(); tempArgs must be
    // passed in as nullptr and is allocated here (caller owns it on success).
    int32_t hr = CM_SUCCESS;
    int32_t numSurfaces = 0;
    int32_t increasedArgs = 0;  // running count of extra slots produced by expansion

    if( numArgs < m_argCount || tempArgs != nullptr )
    {
        CM_ASSERTMESSAGE("Error: Invalid arg number or arg value.");
        hr = CM_FAILURE;
        goto finish;
    }

    tempArgs = MOS_NewArray(CM_ARG, numArgs);
    CM_CHK_NULL_GOTOFINISH(tempArgs, CM_OUT_OF_HOST_MEMORY);
    CmSafeMemSet(tempArgs, 0, numArgs* sizeof(CM_ARG) );

    for( uint32_t j = 0; j < m_argCount; j++ )
    {
        if ( CHECK_SURFACE_TYPE( m_args[ j ].unitKind, // first time
                                ARG_KIND_SURFACE,
                                ARG_KIND_SURFACE_1D,
                                ARG_KIND_SURFACE_2D,
                                ARG_KIND_SURFACE_2D_UP,
                                ARG_KIND_SURFACE_SAMPLER,
                                ARG_KIND_SURFACE2DUP_SAMPLER,
                                ARG_KIND_SURFACE_3D,
                                ARG_KIND_SURFACE_SAMPLER8X8_AVS,
                                ARG_KIND_SURFACE_SAMPLER8X8_VA,
                                ARG_KIND_SURFACE_2D_SCOREBOARD,
                                ARG_KIND_STATE_BUFFER ) )
        {
            // Each 4-byte entry of the argument is one surface index.
            numSurfaces = m_args[j].unitSize/sizeof(int);

            if (numSurfaces > 1)
            {
                if (m_args[j].unitCount == 1)
                { //Kernel arg: split the surface array into per-surface args
                  // whose value pointers alias the original m_args[j].value.
                    for (int32_t k = 0; k < numSurfaces; k++)
                    {
                        tempArgs[j + increasedArgs + k] = m_args[j];
                        tempArgs[j + increasedArgs + k].unitSize = sizeof(int32_t);
                        tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int32_t);
                        tempArgs[j + increasedArgs + k].value = (uint8_t *)((uint32_t *)m_args[j].value + k);
                        tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
                        tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;
                        //For each surface kind and custom value in surface array
                        if (!m_args[j].surfIndex[k])
                        {
                            //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
                            //This is for special usage if there is empty element in surface array.
                            tempArgs[j + increasedArgs + k].unitKind = CM_ARGUMENT_SURFACE2D;
                            continue;
                        }
                        tempArgs[j + increasedArgs + k].unitKind = m_args[j].surfArrayArg[k].argKindForArray;
                        tempArgs[j + increasedArgs + k].nCustomValue = m_args[j].surfArrayArg[k].addressModeForArray;
                    }
                }
                else
                {
                    // Thread arg: gather the k-th surface of every thread into a
                    // freshly allocated per-surface value buffer (owned by tempArgs).
                    uint32_t *surfaces = (uint32_t *)MOS_NewArray(uint8_t, ((sizeof(int32_t) * m_args[j].unitCount)));
                    CM_CHK_NULL_GOTOFINISH(surfaces, CM_OUT_OF_HOST_MEMORY);
                    for (int32_t k = 0; k < numSurfaces; k++)
                    {
                        tempArgs[j + increasedArgs + k] = m_args[j];
                        tempArgs[j + increasedArgs + k].unitSize = sizeof(int32_t);
                        tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int32_t);
                        tempArgs[j + increasedArgs + k].value = MOS_NewArray(uint8_t, ((sizeof(int32_t) * m_args[j].unitCount)));
                        if(tempArgs[j + increasedArgs + k].value == nullptr)
                        {
                            CM_ASSERTMESSAGE("Error: Out of system memory.");
                            hr = CM_OUT_OF_HOST_MEMORY;
                            MosSafeDeleteArray(surfaces);
                            goto finish;
                        }
                        // Source layout is [thread][surface]; pick column k.
                        for (uint32_t s = 0; s < m_args[j].unitCount; s++)
                        {
                            surfaces[s] = *(uint32_t *)((uint32_t *)m_args[j].value + k + numSurfaces * s);
                        }
                        CmSafeMemCopy(tempArgs[j + increasedArgs + k].value, surfaces, sizeof(int32_t) * m_args[j].unitCount);
                        tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
                        // (uint16_t)-1 marks the offset as synthesized (thread-arg expansion).
                        tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = (uint16_t)-1;
                    }
                    MosSafeDeleteArray(surfaces);
                }
                increasedArgs += numSurfaces - 1;
            }
            else
            {
                // Single surface: plain copy, value pointer still aliases m_args.
                tempArgs[j + increasedArgs] = m_args[j];
            }
        }
        else if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
        {
            numSurfaces = m_args[ j ].unitVmeArraySize;
            if(numSurfaces == 1)
            { // single vme surface
                tempArgs[j + increasedArgs] = m_args[j];
            }
            else
            { // multiple vme surfaces in surface array
                if (m_args[j].unitCount == 1) { //Kernel arg
                    uint32_t vmeSurfOffset = 0;

                    // VME elements are variable-sized; walk the packed buffer
                    // and emit one arg per element, aliasing m_args[j].value.
                    for (int32_t k = 0; k < numSurfaces; k++)
                    {
                        uint16_t vmeSize = (uint16_t)getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[j].value + vmeSurfOffset));

                        tempArgs[j + increasedArgs + k] = m_args[j];
                        tempArgs[j + increasedArgs + k].unitSize = vmeSize;
                        tempArgs[j + increasedArgs + k].unitSizeOrig = vmeSize;
                        tempArgs[j + increasedArgs + k].value = (uint8_t *)(m_args[j].value + vmeSurfOffset);
                        tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + k*4;
                        tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;

                        vmeSurfOffset += vmeSize;
                    }
                }
                // NOTE(review): a multi-element VME arg with unitCount > 1
                // (thread arg) falls through without filling tempArgs — confirm
                // this combination is rejected earlier in the pipeline.
            }
            increasedArgs += numSurfaces - 1;
        }
        else if (m_args[j].unitKind == ARG_KIND_SAMPLER)
        {
            unsigned int numSamplers = m_args[j].unitSize / sizeof(int);

            if (numSamplers > 1)
            {
                if (m_args[j].unitCount == 1)
                {
                    //Kernel arg: one CM_ARGUMENT_SAMPLER per array element.
                    for (unsigned int k = 0; k < numSamplers; k++)
                    {
                        tempArgs[j + increasedArgs + k] = m_args[j];
                        tempArgs[j + increasedArgs + k].unitSize = sizeof(int);
                        tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int);
                        tempArgs[j + increasedArgs + k].value = (unsigned char *)((unsigned int *)m_args[j].value + k);
                        tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
                        tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;
                        tempArgs[j + increasedArgs + k].unitKind = CM_ARGUMENT_SAMPLER;
                    }
                }
                else
                {
                    // Use sampler index array as thread arg.
                    // Not implemented yet.
                    // NOTE(review): early return leaks tempArgs (and any values
                    // allocated above) since the finish cleanup only runs for
                    // CM_OUT_OF_HOST_MEMORY — confirm callers treat this as fatal.
                    return CM_NOT_IMPLEMENTED;
                }
                increasedArgs += numSamplers - 1;
            }
            else
            {
                tempArgs[j + increasedArgs] = m_args[j];
            }
        }
        else
        {
            // Scalar / non-expandable argument: straight copy.
            tempArgs[j + increasedArgs] = m_args[j];
        }
    }

finish:
    if(hr == CM_OUT_OF_HOST_MEMORY)
    {
        if(tempArgs)
        {
            // NOTE(review): this unwind frees every tempArgs[j].value, but
            // several expansion paths above store pointers that alias
            // m_args[j].value rather than fresh allocations — verify that
            // MosSafeDeleteArray on such aliased pointers cannot be reached
            // here (otherwise DestroyArgs would later double-free them).
            for (uint32_t j = 0; j < numArgs; j++)
            {
                MosSafeDeleteArray(tempArgs[j].value);
            }
        }
        MosSafeDeleteArray( tempArgs );
    }
    return hr;
}
3024
3025 //*-----------------------------------------------------------------------------
3026 //| Purpose: Get the number of args includes the num of surfaces in surface array
3027 //*-----------------------------------------------------------------------------
GetArgCountPlusSurfArray(uint32_t & argSize,uint32_t & argCountPlus)3028 int32_t CmKernelRT::GetArgCountPlusSurfArray(uint32_t &argSize, uint32_t & argCountPlus)
3029 {
3030 argCountPlus = m_argCount;
3031 argSize = 0;
3032
3033 if(m_usKernelPayloadDataSize)
3034 { // if payload data exists, the number of args is zero
3035 argCountPlus = 0;
3036 argSize = 0;
3037 return CM_SUCCESS;
3038 }
3039
3040 if( m_argCount != 0 ) //Need pass the arg either by arguments area, or by indirect payload area
3041 {
3042 //Sanity check for argument setting
3043 if((m_perThreadArgExists == false) && (m_perKernelArgExists == false) && (m_usKernelPayloadDataSize == 0))
3044 {
3045 if ( m_stateBufferBounded == CM_STATE_BUFFER_NONE )
3046 {
3047 CM_ASSERTMESSAGE( "Error: Kernel arguments are not set." );
3048 return CM_NOT_SET_KERNEL_ARGUMENT;
3049 }
3050 }
3051
3052 if(m_perThreadArgExists || m_perKernelArgExists)
3053 {
3054 unsigned int extraArgs = 0;
3055
3056 for( uint32_t j = 0; j < m_argCount; j ++ )
3057 {
3058 //Sanity checking for every argument setting
3059 if ( !m_args[j].isSet )
3060 {
3061 CM_ASSERTMESSAGE("Error: One Kernel argument is not set.");
3062 return CM_KERNEL_ARG_SETTING_FAILED;
3063 }
3064
3065 argSize += m_args[j].unitSize * m_args[j].unitCount;
3066
3067 if ( CHECK_SURFACE_TYPE( m_args[ j ].unitKind,
3068 ARG_KIND_SURFACE,
3069 ARG_KIND_SURFACE_1D,
3070 ARG_KIND_SURFACE_2D,
3071 ARG_KIND_SURFACE_2D_UP,
3072 ARG_KIND_SURFACE_SAMPLER,
3073 ARG_KIND_SURFACE2DUP_SAMPLER,
3074 ARG_KIND_SURFACE_3D,
3075 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
3076 ARG_KIND_SURFACE_SAMPLER8X8_VA,
3077 ARG_KIND_SURFACE_2D_SCOREBOARD,
3078 ARG_KIND_STATE_BUFFER ) )
3079 {
3080 int numSurfaces = m_args[j].unitSize/sizeof(int);
3081 if (numSurfaces > 1) {
3082 extraArgs += numSurfaces - 1;
3083 }
3084 }
3085 else if (CHECK_SURFACE_TYPE(m_args[j].unitKind, ARG_KIND_SURFACE_VME))
3086 {
3087 int numSurfaces = m_args[j].unitVmeArraySize;
3088 if (numSurfaces > 1) {
3089 extraArgs += numSurfaces - 1;
3090 }
3091 }
3092 else if (m_args[j].unitKind == ARG_KIND_SAMPLER)
3093 {
3094 int numSamplers = m_args[j].unitSize / sizeof(int);
3095 if (numSamplers > 1)
3096 {
3097 extraArgs += (numSamplers - 1);
3098 }
3099 }
3100 }
3101
3102 argCountPlus = m_argCount + extraArgs;
3103 }
3104 }
3105 return CM_SUCCESS;
3106 }
3107
3108 //*-----------------------------------------------------------------------------
3109 //| Purpose: Create Thread Space Param
3110 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::CreateThreadSpaceParam(
    PCM_HAL_KERNEL_THREADSPACE_PARAM kernelThreadSpaceParam,
    CmThreadSpaceRT* threadSpace )
{
    // Populates the HAL thread-space parameter block from a CmThreadSpaceRT:
    // dimensions, dependency pattern/vectors, walking parameters, per-thread
    // scoreboard coordinates (in board order), Wavefront26Z dispatch info,
    // group select, color count and the batch-buffer dirty status.
    // On CM_OUT_OF_HOST_MEMORY, arrays allocated here are released before return.
    int32_t hr = CM_SUCCESS;
    CM_HAL_DEPENDENCY* dependency = nullptr;
    uint32_t threadSpaceWidth = 0;
    uint32_t threadSpaceHeight =0;
    CM_THREAD_SPACE_UNIT *threadSpaceUnit = nullptr;
    CM_THREAD_SPACE_DIRTY_STATUS dirtyStatus = CM_THREAD_SPACE_CLEAN;

    if (kernelThreadSpaceParam == nullptr || threadSpace == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Pointer to CmKernelThreadSpaceParam or thread space is null.");
        hr = CM_NULL_POINTER;
        goto finish;
    }

    // Basic geometry (stored as 16-bit in the HAL structure).
    threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
    kernelThreadSpaceParam->threadSpaceWidth =  (uint16_t)threadSpaceWidth;
    kernelThreadSpaceParam->threadSpaceHeight = (uint16_t)threadSpaceHeight;

    threadSpace->GetDependencyPatternType(kernelThreadSpaceParam->patternType);
    threadSpace->GetWalkingPattern(kernelThreadSpaceParam->walkingPattern);
    threadSpace->GetDependency( dependency);

    if(dependency != nullptr)
    {
        CmSafeMemCopy(&kernelThreadSpaceParam->dependencyInfo, dependency, sizeof(CM_HAL_DEPENDENCY));
    }

    // Explicit media-walker walking parameters, if the user set any.
    if( threadSpace->CheckWalkingParametersSet( ) )
    {
        kernelThreadSpaceParam->walkingParamsValid = 1;
        CM_CHK_CMSTATUS_GOTOFINISH(threadSpace->GetWalkingParameters(kernelThreadSpaceParam->walkingParams));
    }
    else
    {
        kernelThreadSpaceParam->walkingParamsValid = 0;
    }

    // Explicit dependency vectors, if the user set any.
    if( threadSpace->CheckDependencyVectorsSet( ) )
    {
        kernelThreadSpaceParam->dependencyVectorsValid = 1;
        CM_CHK_CMSTATUS_GOTOFINISH(threadSpace->GetDependencyVectors(kernelThreadSpaceParam->dependencyVectors));
    }
    else
    {
        kernelThreadSpaceParam->dependencyVectorsValid = 0;
    }

    threadSpace->GetThreadSpaceUnit(threadSpaceUnit);

    if(threadSpaceUnit)
    {
        // Per-thread coordinates exist: copy them out in board (dispatch) order.
        kernelThreadSpaceParam->threadCoordinates = MOS_NewArray(CM_HAL_SCOREBOARD, (threadSpaceWidth * threadSpaceHeight));
        CM_CHK_NULL_GOTOFINISH(kernelThreadSpaceParam->threadCoordinates , CM_OUT_OF_HOST_MEMORY);
        CmSafeMemSet(kernelThreadSpaceParam->threadCoordinates, 0, threadSpaceHeight * threadSpaceWidth * sizeof(CM_HAL_SCOREBOARD));

        uint32_t *boardOrder = nullptr;
        threadSpace->GetBoardOrder(boardOrder);
        CM_CHK_NULL_GOTOFINISH_CMERROR(boardOrder);

        kernelThreadSpaceParam->reuseBBUpdateMask = 0;
        for(uint32_t i=0; i< threadSpaceWidth * threadSpaceHeight ; i++)
        {
            kernelThreadSpaceParam->threadCoordinates[i].x = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.x;
            kernelThreadSpaceParam->threadCoordinates[i].y = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.y;
            kernelThreadSpaceParam->threadCoordinates[i].mask = threadSpaceUnit[boardOrder[i]].dependencyMask;
            kernelThreadSpaceParam->threadCoordinates[i].resetMask= threadSpaceUnit[boardOrder[i]].reset;
            kernelThreadSpaceParam->threadCoordinates[i].color = threadSpaceUnit[boardOrder[i]].scoreboardColor;
            kernelThreadSpaceParam->threadCoordinates[i].sliceSelect = threadSpaceUnit[boardOrder[i]].sliceDestinationSelect;
            kernelThreadSpaceParam->threadCoordinates[i].subSliceSelect = threadSpaceUnit[boardOrder[i]].subSliceDestinationSelect;
            // Accumulate reset flags so the BB-reuse logic sees any reset at all.
            kernelThreadSpaceParam->reuseBBUpdateMask |= threadSpaceUnit[boardOrder[i]].reset;
        }

        // Wavefront26Z additionally needs the per-wave thread counts.
        if( kernelThreadSpaceParam->patternType == CM_WAVEFRONT26Z )
        {
            CM_HAL_WAVEFRONT26Z_DISPATCH_INFO dispatchInfo;
            threadSpace->GetWavefront26ZDispatchInfo(dispatchInfo);

            kernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
            kernelThreadSpaceParam->dispatchInfo.numThreadsInWave = MOS_NewArray(uint32_t, dispatchInfo.numWaves);
            CM_CHK_NULL_GOTOFINISH(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave, CM_OUT_OF_HOST_MEMORY);
            CmSafeMemCopy(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave,
                dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));

        }
    }

    //Get group select setting information
    threadSpace->GetMediaWalkerGroupSelect(kernelThreadSpaceParam->groupSelect);

    //Get color count
    threadSpace->GetColorCountMinusOne(kernelThreadSpaceParam->colorCountMinusOne);

    // Any dirty state forces the batch buffer to be rebuilt.
    dirtyStatus = threadSpace->GetDirtyStatus();
    switch (dirtyStatus)
    {
    case CM_THREAD_SPACE_CLEAN:
        kernelThreadSpaceParam->bbDirtyStatus = CM_HAL_BB_CLEAN;
        break;
    default:
        kernelThreadSpaceParam->bbDirtyStatus = CM_HAL_BB_DIRTY;
        break;
    }

finish:
    if( hr == CM_OUT_OF_HOST_MEMORY)
    {
        if( kernelThreadSpaceParam )
        {
            MosSafeDeleteArray(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave);
            MosSafeDeleteArray(kernelThreadSpaceParam->threadCoordinates);
        }
    }

    return hr;
}
3230
3231 //*-----------------------------------------------------------------------------
3232 //| Purpose: Delete the args array
3233 //*-----------------------------------------------------------------------------
DestroyArgs(void)3234 int32_t CmKernelRT::DestroyArgs( void )
3235 {
3236 for( uint32_t i =0 ; i < m_argCount; i ++ )
3237 {
3238 CM_ARG& arg = m_args[ i ];
3239 MosSafeDeleteArray( arg.value );
3240 MosSafeDeleteArray(arg.surfIndex);
3241 MosSafeDeleteArray(arg.surfArrayArg);
3242 arg.unitCount = 0;
3243 arg.unitSize = 0;
3244 arg.unitKind = 0;
3245 arg.unitOffsetInPayload = 0;
3246 arg.isDirty = true;
3247 arg.isSet = false;
3248 }
3249
3250 MosSafeDeleteArray( m_args );
3251
3252 m_threadSpaceAssociated = false;
3253 m_threadSpace = nullptr;
3254
3255 m_perThreadArgExists = false;
3256 m_perKernelArgExists = false;
3257
3258 m_sizeInCurbe = 0;
3259 m_curbeEnabled = true;
3260
3261 m_sizeInPayload = 0;
3262 m_adjustScoreboardY = 0;
3263
3264 ResetKernelSurfaces();
3265
3266 return CM_SUCCESS;
3267 }
3268
3269 //*-----------------------------------------------------------------------------
3270 // Calling reset makes it possible to change the per kernel or per thread
3271 // property of the argurments b/c it reset releases the memory for arguments
3272 //*-----------------------------------------------------------------------------
Reset(void)3273 int32_t CmKernelRT::Reset( void )
3274 {
3275 for( uint32_t i =0 ; i < m_argCount; i ++ )
3276 {
3277 CM_ARG& arg = m_args[ i ];
3278 MosSafeDeleteArray( arg.value );
3279 MosSafeDeleteArray( arg.surfIndex);
3280 MosSafeDeleteArray(arg.surfArrayArg);
3281 arg.value = nullptr;
3282 arg.unitCount = 0;
3283
3284 arg.unitSize = arg.unitSizeOrig;
3285 arg.unitKind = arg.unitKindOrig;
3286 arg.unitOffsetInPayload = arg.unitOffsetInPayloadOrig;
3287
3288 arg.isDirty = true;
3289 arg.isSet = false;
3290 arg.unitVmeArraySize = 0;
3291
3292 arg.isStatelessBuffer = false;
3293 arg.index = 0;
3294 }
3295
3296 m_threadCount = 0;
3297
3298 m_indexInTask = 0;
3299
3300 m_perThreadArgExists = false;
3301 m_perKernelArgExists = false;
3302
3303 m_sizeInCurbe = 0;
3304 m_curbeEnabled = true;
3305
3306 m_sizeInPayload = 0;
3307
3308 m_threadSpaceAssociated = false;
3309 m_threadSpace = nullptr;
3310 m_adjustScoreboardY = 0;
3311
3312 m_threadGroupSpace = nullptr;
3313
3314 MosSafeDeleteArray(m_kernelPayloadData);
3315 m_usKernelPayloadDataSize = 0;
3316
3317 if (m_usKernelPayloadSurfaceCount)
3318 {
3319 CmSafeMemSet(m_pKernelPayloadSurfaceArray, 0, m_usKernelPayloadSurfaceCount * sizeof(SurfaceIndex *));
3320 CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
3321 m_usKernelPayloadSurfaceCount = 0;
3322 }
3323
3324 ResetKernelSurfaces();
3325
3326 return CM_SUCCESS;
3327 }
3328
3329 //*-----------------------------------------------------------------------------
3330 //| Purpose: Get the pointer to arguments array
3331 //*-----------------------------------------------------------------------------
GetArgs(CM_ARG * & arg)3332 int32_t CmKernelRT::GetArgs( CM_ARG* & arg )
3333 {
3334 arg = m_args;
3335 return CM_SUCCESS;
3336 }
3337
3338 //*-----------------------------------------------------------------------------
3339 //| Purpose: Get the arguments' count
3340 //*-----------------------------------------------------------------------------
GetArgCount(uint32_t & argCount)3341 int32_t CmKernelRT::GetArgCount( uint32_t & argCount )
3342 {
3343 argCount = m_argCount;
3344 return CM_SUCCESS;
3345 }
3346
3347 //*-----------------------------------------------------------------------------
3348 //| Purpose: Get the value of member CurbeEnable
3349 //*-----------------------------------------------------------------------------
GetCurbeEnable(bool & b)3350 int32_t CmKernelRT::GetCurbeEnable( bool& b )
3351 {
3352 b = m_curbeEnabled;
3353 return CM_SUCCESS;
3354 }
3355
3356 //*-----------------------------------------------------------------------------
3357 //| Purpose: Set the CurbeEnable member
3358 //*-----------------------------------------------------------------------------
SetCurbeEnable(bool b)3359 int32_t CmKernelRT::SetCurbeEnable( bool b )
3360 {
3361 m_curbeEnabled = b;
3362 return CM_SUCCESS;
3363 }
3364
3365 //*-----------------------------------------------------------------------------
3366 //| Purpose: Get the kernel's size in Curbe
3367 //*-----------------------------------------------------------------------------
GetSizeInCurbe(uint32_t & size)3368 int32_t CmKernelRT::GetSizeInCurbe( uint32_t& size )
3369 {
3370 size = m_sizeInCurbe;
3371 return CM_SUCCESS;
3372 }
3373
3374 //*-----------------------------------------------------------------------------
3375 //| Purpose: Get the total size in payload of meida object or media walker
3376 //*-----------------------------------------------------------------------------
GetSizeInPayload(uint32_t & size)3377 int32_t CmKernelRT::GetSizeInPayload( uint32_t& size )
3378 {
3379 size = m_sizeInPayload;
3380 return CM_SUCCESS;
3381 }
3382
3383 //*-----------------------------------------------------------------------------
3384 //| Purpose: Get the pointer to CM device
3385 //*-----------------------------------------------------------------------------
GetCmDevice(CmDeviceRT * & device)3386 int32_t CmKernelRT::GetCmDevice(CmDeviceRT* &device)
3387 {
3388 device = m_device;
3389 return CM_SUCCESS;
3390 }
3391
GetCmProgram(CmProgramRT * & program)3392 int32_t CmKernelRT::GetCmProgram( CmProgramRT* & program )
3393 {
3394 program = m_program;
3395 return CM_SUCCESS;
3396 }
3397
CollectKernelSurface()3398 int32_t CmKernelRT::CollectKernelSurface()
3399 {
3400 m_vmeSurfaceCount = 0;
3401 m_maxSurfaceIndexAllocated = 0;
3402
3403 for( uint32_t j = 0; j < m_argCount; j ++ )
3404 {
3405 if ((m_args[ j ].unitKind == ARG_KIND_SURFACE ) || // first time
3406 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_1D ) ||
3407 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D ) ||
3408 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
3409 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
3410 ( m_args[ j ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
3411 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_3D ) ||
3412 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
3413 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
3414 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_VME ) ||
3415 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
3416 ( m_args[ j ].unitKind == ARG_KIND_STATE_BUFFER ) )
3417 {
3418 int numSurfaces;
3419 int numValidSurfaces = 0;
3420
3421 if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
3422 {
3423 numSurfaces = getSurfNumFromArgArraySize(m_args[j].unitSize, m_args[j].unitVmeArraySize);
3424 }
3425 else
3426 {
3427 numSurfaces = m_args[j].unitSize/sizeof(int);
3428 }
3429
3430 for (uint32_t k = 0; k < numSurfaces * m_args[j].unitCount; k ++)
3431 {
3432 uint16_t surfIndex = 0;
3433 if (m_args[j].surfIndex)
3434 {
3435 surfIndex = m_args[j].surfIndex[k];
3436 }
3437 if (surfIndex != 0 && surfIndex != CM_NULL_SURFACE)
3438 {
3439 m_surfaceArray[surfIndex] = true;
3440 numValidSurfaces ++;
3441 m_maxSurfaceIndexAllocated = Max(m_maxSurfaceIndexAllocated, surfIndex);
3442 }
3443 }
3444 if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
3445 {
3446 m_vmeSurfaceCount += numValidSurfaces;
3447 }
3448 }
3449
3450 if (m_args[ j ].isStatelessBuffer)
3451 {
3452 uint32_t surfIndex = m_args[j].index;
3453 m_surfaceArray[surfIndex] = true;
3454 }
3455 }
3456
3457 for( int32_t i=0; i < CM_GLOBAL_SURFACE_NUMBER; ++i )
3458 {
3459 if( m_globalSurfaces[i] != nullptr )
3460 {
3461 uint32_t surfIndex = m_globalCmIndex[i];
3462 m_surfaceArray[surfIndex] = true;
3463 }
3464 }
3465
3466 for (int32_t i = 0; i < m_usKernelPayloadSurfaceCount; i++)
3467 {
3468 if (m_pKernelPayloadSurfaceArray[i] != nullptr)
3469 {
3470 uint32_t surfIndex = m_pKernelPayloadSurfaceArray[i]->get_data();
3471 m_surfaceArray[surfIndex] = true;
3472 }
3473 }
3474
3475 return CM_SUCCESS;
3476 }
3477
IsKernelDataReusable(CmThreadSpaceRT * threadSpace)3478 int32_t CmKernelRT::IsKernelDataReusable( CmThreadSpaceRT* threadSpace)
3479 {
3480 if(threadSpace)
3481 {
3482 if(threadSpace->IsThreadAssociated() && (threadSpace->GetDirtyStatus()!= CM_THREAD_SPACE_CLEAN))
3483 {
3484 return false;
3485 }
3486 }
3487
3488 if(m_threadSpace)
3489 {
3490 if(m_threadSpace->GetDirtyStatus()!= CM_THREAD_SPACE_CLEAN)
3491 {
3492 return false;
3493 }
3494 }
3495
3496 if(m_dirty != CM_KERNEL_DATA_CLEAN)
3497 {
3498 return false;
3499 }
3500
3501 return true;
3502 }
3503
3504 //*-----------------------------------------------------------------------------
3505 //| Purpose: Prepare Kernel Data including thread args, kernel args
3506 //| Returns: Result of the operation.
3507 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::CreateKernelData(
    CmKernelData* & kernelData,         // out
    uint32_t& kernelDataSize,           // out
    const CmThreadSpaceRT* threadSpace )    // in
{
    // Produces the kernel data (thread args, kernel args) for a media-walker /
    // media-object dispatch driven by a 2D thread space. Reuses or updates
    // m_lastKernelData whenever the dirty state allows, otherwise builds a
    // fresh CmKernelData via CreateKernelDataInternal.
    int32_t hr = CM_SUCCESS;
    PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;

    if( (threadSpace != nullptr) && (m_threadSpace != nullptr) )
    {
        // per-kernel threadspace and per-task threadspace cannot be set at the same time
        return CM_INVALID_THREAD_SPACE;
    }

    if(m_lastKernelData == nullptr)
    {
        // First dispatch of this kernel: nothing to reuse.
        CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
        CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
        CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
    }
    else
    {
        if(IsKernelDataReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
        {
            // nothing changed; Reuse m_lastKernelData
            kernelData = m_lastKernelData;
            CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
            CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
            kernelDataSize = kernelData->GetKernelDataSize();

            if (m_threadSpace)
            {
                halKernelParam = kernelData->GetHalCmKernelData();
                CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
                // need to set to clean here because CmThreadSpaceParam.BBdirtyStatus is only set in CreateKernelDataInternal
                // flag used to re-use batch buffer, don't care if BB is busy if it is "clean"
                halKernelParam->kernelThreadSpaceParam.bbDirtyStatus = CM_HAL_BB_CLEAN;
            }
        }
        else
        {
            if(m_lastKernelData->IsInUse())
            { // Need to Create a new one , if the kernel data is in use
                CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
                CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
                CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
            }
            else if(threadSpace && threadSpace->IsThreadAssociated() && (threadSpace->GetDirtyStatus() != CM_THREAD_SPACE_CLEAN))
            { // if thread space is assocaited , don't support reuse
                CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
                CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
                CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
            }
            else if(m_dirty < CM_KERNEL_DATA_THREAD_COUNT_DIRTY || // Kernel arg or thread arg dirty
                (m_threadSpace && m_threadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DEPENDENCY_MASK_DIRTY))
            {
                // Only argument values / dependency mask changed: patch the
                // existing kernel data in place instead of rebuilding it.
                CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelData(m_lastKernelData,threadSpace));
                kernelData = m_lastKernelData;
                CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
                CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
                kernelDataSize = kernelData->GetKernelDataSize();

            }
            else
            {
                // Structural change (e.g. thread count): full rebuild.
                CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
                CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
                CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
            }
        }
    }

    // All dirty state is consumed once kernel data reflects it.
    CleanArgDirtyFlag();
    if(threadSpace)
    {
        threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
    }
    if (m_threadSpace)
    {
        m_threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
    }

finish:
    return hr;
}
3593
GetName()3594 char* CmKernelRT::GetName() { return (char*)m_kernelInfo->kernelName; }
3595
3596 //*-----------------------------------------------------------------------------
3597 //| Purpose: Create Kernel Data
3598 //| Returns: Result of the operation.
3599 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::CreateKernelData(
    CmKernelData* & kernelData,         // out
    uint32_t& kernelDataSize,           // out
    const CmThreadGroupSpace* threadGroupSpace )    // in
{
    // GPGPU-walker variant of CreateKernelData: builds or reuses kernel data
    // for a thread-group-space dispatch. The kernel's own thread group space
    // (if any) takes precedence over the per-task one passed in.
    int32_t hr = CM_SUCCESS;
    CmThreadGroupSpace* usedThreadGroupSpace = nullptr;

    //If kernel has associated TGS, we will use it, instead of per-task TGS
    if (m_threadGroupSpace)
    {
        usedThreadGroupSpace = m_threadGroupSpace;
    }
    else
    {
        usedThreadGroupSpace = const_cast<CmThreadGroupSpace*>(threadGroupSpace);
    }

    if(m_lastKernelData == nullptr)
    {
        // First dispatch of this kernel: nothing to reuse.
        CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, usedThreadGroupSpace));
        CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
        CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
    }
    else
    {
        if (!((m_dirty & CM_KERNEL_DATA_KERNEL_ARG_DIRTY) || (m_dirty & CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY)))
        {
            // nothing changed; Reuse m_lastKernelData
            kernelData = m_lastKernelData;
            CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
            CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
            kernelDataSize = kernelData->GetKernelDataSize();
        }
        else
        {
            if(m_lastKernelData->IsInUse())
            { // Need to Clone a new one
                CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, usedThreadGroupSpace));
                CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
                CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
            }
            else
            {
                // change happend -> Reuse m_lastKernelData but need to change its content accordingly
                CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelData(m_lastKernelData, usedThreadGroupSpace));
                kernelData = m_lastKernelData;
                CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
                CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
                kernelDataSize = kernelData->GetKernelDataSize();
            }
        }
    }

    // Dirty flags are consumed once the kernel data reflects them.
    CleanArgDirtyFlag();

finish:
    return hr;
}
3659
CleanArgDirtyFlag()3660 int32_t CmKernelRT::CleanArgDirtyFlag()
3661 {
3662
3663 for(uint32_t i =0 ; i< m_argCount; i++)
3664 {
3665 m_args[i].isDirty = false;
3666 }
3667
3668 if(m_threadSpace && m_threadSpace->GetDirtyStatus())
3669 {
3670 m_threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
3671 }
3672
3673 m_dirty = CM_KERNEL_DATA_CLEAN;
3674
3675 return CM_SUCCESS;
3676 }
3677
3678 //*-----------------------------------------------------------------------------
3679 //| Purpose: Update the global surface and gtpin surface info to kernel data
3680 //| Returns: Result of the operation.
3681 //*-----------------------------------------------------------------------------
UpdateKernelDataGlobalSurfaceInfo(PCM_HAL_KERNEL_PARAM halKernelParam)3682 int32_t CmKernelRT::UpdateKernelDataGlobalSurfaceInfo( PCM_HAL_KERNEL_PARAM halKernelParam )
3683 {
3684 int32_t hr = CM_SUCCESS;
3685
3686 //global surface
3687 for ( uint32_t j = 0; j < CM_GLOBAL_SURFACE_NUMBER; j++ )
3688 {
3689 if ( m_globalSurfaces[ j ] != nullptr )
3690 {
3691 halKernelParam->globalSurface[ j ] = m_globalSurfaces[ j ]->get_data();
3692 halKernelParam->globalSurfaceUsed = true;
3693 }
3694 else
3695 {
3696 halKernelParam->globalSurface[ j ] = CM_NULL_SURFACE;
3697 }
3698 }
3699
3700 for ( uint32_t j = CM_GLOBAL_SURFACE_NUMBER; j < CM_MAX_GLOBAL_SURFACE_NUMBER; j++ )
3701 {
3702 halKernelParam->globalSurface[ j ] = CM_NULL_SURFACE;
3703 }
3704 #if USE_EXTENSION_CODE
3705 UpdateKernelDataGTPinSurfaceInfo(halKernelParam);
3706 #endif
3707
3708 return hr;
3709 }
3710
3711 //*-----------------------------------------------------------------------------
3712 //| Purpose: Prepare Kernel Data including thread args, kernel args
3713 //| Returns: Result of the operation.
3714 //*-----------------------------------------------------------------------------
CreateKernelDataInternal(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadGroupSpace * threadGroupSpace)3715 int32_t CmKernelRT::CreateKernelDataInternal(
3716 CmKernelData* & kernelData, // out
3717 uint32_t& kernelDataSize, // out
3718 const CmThreadGroupSpace* threadGroupSpace) // in
3719 {
3720 PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;
3721 int32_t hr = CM_SUCCESS;
3722 uint32_t movInstNum = 0;
3723 uint32_t kernelCurbeSize = 0;
3724 uint32_t numArgs = 0;
3725 CM_ARG *tempArgs = nullptr;
3726 uint32_t argSize = 0;
3727 uint32_t surfNum = 0; //Pass needed BT entry numbers to HAL CM
3728 CmKernelRT *cmKernel = nullptr;
3729 uint32_t minKernelPlayloadOffset = 0;
3730 bool adjustLocalIdPayloadOffset = false;
3731
3732 CM_CHK_CMSTATUS_GOTOFINISH(CmKernelData::Create(this, kernelData));
3733 halKernelParam = kernelData->GetHalCmKernelData();
3734 CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
3735
3736 //Get Num of args with surface array
3737 CM_CHK_CMSTATUS_GOTOFINISH(GetArgCountPlusSurfArray(argSize, numArgs));
3738
3739 //Create Temp args
3740 CM_CHK_CMSTATUS_GOTOFINISH(CreateTempArgs(numArgs, tempArgs));
3741
3742 //Create move instructions
3743 CM_CHK_CMSTATUS_GOTOFINISH(CreateMovInstructions(movInstNum, halKernelParam->movInsData, tempArgs, numArgs));
3744 CM_CHK_CMSTATUS_GOTOFINISH(CalcKernelDataSize(movInstNum, numArgs, argSize, kernelDataSize));
3745 CM_CHK_CMSTATUS_GOTOFINISH(kernelData->SetKernelDataSize(kernelDataSize));
3746
3747 halKernelParam->clonedKernelParam.isClonedKernel = m_isClonedKernel;
3748 halKernelParam->clonedKernelParam.kernelID = m_cloneKernelID;
3749 halKernelParam->clonedKernelParam.hasClones = m_hasClones;
3750
3751 halKernelParam->kernelId = m_id++;
3752 if ((m_program->m_cisaMajorVersion >= 3 && m_program->m_cisaMinorVersion >= 3))
3753 halKernelParam->numArgs = numArgs;
3754 else
3755 halKernelParam->numArgs = numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM;
3756 halKernelParam->numThreads = m_threadCount;
3757 halKernelParam->kernelBinarySize = m_binarySize + movInstNum * CM_MOVE_INSTRUCTION_SIZE;
3758 halKernelParam->kernelDataSize = kernelDataSize;
3759 halKernelParam->movInsDataSize = movInstNum * CM_MOVE_INSTRUCTION_SIZE;
3760 halKernelParam->kernelDebugEnabled = m_blhwDebugEnable;
3761
3762 halKernelParam->cmFlags = m_curbeEnabled ? CM_FLAG_CURBE_ENABLED : 0;
3763 halKernelParam->cmFlags |= m_nonstallingScoreboardEnabled ? CM_FLAG_NONSTALLING_SCOREBOARD_ENABLED : 0;
3764
3765 halKernelParam->kernelBinary = (uint8_t*)m_binary;
3766
3767 CM_CHK_CMSTATUS_GOTOFINISH(kernelData->GetCmKernel(cmKernel));
3768 if (cmKernel == nullptr)
3769 {
3770 return CM_NULL_POINTER;
3771 }
3772 MOS_SecureStrcpy(halKernelParam->kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE, cmKernel->GetName());
3773
3774 uint32_t thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth;
3775 threadGroupSpace->GetThreadGroupSpaceSize(thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth);
3776
3777 for (uint32_t i = 0; i < numArgs; i++)
3778 {
3779 // get the min kernel payload offset
3780 if ((halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE) && IsKernelArg(tempArgs[i]))
3781 {
3782 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3783 {
3784 if (minKernelPlayloadOffset == 0 || minKernelPlayloadOffset > tempArgs[i].unitOffsetInPayload)
3785 {
3786 minKernelPlayloadOffset = tempArgs[i].unitOffsetInPayload;
3787 }
3788 }
3789 else
3790 {
3791 if ((minKernelPlayloadOffset == 0 || minKernelPlayloadOffset > tempArgs[i].unitOffsetInPayload) && (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID))
3792 {
3793 minKernelPlayloadOffset = tempArgs[i].unitOffsetInPayload;
3794 }
3795 }
3796 }
3797 }
3798
3799 for (uint32_t i = 0; i < numArgs; i++)
3800 {
3801 halKernelParam->argParams[i].unitCount = tempArgs[i].unitCount;
3802 halKernelParam->argParams[i].kind = (CM_HAL_KERNEL_ARG_KIND)(tempArgs[i].unitKind);
3803 halKernelParam->argParams[i].unitSize = tempArgs[i].unitSize;
3804 halKernelParam->argParams[i].payloadOffset = tempArgs[i].unitOffsetInPayload;
3805 halKernelParam->argParams[i].perThread = false;
3806 halKernelParam->argParams[i].nCustomValue = tempArgs[i].nCustomValue;
3807 halKernelParam->argParams[i].aliasIndex = tempArgs[i].aliasIndex;
3808 halKernelParam->argParams[i].aliasCreated = tempArgs[i].aliasCreated;
3809 halKernelParam->argParams[i].isNull = tempArgs[i].isNull;
3810
3811 if (tempArgs[i].unitKind == CM_ARGUMENT_IMPLICT_LOCALSIZE) {
3812 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3813 *(uint32_t *)halKernelParam->argParams[i].firstValue = thrdSpaceWidth;
3814 *(uint32_t *)(halKernelParam->argParams[i].firstValue + 4) = thrdSpaceHeight;
3815 *(uint32_t *)(halKernelParam->argParams[i].firstValue + 8) = thrdSpaceDepth;
3816 }
3817 else if (tempArgs[i].unitKind == CM_ARGUMENT_IMPLICT_GROUPSIZE) {
3818 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3819 *(uint32_t *)halKernelParam->argParams[i].firstValue = grpSpaceWidth;
3820 *(uint32_t *)(halKernelParam->argParams[i].firstValue + 4) = grpSpaceHeight;
3821 *(uint32_t *)(halKernelParam->argParams[i].firstValue + 8) = grpSpaceDepth;
3822 }
3823 else if (tempArgs[i].unitKind == ARG_KIND_IMPLICIT_LOCALID) {
3824 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3825 halKernelParam->localIdIndex = i;
3826 }
3827 else
3828 CreateThreadArgData(&halKernelParam->argParams[i], i, nullptr, tempArgs);
3829
3830 if (halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE)
3831 {
3832 if (IsKernelArg(halKernelParam->argParams[i]))
3833 {
3834 // Kernel arg : calculate curbe size & adjust payloadoffset
3835 if (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID)
3836 {
3837 halKernelParam->argParams[i].payloadOffset -= minKernelPlayloadOffset;
3838 }
3839 else
3840 {
3841 // ARG_KIND_IMPLICIT_LOCALID is only for visa3.3+, need to adjust payloadOffset of local id for visa3.3+ later.
3842 adjustLocalIdPayloadOffset = true;
3843 }
3844
3845 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3846 if ((halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize > kernelCurbeSize))
3847 { // The largest one
3848 kernelCurbeSize = halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize;
3849 }
3850 }
3851 else
3852 {
3853 if ((halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize > kernelCurbeSize) && (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID))
3854 { // The largest one
3855 kernelCurbeSize = halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize;
3856 }
3857 }
3858 }
3859 }
3860 }
3861
3862 if ( m_stateBufferBounded != CM_STATE_BUFFER_NONE )
3863 {
3864 PCM_CONTEXT_DATA cmData = ( PCM_CONTEXT_DATA )m_device->GetAccelData();
3865 PCM_HAL_STATE state = cmData->cmHalState;
3866 kernelCurbeSize = state->pfnGetStateBufferSizeForKernel( state, this );
3867 halKernelParam->stateBufferType = state->pfnGetStateBufferTypeForKernel( state, this );
3868 }
3869
3870 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3871 {
3872 // GPGPU walker - implicit args
3873 for (uint32_t i = numArgs; i < numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM; i++)
3874 {
3875 halKernelParam->argParams[i].unitCount = 1;
3876 halKernelParam->argParams[i].kind = CM_ARGUMENT_GENERAL;
3877 halKernelParam->argParams[i].unitSize = 4;
3878 halKernelParam->argParams[i].payloadOffset = MOS_ALIGN_CEIL(kernelCurbeSize, 4) + (i - numArgs) * sizeof(uint32_t);
3879 halKernelParam->argParams[i].perThread = false;
3880 }
3881
3882 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 0].firstValue, thrdSpaceWidth));
3883 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 1].firstValue, thrdSpaceHeight));
3884 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 2].firstValue, grpSpaceWidth));
3885 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 3].firstValue, grpSpaceHeight));
3886 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 4].firstValue, thrdSpaceWidth));
3887 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 5].firstValue, thrdSpaceHeight));
3888 halKernelParam->localIdIndex = halKernelParam->numArgs - 2;
3889 }
3890 halKernelParam->gpgpuWalkerParams.gpgpuEnabled = true;
3891 halKernelParam->gpgpuWalkerParams.groupWidth = grpSpaceWidth;
3892 halKernelParam->gpgpuWalkerParams.groupHeight = grpSpaceHeight;
3893 halKernelParam->gpgpuWalkerParams.groupDepth = grpSpaceDepth;
3894 halKernelParam->gpgpuWalkerParams.threadHeight = thrdSpaceHeight;
3895 halKernelParam->gpgpuWalkerParams.threadWidth = thrdSpaceWidth;
3896 halKernelParam->gpgpuWalkerParams.threadDepth = thrdSpaceDepth;
3897 //Get SLM size
3898 halKernelParam->slmSize = GetSLMSize();
3899
3900 //Get spill area to adjust scratch space
3901 halKernelParam->spillSize = GetSpillMemUsed();
3902
3903 //Set Barrier mode
3904 halKernelParam->barrierMode = m_barrierMode;
3905 halKernelParam->numberThreadsInGroup = thrdSpaceWidth * thrdSpaceHeight * thrdSpaceDepth;
3906 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3907 kernelCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 4) + CM_GPUWALKER_IMPLICIT_ARG_NUM * sizeof(uint32_t);
3908 else
3909 kernelCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 4);
3910 if ((kernelCurbeSize % 32) == 4) //The per-thread data occupy 2 GRF.
3911 {
3912 halKernelParam->curbeSizePerThread = 64;
3913 }
3914 else
3915 {
3916 halKernelParam->curbeSizePerThread = 32;
3917 }
3918 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3919 halKernelParam->totalCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 32) - halKernelParam->curbeSizePerThread + halKernelParam->curbeSizePerThread *
3920 thrdSpaceWidth * thrdSpaceHeight;
3921 //Since the CURBE is 32 bytes alignment, for GPGPU walker without the user specified thread argument, implicit per-thread id arguments will occupy at most 32 bytes
3922 halKernelParam->crossThreadConstDataLen = MOS_ALIGN_CEIL(kernelCurbeSize, 32) - halKernelParam->curbeSizePerThread;
3923 }
3924 else {
3925 halKernelParam->totalCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 32) + halKernelParam->curbeSizePerThread *
3926 thrdSpaceWidth * thrdSpaceHeight * thrdSpaceDepth;
3927 //Since the CURBE is 32 bytes alignment, for GPGPU walker without the user specified thread argument, implicit per-thread id arguments will occupy at most 32 bytes
3928 halKernelParam->crossThreadConstDataLen = MOS_ALIGN_CEIL(kernelCurbeSize, 32);
3929 }
3930 halKernelParam->payloadSize = 0; // no thread arg allowed
3931
3932 // adjust payloadOffset of local id for visa3.3+
3933 if (adjustLocalIdPayloadOffset)
3934 {
3935 halKernelParam->argParams[halKernelParam->localIdIndex].payloadOffset = halKernelParam->crossThreadConstDataLen;
3936 }
3937
3938 m_sizeInCurbe = GetAlignedCurbeSize(halKernelParam->totalCurbeSize);
3939
3940 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelIndirectData(&halKernelParam->indirectDataParam));
3941
3942 if (m_samplerBtiCount != 0)
3943 {
3944 CmSafeMemCopy((void*)halKernelParam->samplerBTIParam.samplerInfo, (void*)m_samplerBtiEntry, sizeof(m_samplerBtiEntry));
3945 halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
3946
3947 CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
3948 m_samplerBtiCount = 0;
3949 }
3950
3951 CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces);
3952
3953 UpdateKernelDataGlobalSurfaceInfo(halKernelParam);
3954
3955 //Destroy Temp Args
3956 for (uint32_t j = 0; j < numArgs; j++)
3957 {
3958 if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
3959 {
3960 MosSafeDeleteArray(tempArgs[j].value);
3961 }
3962 }
3963 MosSafeDeleteArray(tempArgs);
3964
3965 CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
3966 finish:
3967 if (hr != CM_SUCCESS)
3968 {
3969 //Clean allocated memory : need to count the implicit args
3970 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3971
3972 for (uint32_t i = 0; i < numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM; i++)
3973 {
3974 if (halKernelParam)
3975 {
3976 if (halKernelParam->argParams[i].firstValue)
3977 {
3978 MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
3979 }
3980 }
3981 }
3982 }
3983 else
3984 {
3985 for (uint32_t i = 0; i < numArgs; i++)
3986 {
3987 if (halKernelParam)
3988 {
3989 if (halKernelParam->argParams[i].firstValue)
3990 {
3991 MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
3992 }
3993 }
3994 }
3995 }
3996 //Destroy Temp Args in failing case
3997 if (tempArgs)
3998 {
3999 for (uint32_t j = 0; j < numArgs; j++)
4000 {
4001 if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
4002 {
4003 MosSafeDeleteArray(tempArgs[j].value);
4004 }
4005 }
4006 MosSafeDeleteArray(tempArgs);
4007 }
4008 }
4009 return hr;
4010 }
4011
4012 //*-----------------------------------------------------------------------------
4013 //| Purpose: Prepare Kernel Data including thread args, kernel args
4014 //| Returns: Result of the operation.
4015 //*-----------------------------------------------------------------------------
IsBatchBufferReusable(CmThreadSpaceRT * taskThreadSpace)4016 bool CmKernelRT::IsBatchBufferReusable( CmThreadSpaceRT * taskThreadSpace )
4017 {
4018 bool reusable = true;
4019 //Update m_id if the batch buffer is not reusable.
4020 if (m_dirty & CM_KERNEL_DATA_THREAD_ARG_DIRTY)
4021 {
4022 reusable = false; // if thread arg dirty
4023 }
4024 else if ((m_dirty & CM_KERNEL_DATA_KERNEL_ARG_DIRTY) && (m_curbeEnabled == false))
4025 {
4026 reusable = false; // if kernel arg dirty and curbe disabled
4027 }
4028 else if (m_dirty & CM_KERNEL_DATA_THREAD_COUNT_DIRTY)
4029 {
4030 reusable = false; // if thread count dirty
4031 }
4032 else if (m_threadSpace)
4033 {
4034 if (m_threadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DATA_DIRTY)
4035 {
4036 reusable = false; // if per kernel thread space exists and it is completely dirty
4037 }
4038 }
4039 else if (taskThreadSpace)
4040 {
4041 if (taskThreadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DATA_DIRTY)
4042 {
4043 reusable = false; // if per task thread space change and it is completely dirty
4044 }
4045 }
4046 return reusable;
4047
4048 }
4049
4050 //*-----------------------------------------------------------------------------
4051 //| Purpose: Checks to see if kernel prologue has changed
4052 //| Returns: Result of the operation.
4053 //*-----------------------------------------------------------------------------
IsPrologueDirty(void)4054 bool CmKernelRT::IsPrologueDirty( void )
4055 {
4056 bool prologueDirty = false;
4057
4058 if( m_threadCount != m_lastThreadCount )
4059 {
4060 if( m_lastThreadCount )
4061 {
4062 if( m_threadCount == 1 || m_lastThreadCount == 1 )
4063 {
4064 prologueDirty = true;
4065 }
4066 }
4067 m_lastThreadCount = m_threadCount;
4068 }
4069
4070 if( m_adjustScoreboardY != m_lastAdjustScoreboardY )
4071 {
4072 if( m_lastAdjustScoreboardY )
4073 {
4074 prologueDirty = true;
4075 }
4076 m_lastAdjustScoreboardY = m_adjustScoreboardY;
4077 }
4078
4079 return prologueDirty;
4080 }
4081
4082 //*-----------------------------------------------------------------------------
4083 //| Purpose: Prepare Kernel Data including thread args, kernel args
4084 //| Returns: Result of the operation.
4085 //*-----------------------------------------------------------------------------
CreateKernelDataInternal(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadSpaceRT * threadSpace)4086 int32_t CmKernelRT::CreateKernelDataInternal(
4087 CmKernelData* & kernelData, // out
4088 uint32_t& kernelDataSize, // out
4089 const CmThreadSpaceRT* threadSpace ) // in
4090 {
4091 PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;
4092 int32_t hr = CM_SUCCESS;
4093 uint32_t movInstNum = 0;
4094 uint32_t kernelCurbeSize = 0;
4095 uint32_t numArgs = 0;
4096 uint32_t bottomRange = 1024;
4097 uint32_t upRange = 0;
4098 uint32_t unitSize = 0;
4099 bool hasThreadArg = false;
4100 CmThreadSpaceRT *cmThreadSpace = nullptr;
4101 bool isKernelThreadSpace = false;
4102 CM_ARG *tempArgs = nullptr;
4103 uint32_t argSize = 0;
4104 uint32_t surfNum = 0; //Pass needed BT entry numbers to HAL CM
4105 CmKernelRT *cmKernel = nullptr;
4106
4107 if( threadSpace == nullptr && m_threadSpace!= nullptr)
4108 {
4109 cmThreadSpace = m_threadSpace;
4110 isKernelThreadSpace = true;
4111 }
4112 else
4113 {
4114 cmThreadSpace = const_cast<CmThreadSpaceRT*>(threadSpace);
4115 }
4116
4117 CM_CHK_CMSTATUS_GOTOFINISH(CmKernelData::Create( this, kernelData ));
4118 halKernelParam = kernelData->GetHalCmKernelData();
4119 CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
4120
4121 //Get Num of args with surface array
4122 CM_CHK_CMSTATUS_GOTOFINISH(GetArgCountPlusSurfArray(argSize, numArgs));
4123
4124 if( numArgs > 0)
4125 {
4126 //Create Temp args
4127 CM_CHK_CMSTATUS_GOTOFINISH(CreateTempArgs(numArgs, tempArgs));
4128 //Create move instructions
4129 CM_CHK_CMSTATUS_GOTOFINISH(CreateMovInstructions(movInstNum, halKernelParam->movInsData, tempArgs, numArgs));
4130 }
4131
4132 CM_CHK_CMSTATUS_GOTOFINISH(CalcKernelDataSize(movInstNum, numArgs, argSize, kernelDataSize));
4133 CM_CHK_CMSTATUS_GOTOFINISH(kernelData->SetKernelDataSize(kernelDataSize));
4134
4135 if(!IsBatchBufferReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
4136 {
4137 m_id ++;
4138 }
4139
4140 if( IsPrologueDirty( ) )
4141 {
4142 // can't re-use kernel binary in GSH
4143 // just update upper 16 bits
4144 uint64_t tempID = m_id;
4145 tempID >>= 48;
4146 tempID++;
4147 tempID <<= 48;
4148 // get rid of old values in upper 16 bits
4149 m_id <<= 16;
4150 m_id >>= 16;
4151 m_id |= tempID;
4152 }
4153
4154 halKernelParam->clonedKernelParam.isClonedKernel = m_isClonedKernel;
4155 halKernelParam->clonedKernelParam.kernelID = m_cloneKernelID;
4156 halKernelParam->clonedKernelParam.hasClones = m_hasClones;
4157 halKernelParam->kernelId = m_id; // kernel id , high 32-bit is kernel id, low 32-bit is kernel data id for batch buffer reuse
4158 halKernelParam->numArgs = numArgs;
4159 halKernelParam->numThreads = m_threadCount;
4160 halKernelParam->kernelBinarySize = m_binarySize + movInstNum * CM_MOVE_INSTRUCTION_SIZE;
4161 halKernelParam->kernelDataSize = kernelDataSize;
4162 halKernelParam->movInsDataSize = movInstNum * CM_MOVE_INSTRUCTION_SIZE;
4163
4164 halKernelParam->cmFlags = m_curbeEnabled ? CM_FLAG_CURBE_ENABLED : 0;
4165 halKernelParam->cmFlags |= m_nonstallingScoreboardEnabled ? CM_FLAG_NONSTALLING_SCOREBOARD_ENABLED : 0;
4166 halKernelParam->kernelDebugEnabled = m_blhwDebugEnable;
4167
4168 halKernelParam->kernelBinary = (uint8_t*)m_binary;
4169
4170 CM_CHK_CMSTATUS_GOTOFINISH( kernelData->GetCmKernel( cmKernel ) );
4171 if ( cmKernel == nullptr )
4172 {
4173 return CM_NULL_POINTER;
4174 }
4175 MOS_SecureStrcpy( halKernelParam->kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE, cmKernel->GetName() );
4176
4177 if ( cmThreadSpace )
4178 {// either from per kernel thread space or per task thread space
4179 CM_CHK_CMSTATUS_GOTOFINISH(SortThreadSpace(cmThreadSpace)); // must be called before CreateThreadArgData
4180 }
4181
4182 for(uint32_t i =0 ; i< numArgs; i++)
4183 {
4184 halKernelParam->argParams[i].unitCount = tempArgs[ i ].unitCount;
4185 halKernelParam->argParams[i].kind = (CM_HAL_KERNEL_ARG_KIND)(tempArgs[ i ].unitKind);
4186 halKernelParam->argParams[i].unitSize = tempArgs[ i ].unitSize;
4187 halKernelParam->argParams[i].payloadOffset = tempArgs[ i ].unitOffsetInPayload;
4188 halKernelParam->argParams[i].perThread = (tempArgs[ i ].unitCount > 1) ? true :false;
4189 halKernelParam->argParams[i].nCustomValue = tempArgs[ i ].nCustomValue;
4190 halKernelParam->argParams[i].aliasIndex = tempArgs[ i ].aliasIndex;
4191 halKernelParam->argParams[i].aliasCreated = tempArgs[ i ].aliasCreated;
4192 halKernelParam->argParams[i].isNull = tempArgs[ i ].isNull;
4193
4194 CreateThreadArgData(&halKernelParam->argParams[i], i, cmThreadSpace, tempArgs);
4195
4196 if(CHECK_SURFACE_TYPE ( halKernelParam->argParams[i].kind,
4197 ARG_KIND_SURFACE_VME,
4198 ARG_KIND_SURFACE_SAMPLER,
4199 ARG_KIND_SURFACE2DUP_SAMPLER))
4200 {
4201 unitSize = CM_ARGUMENT_SURFACE_SIZE;
4202 }
4203 else
4204 {
4205 unitSize = halKernelParam->argParams[i].unitSize;
4206 }
4207
4208 if (halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE)
4209 {
4210 if(IsKernelArg(halKernelParam->argParams[i]))
4211 {
4212 // Kernel arg : calculate curbe size & adjust payloadoffset
4213 // Note: Here the payloadOffset may be different from original value
4214 uint32_t offset = halKernelParam->argParams[i].payloadOffset - CM_PAYLOAD_OFFSET;
4215 if (offset >= kernelCurbeSize)
4216 {
4217 kernelCurbeSize = offset + unitSize;
4218 }
4219 halKernelParam->argParams[i].payloadOffset -= CM_PAYLOAD_OFFSET;
4220 }
4221 }
4222
4223 if(!IsKernelArg(halKernelParam->argParams[i]))
4224 { //Thread arg : Calculate payload size & adjust payloadoffset
4225 hasThreadArg = true;
4226 halKernelParam->argParams[i].payloadOffset -= CM_PAYLOAD_OFFSET;
4227
4228 if(halKernelParam->argParams[i].payloadOffset < bottomRange)
4229 {
4230 bottomRange = halKernelParam->argParams[i].payloadOffset;
4231 }
4232 if(halKernelParam->argParams[i].payloadOffset >= upRange)
4233 {
4234 upRange = halKernelParam->argParams[i].payloadOffset + unitSize;
4235 }
4236 }
4237 }
4238
4239 if ( m_stateBufferBounded != CM_STATE_BUFFER_NONE )
4240 {
4241 PCM_CONTEXT_DATA cmData = ( PCM_CONTEXT_DATA )m_device->GetAccelData();
4242 PCM_HAL_STATE state = cmData->cmHalState;
4243 kernelCurbeSize = state->pfnGetStateBufferSizeForKernel( state, this );
4244 halKernelParam->stateBufferType = state->pfnGetStateBufferTypeForKernel( state, this );
4245 }
4246
4247 halKernelParam->payloadSize = hasThreadArg ? MOS_ALIGN_CEIL(upRange - bottomRange, 4): 0;
4248 halKernelParam->totalCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 32);
4249 halKernelParam->curbeSizePerThread = halKernelParam->totalCurbeSize;
4250
4251 halKernelParam->perThreadArgExisted = hasThreadArg;
4252
4253 m_sizeInCurbe = GetAlignedCurbeSize( kernelCurbeSize );
4254
4255 if ( halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE )
4256 {
4257 for(uint32_t i=0; i< numArgs; i++)
4258 {
4259 if(!IsKernelArg(halKernelParam->argParams[i]))
4260 { // thread arg: need to minus curbe size
4261 halKernelParam->argParams[i].payloadOffset -= halKernelParam->curbeSizePerThread;
4262 }
4263 }
4264 }
4265
4266 //Create indirect data
4267 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelIndirectData(&halKernelParam->indirectDataParam));
4268
4269 if ( m_samplerBtiCount != 0 )
4270 {
4271 CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
4272 halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
4273
4274 CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
4275 m_samplerBtiCount = 0;
4276 }
4277
4278 CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces);
4279
4280 //Create thread space param: only avaliable if per kernel ts exists
4281 if(m_threadSpace)
4282 {
4283 CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadSpaceParam(&halKernelParam->kernelThreadSpaceParam, m_threadSpace));
4284 }
4285
4286 //Get SLM size
4287 halKernelParam->slmSize = GetSLMSize();
4288
4289 //Get Spill mem used
4290 halKernelParam->spillSize = GetSpillMemUsed();
4291
4292 //Set Barrier mode
4293 halKernelParam->barrierMode = m_barrierMode;
4294
4295 CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));
4296
4297 //Destroy Temp Args
4298 for (uint32_t j = 0; j < numArgs; j++)
4299 {
4300 if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
4301 {
4302 MosSafeDeleteArray(tempArgs[j].value);
4303 }
4304 }
4305 MosSafeDeleteArray( tempArgs );
4306
4307 CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
4308 finish:
4309 if(hr != CM_SUCCESS)
4310 {
4311 if(halKernelParam)
4312 {
4313 //Clean allocated memory
4314 for(uint32_t i =0 ; i< numArgs; i++)
4315 {
4316 if( halKernelParam->argParams[i].firstValue )
4317 {
4318 MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
4319 }
4320 }
4321 }
4322
4323 //Destroy Temp Args
4324 if (tempArgs)
4325 {
4326 for (uint32_t j = 0; j < numArgs; j++)
4327 {
4328 if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
4329 {
4330 MosSafeDeleteArray(tempArgs[j].value);
4331 }
4332 }
4333 MosSafeDeleteArray(tempArgs);
4334 }
4335 }
4336 return hr;
4337 }
4338
4339 //*-----------------------------------------------------------------------------
4340 //| Purpose: Update kernel data's kernel arg, thread arg, thread count
4341 //| Returns: Result of the operation.
4342 //*-----------------------------------------------------------------------------
UpdateKernelData(CmKernelData * kernelData,const CmThreadSpaceRT * threadSpace)4343 int32_t CmKernelRT::UpdateKernelData(
4344 CmKernelData* kernelData, // in
4345 const CmThreadSpaceRT* threadSpace)
4346 {
4347 int32_t hr = CM_SUCCESS;
4348 PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;
4349 bool bbResuable = true;
4350 CmThreadSpaceRT *cmThreadSpace = nullptr;
4351 bool isKernelThreadSpace = false;
4352 uint32_t argIndexStep = 0;
4353 uint32_t argIndex = 0;
4354 uint32_t surfNum = 0; //Update Number of surface used by kernel
4355
4356 if( threadSpace == nullptr && m_threadSpace!= nullptr)
4357 {
4358 cmThreadSpace = m_threadSpace;
4359 isKernelThreadSpace = true;
4360 }
4361 else
4362 {
4363 cmThreadSpace = const_cast<CmThreadSpaceRT*>(threadSpace);
4364 }
4365
4366 CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
4367 CM_ASSERT(kernelData->IsInUse() == false);
4368
4369 halKernelParam = kernelData->GetHalCmKernelData();
4370 CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
4371
4372 if(!IsBatchBufferReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
4373 {
4374 m_id ++;
4375 halKernelParam->kernelId = m_id;
4376 }
4377
4378 //Update arguments
4379 for(uint32_t orgArgIndex =0 ; orgArgIndex< m_argCount; orgArgIndex++)
4380 {
4381 argIndexStep = 1;
4382
4383 if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4384 ARG_KIND_SURFACE,
4385 ARG_KIND_SURFACE_1D,
4386 ARG_KIND_SURFACE_2D,
4387 ARG_KIND_SURFACE_2D_UP,
4388 ARG_KIND_SURFACE_SAMPLER,
4389 ARG_KIND_SURFACE2DUP_SAMPLER,
4390 ARG_KIND_SURFACE_3D,
4391 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4392 ARG_KIND_SURFACE_SAMPLER8X8_VA,
4393 ARG_KIND_SURFACE_2D_SCOREBOARD,
4394 ARG_KIND_STATE_BUFFER ) )
4395 {
4396 argIndexStep = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array exists
4397 }
4398 else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4399 {
4400 argIndexStep = m_args[orgArgIndex].unitVmeArraySize;
4401 }
4402
4403 if(m_args[ orgArgIndex ].isDirty)
4404 {
4405 if(m_args[ orgArgIndex ].unitCount > 1)
4406 { // thread arg is dirty
4407 bbResuable = false;
4408 }
4409
4410 if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4411 ARG_KIND_SURFACE,
4412 ARG_KIND_SURFACE_1D,
4413 ARG_KIND_SURFACE_2D,
4414 ARG_KIND_SURFACE_2D_UP,
4415 ARG_KIND_SURFACE_SAMPLER,
4416 ARG_KIND_SURFACE2DUP_SAMPLER,
4417 ARG_KIND_SURFACE_3D,
4418 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4419 ARG_KIND_SURFACE_SAMPLER8X8_VA,
4420 ARG_KIND_SURFACE_2D_SCOREBOARD,
4421 ARG_KIND_STATE_BUFFER ) )
4422 { // for surface args
4423
4424 uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
4425 if(m_args[ orgArgIndex ].unitCount == 1) // kernel arg
4426 {
4427 if (numSurfaces > 1)
4428 {
4429 for (uint32_t kk = 0; kk < numSurfaces; kk++)
4430 {
4431 CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
4432 CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4433 m_args[orgArgIndex].value + kk*sizeof(uint32_t), sizeof(uint32_t));
4434 halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4435 halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4436 halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4437
4438 if (!m_args[orgArgIndex].surfIndex[kk])
4439 {
4440 //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
4441 //This is for special usage if there is empty element in surface array.
4442 halKernelParam->argParams[argIndex + kk].kind = CM_ARGUMENT_SURFACE2D;
4443 continue;
4444 }
4445
4446 halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].surfArrayArg[kk].argKindForArray;
4447 halKernelParam->argParams[argIndex + kk].nCustomValue = m_args[orgArgIndex].surfArrayArg[kk].addressModeForArray;
4448 }
4449 }
4450 else
4451 {
4452 CM_ASSERT(halKernelParam->argParams[argIndex].firstValue != nullptr);
4453 CmSafeMemCopy(halKernelParam->argParams[argIndex].firstValue,
4454 m_args[ orgArgIndex ].value, sizeof(uint32_t));
4455 halKernelParam->argParams[argIndex].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[ orgArgIndex ].unitKind;
4456 halKernelParam->argParams[argIndex].aliasIndex = m_args[orgArgIndex].aliasIndex;
4457 halKernelParam->argParams[argIndex].aliasCreated = m_args[orgArgIndex].aliasCreated;
4458 halKernelParam->argParams[argIndex].isNull = m_args[orgArgIndex].isNull;
4459 }
4460
4461 }
4462 else // thread arg
4463 {
4464 uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
4465 uint32_t *surfaces = (uint32_t *)MOS_NewArray(uint8_t, (sizeof(uint32_t) * m_args[orgArgIndex].unitCount));
4466 CM_CHK_NULL_GOTOFINISH(surfaces, CM_OUT_OF_HOST_MEMORY);
4467 for (uint32_t kk=0; kk< numSurfaces ; kk++)
4468 {
4469 for (uint32_t s = 0; s < m_args[orgArgIndex].unitCount; s++)
4470 {
4471 surfaces[s] = *(uint32_t *)((uint32_t *)m_args[orgArgIndex].value + kk + numSurfaces * s);
4472 }
4473 CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4474 surfaces, sizeof(uint32_t) * m_args[orgArgIndex].unitCount);
4475
4476 halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[ orgArgIndex ].unitKind;
4477
4478 halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4479 halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4480 halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4481
4482 }
4483 MosSafeDeleteArray(surfaces);
4484 }
4485
4486 }
4487 else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4488 {
4489 uint32_t numSurfaces = m_args[orgArgIndex].unitVmeArraySize;
4490 if (m_args[orgArgIndex].unitCount == 1) // kernel arg
4491 {
4492 uint32_t vmeSurfOffset = 0;
4493 for (uint32_t kk = 0; kk< numSurfaces; kk++)
4494 {
4495 uint16_t vmeSize = (uint16_t)getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[orgArgIndex].value + vmeSurfOffset));
4496
4497 // reallocate the firstValue for VME surface every time
4498 // since the number of surfaces may vary
4499 MosSafeDeleteArray(halKernelParam->argParams[argIndex + kk].firstValue);
4500 halKernelParam->argParams[argIndex + kk].firstValue = MOS_NewArray(uint8_t, vmeSize);
4501 CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
4502 CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4503 m_args[orgArgIndex].value + vmeSurfOffset, vmeSize);
4504
4505 halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;
4506
4507 halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4508 halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4509 halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4510 halKernelParam->argParams[argIndex + kk].unitSize = vmeSize;
4511 vmeSurfOffset += vmeSize;
4512 }
4513 }
4514 }
4515 else
4516 {
4517 CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadArgData(&halKernelParam->argParams[argIndex ], orgArgIndex, cmThreadSpace, m_args));
4518 }
4519 }
4520 argIndex += argIndexStep;
4521 }
4522
4523 //Update Thread space param
4524 if(m_threadSpace && m_threadSpace->GetDirtyStatus())
4525 {
4526
4527 CM_CHK_CMSTATUS_GOTOFINISH(SortThreadSpace(m_threadSpace));
4528
4529 uint32_t threadSpaceWidth = 0, threadSpaceHeight = 0;
4530 PCM_HAL_KERNEL_THREADSPACE_PARAM cmKernelThreadSpaceParam = &halKernelParam->kernelThreadSpaceParam;
4531 m_threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
4532
4533 cmKernelThreadSpaceParam->threadSpaceWidth = (uint16_t)threadSpaceWidth;
4534 cmKernelThreadSpaceParam->threadSpaceHeight = (uint16_t)threadSpaceHeight;
4535 m_threadSpace->GetDependencyPatternType(cmKernelThreadSpaceParam->patternType);
4536 m_threadSpace->GetWalkingPattern(cmKernelThreadSpaceParam->walkingPattern);
4537 m_threadSpace->GetColorCountMinusOne(cmKernelThreadSpaceParam->colorCountMinusOne);
4538
4539 CM_HAL_DEPENDENCY* dependency = nullptr;
4540 m_threadSpace->GetDependency( dependency);
4541
4542 if(dependency != nullptr)
4543 {
4544 CmSafeMemCopy(&cmKernelThreadSpaceParam->dependencyInfo, dependency, sizeof(CM_HAL_DEPENDENCY));
4545 }
4546
4547 if( m_threadSpace->CheckWalkingParametersSet() )
4548 {
4549 CM_CHK_CMSTATUS_GOTOFINISH(m_threadSpace->GetWalkingParameters(cmKernelThreadSpaceParam->walkingParams));
4550 }
4551
4552 if( m_threadSpace->CheckDependencyVectorsSet() )
4553 {
4554 CM_CHK_CMSTATUS_GOTOFINISH(m_threadSpace->GetDependencyVectors(cmKernelThreadSpaceParam->dependencyVectors));
4555 }
4556
4557 if(m_threadSpace->IsThreadAssociated())
4558 {// media object only
4559 uint32_t *boardOrder = nullptr;
4560 m_threadSpace->GetBoardOrder(boardOrder);
4561 CM_CHK_NULL_GOTOFINISH_CMERROR(boardOrder);
4562
4563 CM_THREAD_SPACE_UNIT *threadSpaceUnit = nullptr;
4564 m_threadSpace->GetThreadSpaceUnit(threadSpaceUnit);
4565 CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpaceUnit);
4566
4567 cmKernelThreadSpaceParam->reuseBBUpdateMask = 0;
4568 for(uint32_t i=0; i< threadSpaceWidth * threadSpaceHeight ; i++)
4569 {
4570 cmKernelThreadSpaceParam->threadCoordinates[i].x = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.x;
4571 cmKernelThreadSpaceParam->threadCoordinates[i].y = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.y;
4572 cmKernelThreadSpaceParam->threadCoordinates[i].mask = threadSpaceUnit[boardOrder[i]].dependencyMask;
4573 cmKernelThreadSpaceParam->threadCoordinates[i].resetMask = threadSpaceUnit[boardOrder[i]].reset;
4574 cmKernelThreadSpaceParam->threadCoordinates[i].color = threadSpaceUnit[boardOrder[i]].scoreboardColor;
4575 cmKernelThreadSpaceParam->threadCoordinates[i].sliceSelect = threadSpaceUnit[boardOrder[i]].sliceDestinationSelect;
4576 cmKernelThreadSpaceParam->threadCoordinates[i].subSliceSelect = threadSpaceUnit[boardOrder[i]].subSliceDestinationSelect;
4577 cmKernelThreadSpaceParam->reuseBBUpdateMask |= threadSpaceUnit[boardOrder[i]].reset;
4578 }
4579
4580 if( cmKernelThreadSpaceParam->patternType == CM_WAVEFRONT26Z )
4581 {
4582 CM_HAL_WAVEFRONT26Z_DISPATCH_INFO dispatchInfo;
4583 m_threadSpace->GetWavefront26ZDispatchInfo(dispatchInfo);
4584
4585 if (cmKernelThreadSpaceParam->dispatchInfo.numWaves >= dispatchInfo.numWaves)
4586 {
4587 cmKernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
4588 CmSafeMemCopy(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));
4589 }
4590 else
4591 {
4592 cmKernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
4593 MosSafeDeleteArray(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave);
4594 cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave = MOS_NewArray(uint32_t, dispatchInfo.numWaves);
4595 CM_CHK_NULL_GOTOFINISH(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, CM_OUT_OF_HOST_MEMORY);
4596 CmSafeMemCopy(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));
4597 }
4598 }
4599 }
4600 }
4601
4602 // Update indirect data
4603 if( m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY)
4604 {
4605 halKernelParam->indirectDataParam.indirectDataSize = m_usKernelPayloadDataSize;
4606 halKernelParam->indirectDataParam.surfaceCount = m_usKernelPayloadSurfaceCount;
4607
4608 if(m_usKernelPayloadDataSize != 0)
4609 {
4610 if(m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY)
4611 { // size change, need to reallocate
4612 MosSafeDeleteArray(halKernelParam->indirectDataParam.indirectData);
4613 halKernelParam->indirectDataParam.indirectData = MOS_NewArray(uint8_t, m_usKernelPayloadDataSize);
4614 CM_CHK_NULL_GOTOFINISH(halKernelParam->indirectDataParam.indirectData, CM_OUT_OF_HOST_MEMORY);
4615 }
4616 CmSafeMemCopy(halKernelParam->indirectDataParam.indirectData, (void *)m_kernelPayloadData, m_usKernelPayloadDataSize);
4617 }
4618
4619 if(m_usKernelPayloadSurfaceCount != 0)
4620 {
4621 if(m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY)
4622 { // size change, need to reallocate
4623 MosSafeDeleteArray(halKernelParam->indirectDataParam.surfaceInfo);
4624 halKernelParam->indirectDataParam.surfaceInfo = MOS_NewArray(CM_INDIRECT_SURFACE_INFO, m_usKernelPayloadSurfaceCount);
4625 CM_CHK_NULL_GOTOFINISH(halKernelParam->indirectDataParam.surfaceInfo, CM_OUT_OF_HOST_MEMORY);
4626
4627 }
4628 CmSafeMemCopy((void*)halKernelParam->indirectDataParam.surfaceInfo, (void*)m_IndirectSurfaceInfoArray,
4629 m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4630 //clear m_IndirectSurfaceInfoArray every enqueue
4631 CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4632 m_usKernelPayloadSurfaceCount = 0;
4633 }
4634 }
4635
4636 if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
4637 {
4638 if ( m_samplerBtiCount != 0 )
4639 {
4640 CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
4641 halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
4642
4643 CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
4644 m_samplerBtiCount = 0;
4645 }
4646 }
4647 CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));
4648
4649 CM_CHK_CMSTATUS_GOTOFINISH(CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces));
4650
4651 CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
4652
4653 finish:
4654 if( hr != CM_SUCCESS)
4655 {
4656 if( halKernelParam )
4657 {
4658 MosSafeDeleteArray(halKernelParam->indirectDataParam.indirectData);
4659 MosSafeDeleteArray(halKernelParam->indirectDataParam.surfaceInfo);
4660 }
4661 }
4662 return hr;
4663 }
4664
4665 //*-----------------------------------------------------------------------------
4666 //| Purpose: Update kernel data's kernel arg, thread arg, thread count
4667 //| Returns: Result of the operation.
4668 //*-----------------------------------------------------------------------------
UpdateKernelData(CmKernelData * kernelData,const CmThreadGroupSpace * threadGroupSpace)4669 int32_t CmKernelRT::UpdateKernelData(
4670 CmKernelData* kernelData, // in
4671 const CmThreadGroupSpace* threadGroupSpace ) // in
4672 {
4673 int32_t hr = CM_SUCCESS;
4674 PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;
4675 uint32_t argIndexStep = 0;
4676 uint32_t argIndex = 0;
4677 uint32_t surfNum = 0;
4678 auto getVersionAsInt = [](int major, int minor) { return major * 100 + minor; };
4679
4680 CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
4681 CM_ASSERT(kernelData->IsInUse() == false);
4682
4683 halKernelParam = kernelData->GetHalCmKernelData();
4684 CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
4685
4686 CM_CHK_NULL_GOTOFINISH_CMERROR(threadGroupSpace);
4687
4688 //Update arguments
4689 for(uint32_t orgArgIndex =0 ; orgArgIndex< m_argCount; orgArgIndex++)
4690 {
4691 argIndexStep = 1;
4692
4693 if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4694 ARG_KIND_SURFACE,
4695 ARG_KIND_SURFACE_1D,
4696 ARG_KIND_SURFACE_2D,
4697 ARG_KIND_SURFACE_2D_UP,
4698 ARG_KIND_SURFACE_SAMPLER,
4699 ARG_KIND_SURFACE2DUP_SAMPLER,
4700 ARG_KIND_SURFACE_3D,
4701 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4702 ARG_KIND_SURFACE_SAMPLER8X8_VA,
4703 ARG_KIND_SURFACE_2D_SCOREBOARD,
4704 ARG_KIND_STATE_BUFFER ) )
4705 {
4706 argIndexStep = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array exists
4707 }
4708 else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4709 {
4710 argIndexStep = m_args[orgArgIndex].unitVmeArraySize;
4711 }
4712
4713 if(m_args[ orgArgIndex ].isDirty)
4714 {
4715 if(m_args[ orgArgIndex ].unitCount > 1)
4716 { // thread arg is dirty
4717 CM_ASSERTMESSAGE("Error: Thread arg is not allowed in GPGPU walker.");
4718 hr = CM_FAILURE; // Thread arg is not allowed in GPGPU walker
4719 goto finish;
4720 }
4721
4722 if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4723 ARG_KIND_SURFACE,
4724 ARG_KIND_SURFACE_1D,
4725 ARG_KIND_SURFACE_2D,
4726 ARG_KIND_SURFACE_2D_UP,
4727 ARG_KIND_SURFACE_SAMPLER,
4728 ARG_KIND_SURFACE2DUP_SAMPLER,
4729 ARG_KIND_SURFACE_3D,
4730 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4731 ARG_KIND_SURFACE_SAMPLER8X8_VA,
4732 ARG_KIND_SURFACE_2D_SCOREBOARD,
4733 ARG_KIND_STATE_BUFFER ) )
4734 { // for surface args
4735 uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
4736 if(m_args[ orgArgIndex ].unitCount == 1) // kernel arg
4737 {
4738 if (numSurfaces > 1 )
4739 {
4740 for(uint32_t kk=0; kk< numSurfaces ; kk++)
4741 {
4742 CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue
4743 != nullptr);
4744 CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4745 m_args[ orgArgIndex ].value + kk*sizeof(uint32_t),
4746 sizeof(uint32_t));
4747 halKernelParam->argParams[argIndex + kk].aliasIndex
4748 = m_args[orgArgIndex].aliasIndex;
4749 halKernelParam->argParams[argIndex + kk].aliasCreated
4750 = m_args[orgArgIndex].aliasCreated;
4751 halKernelParam->argParams[argIndex + kk].isNull
4752 = m_args[orgArgIndex].isNull;
4753
4754 if (!m_args[orgArgIndex].surfIndex[kk])
4755 {
4756 //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
4757 //This is for special usage if there is empty element in surface array.
4758 halKernelParam->argParams[argIndex + kk].kind = CM_ARGUMENT_SURFACE2D;
4759 continue;
4760 }
4761 halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4762 halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].surfArrayArg[kk].argKindForArray;
4763 halKernelParam->argParams[argIndex + kk].nCustomValue = m_args[orgArgIndex].surfArrayArg[kk].addressModeForArray;
4764
4765 }
4766 }
4767 else
4768 {
4769 CM_ASSERT(halKernelParam->argParams[argIndex].firstValue != nullptr);
4770 halKernelParam->argParams[argIndex].kind
4771 = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;
4772 halKernelParam->argParams[argIndex].aliasIndex
4773 = m_args[orgArgIndex].aliasIndex;
4774 halKernelParam->argParams[argIndex].aliasCreated
4775 = m_args[orgArgIndex].aliasCreated;
4776 halKernelParam->argParams[argIndex].isNull
4777 = m_args[orgArgIndex].isNull;
4778 if (halKernelParam->argParams[argIndex].isNull)
4779 {
4780 *(halKernelParam->argParams[argIndex].firstValue)
4781 = 0;
4782 }
4783 else
4784 {
4785 CmSafeMemCopy(
4786 halKernelParam->argParams[argIndex].firstValue,
4787 m_args[orgArgIndex].value, sizeof(uint32_t));
4788 }
4789 }
4790 }
4791 }
4792 else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4793 {
4794 uint32_t numSurfaces = m_args[orgArgIndex].unitVmeArraySize;
4795 if (m_args[orgArgIndex].unitCount == 1) // kernel arg
4796 {
4797 uint32_t vmeSurfOffset = 0;
4798 for (uint32_t kk = 0; kk< numSurfaces; kk++)
4799 {
4800 uint32_t vmeSize = getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[orgArgIndex].value + vmeSurfOffset));
4801
4802 // reallocate the firstValue for VME surface every time
4803 // since the number of surfaces may vary
4804 MosSafeDeleteArray(halKernelParam->argParams[argIndex + kk].firstValue);
4805 halKernelParam->argParams[argIndex + kk].firstValue = MOS_NewArray(uint8_t, vmeSize);
4806 CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
4807 CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4808 m_args[orgArgIndex].value + vmeSurfOffset, vmeSize);
4809
4810 halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;
4811
4812 halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4813 halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4814 halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4815 halKernelParam->argParams[argIndex + kk].unitSize = m_args[orgArgIndex].unitSize;
4816 vmeSurfOffset += vmeSize;
4817 }
4818 }
4819 }
4820 else
4821 {
4822 CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadArgData(&halKernelParam->argParams[argIndex ], orgArgIndex, nullptr, m_args));
4823 }
4824 }
4825 argIndex += argIndexStep;
4826 }
4827
4828 if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
4829 {
4830 if ( m_samplerBtiCount != 0 )
4831 {
4832 CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
4833 halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
4834
4835 CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
4836 m_samplerBtiCount = 0;
4837 }
4838 }
4839
4840 CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));
4841
4842 CM_CHK_CMSTATUS_GOTOFINISH(CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces));
4843
4844 // GPGPU walker - implicit args
4845 uint32_t thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth;
4846 threadGroupSpace->GetThreadGroupSpaceSize(thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth);
4847
4848 halKernelParam->gpgpuWalkerParams.groupDepth = grpSpaceDepth;
4849 halKernelParam->gpgpuWalkerParams.groupHeight = grpSpaceHeight;
4850 halKernelParam->gpgpuWalkerParams.groupWidth = grpSpaceWidth;
4851 halKernelParam->gpgpuWalkerParams.threadDepth = thrdSpaceDepth;
4852 halKernelParam->gpgpuWalkerParams.threadWidth = thrdSpaceWidth;
4853 halKernelParam->gpgpuWalkerParams.threadHeight = thrdSpaceHeight;
4854
4855 if (getVersionAsInt(m_program->m_cisaMajorVersion, m_program->m_cisaMinorVersion) < getVersionAsInt(3, 3))
4856 {
4857 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 0].firstValue, thrdSpaceWidth));
4858 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 1].firstValue, thrdSpaceHeight));
4859 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 2].firstValue, grpSpaceWidth));
4860 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 3].firstValue, grpSpaceHeight));
4861 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 4].firstValue, thrdSpaceWidth));
4862 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 5].firstValue, thrdSpaceHeight));
4863 }
4864
4865 CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
4866 finish:
4867 return hr;
4868 }
4869
4870 //*-----------------------------------------------------------------------------
4871 //| Purpose: Create kernel indirect data
4872 //| Returns: Result of the operation.
4873 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::CreateKernelIndirectData(
    PCM_HAL_INDIRECT_DATA_PARAM halIndirectData )    // in/out
{
    // Copies the kernel's cached indirect payload (raw data bytes plus the
    // per-surface info table) into the HAL indirect-data block, allocating
    // the destination arrays on first use. On success the cached surface
    // table is cleared (it is consumed once per enqueue). Returns CM_SUCCESS
    // or CM_OUT_OF_HOST_MEMORY; on failure both destination arrays are freed.
    int32_t hr = CM_SUCCESS;

    halIndirectData->indirectDataSize = m_usKernelPayloadDataSize;
    halIndirectData->surfaceCount = m_usKernelPayloadSurfaceCount;

    // Lazily allocate the raw payload buffer; a non-null pointer passed in
    // is reused as-is.
    if( halIndirectData->indirectData == nullptr && m_usKernelPayloadDataSize != 0)
    {
        halIndirectData->indirectData = MOS_NewArray(uint8_t, halIndirectData->indirectDataSize);
        CM_CHK_NULL_GOTOFINISH(halIndirectData->indirectData, CM_OUT_OF_HOST_MEMORY);
    }

    // For future kernel data, pKbyte is starting point
    if( halIndirectData->surfaceInfo == nullptr && m_usKernelPayloadSurfaceCount != 0)
    {
        halIndirectData->surfaceInfo = MOS_NewArray(CM_INDIRECT_SURFACE_INFO, halIndirectData->surfaceCount);
        CM_CHK_NULL_GOTOFINISH(halIndirectData->surfaceInfo, CM_OUT_OF_HOST_MEMORY);
    }

    if(m_usKernelPayloadDataSize != 0)
    {
        CmSafeMemCopy(halIndirectData->indirectData, (void *)m_kernelPayloadData, m_usKernelPayloadDataSize);
    }

    if(m_usKernelPayloadSurfaceCount != 0)
    {
        CmSafeMemCopy((void*)halIndirectData->surfaceInfo, (void*)m_IndirectSurfaceInfoArray,
                    m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
        //clear m_IndirectSurfaceInfoArray every enqueue
        CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
        m_usKernelPayloadSurfaceCount = 0;
    }
finish:
    // On any failure release whatever was allocated (MosSafeDeleteArray
    // nulls the pointers, so a caller-retry starts clean).
    if( hr != CM_SUCCESS)
    {
        if(halIndirectData->indirectData)                 MosSafeDeleteArray(halIndirectData->indirectData);
        if(halIndirectData->surfaceInfo)                  MosSafeDeleteArray(halIndirectData->surfaceInfo);
    }
    return hr;
}
4916
4917 //*-----------------------------------------------------------------------------
4918 //| Purpose: UpdateLastKernelData
4919 //| Returns: Result of the operation.
4920 //*-----------------------------------------------------------------------------
UpdateLastKernelData(CmKernelData * & kernelData)4921 int32_t CmKernelRT::UpdateLastKernelData(
4922 CmKernelData* & kernelData) // in
4923 {
4924 int32_t hr = CM_SUCCESS;
4925
4926 if( kernelData == nullptr || m_lastKernelData == kernelData )
4927 {
4928 CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
4929 return CM_NULL_POINTER;
4930 }
4931
4932 if(m_lastKernelData)
4933 {
4934 CmKernelData::Destroy(m_lastKernelData); // reduce ref count or delete it
4935 }
4936 CSync* kernelLock = m_device->GetProgramKernelLock();
4937 CLock locker(*kernelLock);
4938 m_lastKernelData = kernelData;
4939 m_lastKernelData->Acquire();
4940 m_lastKernelDataSize = m_lastKernelData->GetKernelDataSize();
4941
4942 return hr;
4943 }
4944
4945 //*-----------------------------------------------------------------------------
4946 //| Purpose: Wrapper of CmKernelData::Destroy.
4947 //| Returns: Result of the operation.
4948 //*-----------------------------------------------------------------------------
ReleaseKernelData(CmKernelData * & kernelData)4949 int32_t CmKernelRT::ReleaseKernelData(
4950 CmKernelData* & kernelData)
4951 {
4952 int32_t hr = CM_SUCCESS;
4953
4954 if( kernelData == nullptr)
4955 {
4956 CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
4957 return CM_NULL_POINTER;
4958 }
4959
4960 CSync* kernelLock = m_device->GetProgramKernelLock();
4961 CLock locker(*kernelLock);
4962
4963 if(m_lastKernelData == kernelData)
4964 {
4965 // If the kernel data is the last kernel data
4966 // Need to update m_lastKernelData.
4967 hr = CmKernelData::Destroy(m_lastKernelData);
4968 }
4969 else
4970 {
4971 hr = CmKernelData::Destroy(kernelData);
4972 }
4973
4974 return hr;
4975 }
4976
4977 //*-----------------------------------------------------------------------------
4978 //| Purpose: Acquire Kernel and Program
4979 //*-----------------------------------------------------------------------------
AcquireKernelProgram()4980 int32_t CmKernelRT::AcquireKernelProgram()
4981 {
4982 CSync* kernelLock = m_device->GetProgramKernelLock();
4983 CLock locker(*kernelLock);
4984
4985 this->Acquire(); // increase kernel's ref count
4986 m_program->Acquire(); // increase program's ref count
4987
4988 return CM_SUCCESS;
4989 }
4990
4991 //*-----------------------------------------------------------------------------
4992 //| Purpose: Acquire KenrelData, Kernel and Program
4993 //*-----------------------------------------------------------------------------
AcquireKernelData(CmKernelData * & kernelData)4994 int32_t CmKernelRT::AcquireKernelData(
4995 CmKernelData * &kernelData)
4996 {
4997 int32_t hr = CM_SUCCESS;
4998
4999 if (kernelData == nullptr)
5000 {
5001 CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
5002 return CM_NULL_POINTER;
5003 }
5004
5005 CSync* kernelLock = m_device->GetProgramKernelLock();
5006 CLock locker(*kernelLock);
5007 kernelData->Acquire(); // increase kernel data's ref count
5008
5009 return hr;
5010 }
5011
SetAsClonedKernel(uint32_t cloneKernelID)5012 void CmKernelRT::SetAsClonedKernel(uint32_t cloneKernelID)
5013 {
5014 m_isClonedKernel = true;
5015 m_cloneKernelID = cloneKernelID;
5016 }
5017
GetCloneKernelID(uint32_t & cloneKernelID)5018 bool CmKernelRT::GetCloneKernelID(uint32_t& cloneKernelID)
5019 {
5020 if (m_isClonedKernel)
5021 {
5022 cloneKernelID = m_cloneKernelID;
5023 return true;
5024 }
5025
5026 return false;
5027 }
5028
SetHasClones()5029 void CmKernelRT::SetHasClones()
5030 {
5031 m_hasClones = true;
5032 }
5033
5034 //*-----------------------------------------------------------------------------
5035 //| Purpose: Clone/copy current kernel
5036 //| Returns: New kernel with content of source kernel
5037 //*-----------------------------------------------------------------------------
CloneKernel(CmKernelRT * & kernelOut,uint32_t id)5038 int32_t CmKernelRT::CloneKernel(CmKernelRT *& kernelOut, uint32_t id)
5039 {
5040 int32_t hr = CM_SUCCESS;
5041
5042 CSync* kernelLock = m_device->GetProgramKernelLock();
5043 CLock locker(*kernelLock);
5044
5045 CmDynamicArray * kernelArray = m_device->GetKernelArray();
5046
5047 uint32_t freeSlotinKernelArray = kernelArray->GetFirstFreeIndex();
5048
5049 hr = Create(m_device, m_program, (char*)GetName(), freeSlotinKernelArray, id, kernelOut, m_options);
5050
5051 if (hr == CM_SUCCESS)
5052 {
5053 kernelOut->SetAsClonedKernel(m_id >> 32);
5054 kernelArray->SetElement(freeSlotinKernelArray, kernelOut);
5055 uint32_t *kernelCount = m_device->GetKernelCount();
5056 *kernelCount = *kernelCount + 1;
5057
5058 SetHasClones();
5059 }
5060
5061 return hr;
5062 }
5063
5064 //*-----------------------------------------------------------------------------
5065 //| Purpose: Set Kernel's index in one task
5066 //| Returns: Result of the operation.
5067 //*-----------------------------------------------------------------------------
SetIndexInTask(uint32_t index)5068 int32_t CmKernelRT::SetIndexInTask(uint32_t index)
5069 {
5070 m_indexInTask = index;
5071 return CM_SUCCESS;
5072 }
5073
5074 //*-----------------------------------------------------------------------------
5075 //| Purpose: Get Kernel's index in one task
5076 //| Returns: Result of the operation.
5077 //*-----------------------------------------------------------------------------
GetIndexInTask(void)5078 uint32_t CmKernelRT::GetIndexInTask(void)
5079 {
5080 return m_indexInTask;
5081 }
5082
5083 //*-----------------------------------------------------------------------------
5084 //| Purpose: Set Associated Flag
5085 //| Returns: Result of the operation.
5086 //*-----------------------------------------------------------------------------
SetAssociatedToTSFlag(bool b)5087 int32_t CmKernelRT::SetAssociatedToTSFlag(bool b)
5088 {
5089 m_threadSpaceAssociated = b;
5090 return CM_SUCCESS;
5091 }
5092
5093 //*-----------------------------------------------------------------------------
5094 //| Purpose: Set threadspace for kernel
5095 //| Returns: Result of the operation.
5096 //| Note: It's exclusive with AssociateThreadGroupSpace()
5097 //*-----------------------------------------------------------------------------
CM_RT_API int32_t CmKernelRT::AssociateThreadSpace(CmThreadSpace *&threadSpace)
{
    // Binds a thread space to this kernel and derives the kernel's thread
    // count from the space's width x height. On platforms without media-mode
    // support, the call is redirected to AssociateThreadGroupSpace() using
    // the group space embedded in the thread space. Exclusive with a prior
    // AssociateThreadGroupSpace() call.
    if( threadSpace == nullptr )
    {
        CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
        return CM_INVALID_ARG_VALUE;
    }

    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
    {
        // GPGPU-only platform: fall back to the thread group space the
        // runtime generated inside this thread space.
        CmThreadSpaceRT *threadSpaceRTConst = static_cast<CmThreadSpaceRT *>(threadSpace);
        if (threadSpaceRTConst == nullptr)  // defensive; threadSpace was checked above
        {
            CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
            return CM_INVALID_ARG_VALUE;
        }
        CmThreadGroupSpace *threadGroupSpace = threadSpaceRTConst->GetThreadGroupSpace();
        return AssociateThreadGroupSpace(threadGroupSpace);
    }
    else
    {
        if (m_threadGroupSpace != nullptr)
        {
            CM_ASSERTMESSAGE("Error: It's exclusive with AssociateThreadGroupSpace().");
            return CM_INVALID_KERNEL_THREADSPACE;
        }
    }

    // Detect re-association with a different thread space so its data can
    // be marked dirty below (after m_threadSpace is updated).
    bool threadSpaceChanged = false;
    if( m_threadSpace )
    {
        if( m_threadSpace != static_cast<CmThreadSpaceRT *>(threadSpace) )
        {
            threadSpaceChanged = true;
        }
    }

    m_threadSpace = static_cast<CmThreadSpaceRT *>(threadSpace);

    uint32_t threadSpaceWidth = 0;
    uint32_t threadSpaceHeight = 0;
    m_threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
    uint32_t threadCount = threadSpaceWidth * threadSpaceHeight;
    if (m_threadCount)
    {
        // Setting threadCount twice with different values will cause reset of kernels
        if (m_threadCount != threadCount)
        {
            m_threadCount = threadCount;
            m_dirty |= CM_KERNEL_DATA_THREAD_COUNT_DIRTY;
        }
    }
    else // first time
    {
        m_threadCount = threadCount;
    }

    if( threadSpaceChanged )
    {
        m_threadSpace->SetDirtyStatus( CM_THREAD_SPACE_DATA_DIRTY);
    }

    return CM_SUCCESS;
}
5163
5164 //*-----------------------------------------------------------------------------
5165 //| Purpose: Set thread group space for kernel
5166 //| Returns: Result of the operation.
5167 //| Note: It's exclusive with AssociateThreadSpace()
5168 //*-----------------------------------------------------------------------------
AssociateThreadGroupSpace(CmThreadGroupSpace * & threadGroupSpace)5169 CM_RT_API int32_t CmKernelRT::AssociateThreadGroupSpace(CmThreadGroupSpace *&threadGroupSpace)
5170 {
5171 if( threadGroupSpace == nullptr )
5172 {
5173 CM_ASSERTMESSAGE("Error: Invalid null pointer.");
5174 return CM_INVALID_ARG_VALUE;
5175 }
5176
5177 if (m_threadSpace != nullptr)
5178 {
5179 CM_ASSERTMESSAGE("Error: It's exclusive with AssociateThreadSpace().");
5180 return CM_INVALID_KERNEL_THREADGROUPSPACE;
5181 }
5182
5183 m_threadGroupSpace = threadGroupSpace;
5184
5185 return CM_SUCCESS;
5186 }
5187
5188 //*-----------------------------------------------------------------------------
5189 //| Purpose: Clear threadspace for kernel
5190 //| Returns: Result of the operation.
5191 //*-----------------------------------------------------------------------------
DeAssociateThreadSpace(CmThreadSpace * & threadSpace)5192 CM_RT_API int32_t CmKernelRT::DeAssociateThreadSpace(CmThreadSpace * &threadSpace)
5193 {
5194 if (threadSpace == nullptr)
5195 {
5196 CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5197 return CM_NULL_POINTER;
5198 }
5199
5200 PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
5201 if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
5202 {
5203 CmThreadSpaceRT *threadSpaceRTConst = static_cast<CmThreadSpaceRT *>(threadSpace);
5204 if (threadSpaceRTConst == nullptr)
5205 {
5206 CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5207 return CM_INVALID_ARG_VALUE;
5208 }
5209
5210 CmThreadGroupSpace *threadGroupSpace = threadSpaceRTConst->GetThreadGroupSpace();
5211 if (m_threadGroupSpace != threadGroupSpace)
5212 {
5213 CM_ASSERTMESSAGE("Error: Invalid thread group space handle.");
5214 return CM_INVALID_ARG_VALUE;
5215 }
5216 m_threadGroupSpace = nullptr;
5217 }
5218 else
5219 {
5220 if (m_threadSpace != static_cast<CmThreadSpaceRT *>(threadSpace))
5221 {
5222 CM_ASSERTMESSAGE("Error: Invalid thread space handle.");
5223 return CM_INVALID_ARG_VALUE;
5224 }
5225 m_threadSpace = nullptr;
5226 }
5227
5228 return CM_SUCCESS;
5229 }
5230 //*--------------------------------------------------------------------------------------------
5231 //| Purpose: query spill memory size, the function can only take effect when jitter is enabled
5232 //| Return: Result of the operation.
5233 //*---------------------------------------------------------------------------------------------
5234
QuerySpillSize(uint32_t & spillMemorySize)5235 CM_RT_API int32_t CmKernelRT::QuerySpillSize(uint32_t &spillMemorySize)
5236 {
5237 CM_KERNEL_INFO *kernelInfo = nullptr;
5238
5239 int32_t hr = m_program->GetKernelInfo(m_kernelIndex, kernelInfo);
5240 if (hr != CM_SUCCESS || kernelInfo == nullptr)
5241 return hr;
5242
5243 if (m_program->IsJitterEnabled()) {
5244 if (kernelInfo->jitInfo != nullptr) {
5245 spillMemorySize = (kernelInfo->jitInfo)->spillMemUsed;
5246 return hr;
5247 }
5248 else
5249 return CM_FAILURE;
5250 }
5251
5252 return CM_FAILURE;
5253 }
5254
5255 //*-----------------------------------------------------------------------------
5256 //| Purpose: Clear threadgroupspace for kernel
5257 //| Returns: Result of the operation.
5258 //*-----------------------------------------------------------------------------
DeAssociateThreadGroupSpace(CmThreadGroupSpace * & threadGroupSpace)5259 int32_t CmKernelRT::DeAssociateThreadGroupSpace(CmThreadGroupSpace * &threadGroupSpace)
5260 {
5261 if (threadGroupSpace == nullptr)
5262 {
5263 CM_ASSERTMESSAGE("Error: Invalid null pointer.");
5264 return CM_NULL_POINTER;
5265 }
5266 if (m_threadGroupSpace != threadGroupSpace)
5267 {
5268 CM_ASSERTMESSAGE("Error: Invalid thread group space handle.");
5269 return CM_INVALID_ARG_VALUE;
5270 }
5271 m_threadGroupSpace = nullptr;
5272 m_dirty = CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY;
5273
5274 return CM_SUCCESS;
5275 }
5276
5277 //*-----------------------------------------------------------------------------
5278 //| Purpose: Indicate whether thread arg existed.
5279 //| Returns: Result of the operation.
5280 //*-----------------------------------------------------------------------------
IsThreadArgExisted()5281 bool CmKernelRT::IsThreadArgExisted()
5282 {
5283 return m_perThreadArgExists;
5284 }
5285
5286 //*-----------------------------------------------------------------------------
5287 //| Purpose: Get the size of SharedLocalMemory
5288 //| Returns: Result of the operation.
5289 //*-----------------------------------------------------------------------------
GetSLMSize()5290 uint32_t CmKernelRT::GetSLMSize()
5291 {
5292 return (uint32_t)m_kernelInfo->kernelSLMSize;
5293 }
5294
5295 //*-----------------------------------------------------------------------------
5296 //| Purpose: Get the spill size of the kernel from JIT
5297 //| Returns: Result of the operation.
5298 //*-----------------------------------------------------------------------------
GetSpillMemUsed()5299 uint32_t CmKernelRT::GetSpillMemUsed()
5300 {
5301 uint32_t spillSize;
5302
5303 if (m_program->IsJitterEnabled() && m_kernelInfo->jitInfo != nullptr)
5304 {
5305 spillSize = (m_kernelInfo->jitInfo)->spillMemUsed;
5306 }
5307 else
5308 {
5309 // kernel uses "--nojitter" option, don't allocate scratch space
5310 spillSize = 0;
5311 }
5312
5313 return spillSize;
5314 }
5315
SearchAvailableIndirectSurfInfoTableEntry(uint16_t kind,uint32_t surfaceIndex,uint32_t bti)5316 int32_t CmKernelRT::SearchAvailableIndirectSurfInfoTableEntry(uint16_t kind, uint32_t surfaceIndex, uint32_t bti)
5317 {
5318 uint16_t i = 0;
5319 for ( i = 0; i < CM_MAX_STATIC_SURFACE_STATES_PER_BT; i++ )
5320 {
5321 if ( ( ( m_IndirectSurfaceInfoArray[ i ].surfaceIndex == surfaceIndex ) && ( m_IndirectSurfaceInfoArray[ i ].kind == kind ) && ( m_IndirectSurfaceInfoArray[ i ].bindingTableIndex == bti ) ) ||
5322 ( ( m_IndirectSurfaceInfoArray[ i ].surfaceIndex == 0 ) && ( m_IndirectSurfaceInfoArray[ i ].kind == 0 ) ) )
5323 {
5324 return i;
5325 }
5326 }
5327 // should never reach this
5328 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5329 return CM_FAILURE;
5330 }
5331
5332 //-----------------------------------------------------------------------------------------------------------------
5333 //! Set surface binding table index count for each indirect surface
5334 //! INPUT:
5335 //! 1) Surface format
5336 //! 2) Surface type.
5337 //! OUTPUT:
5338 //! binding table index count
5339 //-----------------------------------------------------------------------------------------------------------------
SetSurfBTINumForIndirectData(CM_SURFACE_FORMAT format,CM_ENUM_CLASS_TYPE surfaceType)5340 int32_t CmKernelRT::SetSurfBTINumForIndirectData(CM_SURFACE_FORMAT format, CM_ENUM_CLASS_TYPE surfaceType)
5341 {
5342 if (surfaceType == CM_ENUM_CLASS_TYPE_CMBUFFER_RT)
5343 {
5344 return 1;
5345 }
5346 else
5347 {
5348 if ((format == CM_SURFACE_FORMAT_NV12) ||
5349 (format == CM_SURFACE_FORMAT_P010) ||
5350 (format == CM_SURFACE_FORMAT_P208) ||
5351 (format == CM_SURFACE_FORMAT_P016))
5352 {
5353 return 2;
5354 }
5355 else if (format == CM_SURFACE_FORMAT_422H ||
5356 format == CM_SURFACE_FORMAT_411P ||
5357 format == CM_SURFACE_FORMAT_IMC3 ||
5358 format == CM_SURFACE_FORMAT_422V ||
5359 format == CM_SURFACE_FORMAT_444P)
5360 { // 3 planes surface
5361 return 3;
5362 }
5363 else
5364 {
5365 return 1;
5366 }
5367 }
5368 // should never reach this
5369 CM_ASSERTMESSAGE("Error: Set surface binding table index count failure.");
5370 return 0;
5371 }
5372
5373 //-----------------------------------------------------------------------------------------------------------------
5374 //! Set surface binding table index by user.
5375 //! If application hope to assign a specific binding table index for a surface, it should call this function.
5376 //! The assigned binding table index should be an valid value for general surface ( say >=1 and <=242),
5377 //! otherwise, this call will return failure.
5378 //! INPUT:
5379 //! 1) Surface whose binding table index need be set.
5380 //! 2) Assiend binding table index.
5381 //! OUTPUT:
5382 //! CM_SUCCESS
5383 //! CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX if the surface index is not a valid binding table index (valid: 1~242)
5384 //! CM_FAILURE otherwise
5385 //-----------------------------------------------------------------------------------------------------------------
SetSurfaceBTI(SurfaceIndex * surface,uint32_t btIndex)5386 CM_RT_API int32_t CmKernelRT::SetSurfaceBTI(SurfaceIndex* surface, uint32_t btIndex)
5387 {
5388
5389 uint32_t width, height, bytesPerPixel;
5390 CM_SURFACE_FORMAT format = CM_SURFACE_FORMAT_INVALID;
5391 //Sanity check
5392 if (surface == nullptr)
5393 {
5394 CM_ASSERTMESSAGE("Error: Pointer to surface is null.");
5395 return CM_NULL_POINTER;
5396 }
5397 if (!m_surfaceMgr->IsValidSurfaceIndex(btIndex))
5398 {
5399 CM_ASSERTMESSAGE("Error: Invalid binding table index.");
5400 return CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX;
5401 }
5402
5403 //Sanity check: if the BTI has been used once enqueue
5404 uint32_t i = 0;
5405 for (i = 0; i < m_usKernelPayloadSurfaceCount; i++)
5406 {
5407 if (m_IndirectSurfaceInfoArray[i].bindingTableIndex == (uint16_t)btIndex)
5408 {
5409 CM_ASSERTMESSAGE("Error: Binding table index has been used once enqueue.");
5410 return CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX;
5411 }
5412 }
5413
5414 uint32_t index = surface->get_data();
5415 uint32_t handle = 0;
5416
5417 CmSurface* surfaceRT = nullptr;
5418 m_surfaceMgr->GetSurface( index, surfaceRT );
5419 if(surfaceRT == nullptr)
5420 {
5421 CM_ASSERTMESSAGE("Error: Invalid surface.");
5422 return CM_NULL_POINTER;
5423 }
5424
5425 CmSurface2DRT* surf2D = nullptr;
5426 uint32_t indirectSurfInfoEntry = 0;
5427 if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACE2D )
5428 {
5429 surf2D = static_cast< CmSurface2DRT* >( surfaceRT );
5430 surf2D->GetHandle( handle );
5431 indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_2D, handle, btIndex);
5432 if (indirectSurfInfoEntry == CM_FAILURE)
5433 {
5434 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5435 return CM_FAILURE;
5436 }
5437 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_2D;
5438 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5439 surf2D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5440 }
5441 else
5442 {
5443 CmBuffer_RT* cmBuffer = nullptr;
5444 if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMBUFFER_RT )
5445 {
5446 cmBuffer = static_cast< CmBuffer_RT* >( surfaceRT );
5447 cmBuffer->GetHandle( handle );
5448 indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_1D, handle, btIndex);
5449 if (indirectSurfInfoEntry == CM_FAILURE)
5450 {
5451 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5452 return CM_FAILURE;
5453 }
5454 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_1D;
5455 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5456 }
5457 else
5458 {
5459 CmSurface2DUPRT* surf2DUP = nullptr;
5460 if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACE2DUP )
5461 {
5462 surf2DUP = static_cast< CmSurface2DUPRT* >( surfaceRT );
5463 surf2DUP->GetHandle( handle );
5464 indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_2D_UP, handle, btIndex);
5465 if (indirectSurfInfoEntry == CM_FAILURE)
5466 {
5467 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5468 return CM_FAILURE;
5469 }
5470 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_2D_UP;
5471 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5472 surf2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);
5473 }
5474 else
5475 {
5476 CmSurfaceSampler* surfSampler = nullptr;
5477 if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER )
5478 {
5479 surfSampler = static_cast< CmSurfaceSampler* >(surfaceRT);
5480
5481 //Get actually SurfaceIndex ID for 2D
5482 uint16_t surfIndexForCurrent = 0;
5483 surfSampler->GetCmIndexCurrent(surfIndexForCurrent);
5484 CmSurface* surfSampRT= nullptr;
5485 m_surfaceMgr->GetSurface(surfIndexForCurrent, surfSampRT);
5486 if(surfSampRT == nullptr)
5487 {
5488 CM_ASSERTMESSAGE("Error: Invalid surface.");
5489 return CM_NULL_POINTER;
5490 }
5491
5492 SAMPLER_SURFACE_TYPE surfaceType;
5493 surfSampler->GetSurfaceType(surfaceType);
5494 surfSampler->GetHandle( handle );
5495 if ( surfaceType == SAMPLER_SURFACE_TYPE_2D )
5496 {
5497 CmSurface2DRT* surfSamp2D = nullptr;
5498 surfSamp2D = static_cast<CmSurface2DRT*>(surfSampRT);
5499 surfSamp2D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5500
5501 indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER, handle, btIndex);
5502 if (indirectSurfInfoEntry == CM_FAILURE)
5503 {
5504 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5505 return CM_FAILURE;
5506 }
5507 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER;
5508 }
5509 else if ( surfaceType == SAMPLER_SURFACE_TYPE_2DUP )
5510 {
5511 CmSurface2DUPRT* surfSamp2DUP = nullptr;
5512 surfSamp2DUP = static_cast<CmSurface2DUPRT*>(surfSampRT);
5513 surfSamp2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);
5514
5515 indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE2DUP_SAMPLER, handle, btIndex);
5516 if (indirectSurfInfoEntry == CM_FAILURE)
5517 {
5518 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5519 return CM_FAILURE;
5520 }
5521 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE2DUP_SAMPLER;
5522 }
5523 else if ( surfaceType == SAMPLER_SURFACE_TYPE_3D )
5524 {
5525 indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_3D, handle, btIndex);
5526 if (indirectSurfInfoEntry == CM_FAILURE)
5527 {
5528 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5529 return CM_FAILURE;
5530 }
5531 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_3D;
5532 }
5533 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5534 }
5535 else
5536 {
5537 CmSurfaceSampler8x8* surfSampler8x8 = nullptr;
5538 if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8 )
5539 {
5540 surfSampler8x8 = static_cast< CmSurfaceSampler8x8* >( surfaceRT );
5541 surfSampler8x8->GetIndexCurrent( handle );
5542
5543 //Get actually SurfaceIndex ID for 2D
5544 uint16_t surfIndexForCurrent = 0;
5545 surfSampler8x8->GetCmIndex(surfIndexForCurrent);
5546 CmSurface* surfSamp8x8RT = nullptr;
5547 m_surfaceMgr->GetSurface(surfIndexForCurrent, surfSamp8x8RT);
5548 if(surfSamp8x8RT == nullptr)
5549 {
5550 CM_ASSERTMESSAGE("Error: Invalid surface.");
5551 return CM_NULL_POINTER;
5552 }
5553
5554 CmSurface2DRT* surfSamp8x82D = nullptr;
5555 surfSamp8x82D = static_cast<CmSurface2DRT*>(surfSamp8x8RT);
5556 surfSamp8x82D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5557
5558 if ( surfSampler8x8->GetSampler8x8SurfaceType() == CM_AVS_SURFACE )
5559 {
5560 indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER8X8_AVS, handle, btIndex);
5561 if (indirectSurfInfoEntry == CM_FAILURE)
5562 {
5563 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5564 return CM_FAILURE;
5565 }
5566 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
5567 }
5568 else if ( surfSampler8x8->GetSampler8x8SurfaceType() == CM_VA_SURFACE )
5569 {
5570 indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER8X8_VA, handle, btIndex);
5571 if (indirectSurfInfoEntry == CM_FAILURE)
5572 {
5573 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5574 return CM_FAILURE;
5575 }
5576 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
5577 }
5578 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5579 }
5580 else
5581 {
5582 return CM_FAILURE;
5583 }
5584 }
5585 }
5586 }
5587 }
5588
5589 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].bindingTableIndex = (uint16_t)btIndex;
5590 if (SetSurfBTINumForIndirectData(format, surfaceRT->Type())== 0)
5591 {
5592 CM_ASSERTMESSAGE("Error: Set surface binding table index count failure.");
5593 return CM_FAILURE;
5594 }
5595 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].numBTIPerSurf = (uint16_t)SetSurfBTINumForIndirectData(format, surfaceRT->Type());
5596
5597 //Copy it to surface index array
5598
5599 m_pKernelPayloadSurfaceArray[indirectSurfInfoEntry] = surface;
5600
5601
5602 // count is actally one larger than the actual index
5603 m_usKernelPayloadSurfaceCount = indirectSurfInfoEntry + 1;
5604 m_dirty |= (CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY | CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY);
5605 return CM_SUCCESS;
5606 }
5607
GetKernelIndex()5608 uint32_t CmKernelRT::GetKernelIndex()
5609 {
5610 return m_kernelIndex;
5611 }
GetKernelGenxBinarySize(void)5612 uint32_t CmKernelRT::GetKernelGenxBinarySize(void)
5613 {
5614 if(m_kernelInfo == nullptr)
5615 {
5616 CM_ASSERTMESSAGE("Error: Invalid kernel genx binary size.");
5617 return 0;
5618 }
5619 else
5620 {
5621 return m_kernelInfo->genxBinarySize;
5622 }
5623 }
5624
5625 //-----------------------------------------------------------------------------------------------------------------
5626 //! Map Surface type to Kernel arg Kind.
5627 //! INPUT: Surface type :CM_ENUM_CLASS_TYPE
5628 //! OUTPUT: Kernel arg Kind :CM_ARG_KIND
5629 //-----------------------------------------------------------------------------------------------------------------
SurfTypeToArgKind(CM_ENUM_CLASS_TYPE surfType)5630 CM_ARG_KIND CmKernelRT::SurfTypeToArgKind(CM_ENUM_CLASS_TYPE surfType)
5631 {
5632 switch(surfType)
5633 {
5634 case CM_ENUM_CLASS_TYPE_CMBUFFER_RT :return ARG_KIND_SURFACE_1D;
5635 case CM_ENUM_CLASS_TYPE_CMSURFACE2D :return ARG_KIND_SURFACE_2D;
5636 case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP :return ARG_KIND_SURFACE_2D_UP;
5637 case CM_ENUM_CLASS_TYPE_CMSURFACE3D :return ARG_KIND_SURFACE_3D;
5638 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER :return ARG_KIND_SURFACE_SAMPLER;
5639 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8 :return ARG_KIND_SURFACE_SAMPLER8X8_AVS;
5640 case CM_ENUM_CLASS_TYPE_CMSURFACEVME :return ARG_KIND_SURFACE_VME;
5641 case CM_ENUM_CLASS_TYPE_CMSAMPLER_RT :return ARG_KIND_SAMPLER;
5642 case CM_ENUM_CLASS_TYPE_CMSAMPLER8X8STATE_RT :return ARG_KIND_SAMPLER;
5643 case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER :return ARG_KIND_STATE_BUFFER;
5644
5645 default:
5646 CM_ASSERTMESSAGE("Error: Invalid surface type.");
5647 break;
5648 }
5649 return ARG_KIND_GENERAL;
5650 }
5651
CalculateKernelSurfacesNum(uint32_t & kernelSurfaceNum,uint32_t & neededBTEntryNum)5652 int32_t CmKernelRT::CalculateKernelSurfacesNum(uint32_t& kernelSurfaceNum, uint32_t& neededBTEntryNum)
5653 {
5654 uint32_t surfaceArraySize = 0;
5655 CmSurface* surf = nullptr;
5656 CmSurface2DRT* surf2D = nullptr;
5657 CmSurface2DUPRT* surf2DUP = nullptr;
5658 uint32_t width, height, bytesPerPixel;
5659 CM_SURFACE_FORMAT format;
5660 uint32_t maxBTIndex = 0;
5661
5662 kernelSurfaceNum = 0;
5663 neededBTEntryNum = 0;
5664
5665 surfaceArraySize = m_surfaceMgr->GetSurfacePoolSize();
5666
5667 //Calculate surface number and needed binding table entries
5668 for (uint32_t surfIndex = 0; surfIndex <= m_maxSurfaceIndexAllocated; surfIndex ++)
5669 {
5670 if (m_surfaceArray[surfIndex%surfaceArraySize])
5671 {
5672 surf = nullptr;
5673 m_surfaceMgr->GetSurface(surfIndex, surf);
5674 if (surf)
5675 {
5676 switch(surf->Type())
5677 {
5678 case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
5679 case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
5680 kernelSurfaceNum ++;
5681 neededBTEntryNum ++;
5682 break;
5683
5684 case CM_ENUM_CLASS_TYPE_CMSURFACEVME:
5685 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
5686 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
5687 //virtual surface, no need increase count
5688 break;
5689
5690 case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
5691 kernelSurfaceNum++;
5692 surf2D = static_cast<CmSurface2DRT*>(surf);
5693 format = CM_SURFACE_FORMAT_INVALID;
5694 surf2D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5695 if ((format == CM_SURFACE_FORMAT_NV12) ||
5696 (format == CM_SURFACE_FORMAT_P010) ||
5697 (format == CM_SURFACE_FORMAT_P208) ||
5698 (format == CM_SURFACE_FORMAT_P016))
5699 {
5700 neededBTEntryNum += 2;
5701 }
5702 else if (format == CM_SURFACE_FORMAT_422H ||
5703 format == CM_SURFACE_FORMAT_411P ||
5704 format == CM_SURFACE_FORMAT_IMC3 ||
5705 format == CM_SURFACE_FORMAT_422V ||
5706 format == CM_SURFACE_FORMAT_444P)
5707 { // 3 planes surface
5708 neededBTEntryNum += 3;
5709 }
5710 else
5711 {
5712 neededBTEntryNum += 1;
5713 }
5714 break;
5715
5716 case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
5717 kernelSurfaceNum++;
5718 surf2DUP = static_cast<CmSurface2DUPRT*>(surf);
5719 format = CM_SURFACE_FORMAT_INVALID;
5720 surf2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);
5721 if ((format == CM_SURFACE_FORMAT_NV12) ||
5722 (format == CM_SURFACE_FORMAT_P010) ||
5723 (format == CM_SURFACE_FORMAT_P208) ||
5724 (format == CM_SURFACE_FORMAT_P016))
5725 {
5726 neededBTEntryNum += 2;
5727 }
5728 else if (format == CM_SURFACE_FORMAT_422H ||
5729 format == CM_SURFACE_FORMAT_411P ||
5730 format == CM_SURFACE_FORMAT_IMC3 ||
5731 format == CM_SURFACE_FORMAT_422V ||
5732 format == CM_SURFACE_FORMAT_444P)
5733 { // 3 planes surface
5734 neededBTEntryNum += 3;
5735 }
5736 else
5737 {
5738 neededBTEntryNum += 1;
5739 }
5740 break;
5741
5742 default:
5743 break;
5744 }
5745 }
5746 }
5747 }
5748
5749 if ((maxBTIndex + 1) > neededBTEntryNum)
5750 {
5751 neededBTEntryNum = maxBTIndex + 1;
5752 }
5753
5754 //Wordaround: the calculation maybe not accurate if the VME surfaces are existed
5755 neededBTEntryNum += m_vmeSurfaceCount;
5756
5757 return CM_SUCCESS;
5758 }
5759
5760 //*-----------------------------------------------------------------------------
5761 //| Purpose: Get aligned curbe size for different platforms
5762 //| Returns: Result of operation.
5763 //*-----------------------------------------------------------------------------
GetAlignedCurbeSize(uint32_t value)5764 uint32_t CmKernelRT::GetAlignedCurbeSize(uint32_t value)
5765 {
5766 uint32_t curbeAlignedSize = 0;
5767
5768 curbeAlignedSize = MOS_ALIGN_CEIL(value, RENDERHAL_CURBE_BLOCK_ALIGN);
5769 return curbeAlignedSize;
5770 }
5771
5772 #if CM_LOG_ON
Log()5773 std::string CmKernelRT::Log()
5774 {
5775
5776 std::ostringstream oss;
5777
5778 oss << " Kernel Name:" << m_kernelInfo->kernelName << std::endl
5779 << " Kernel Binary Size:" << m_kernelInfo->jitBinarySize
5780 << " Index In Task:" << m_indexInTask
5781 << " Thread Count:" << m_threadCount
5782 << " Curbe Size:" << m_sizeInCurbe
5783 << " Kernel arg Count:" << m_argCount
5784 << std::endl;
5785
5786 // Per Kernel Thread Space Log
5787 if(m_threadSpace)
5788 {
5789 oss << m_threadSpace->Log();
5790 }
5791
5792 // Per Kernel Thread Group Space Log
5793 if(m_threadGroupSpace)
5794 {
5795 oss << m_threadGroupSpace->Log();
5796 }
5797
5798 // Arguments Log
5799 for (uint32_t argIndex= 0; argIndex< m_argCount; argIndex++ )
5800 {
5801 if (m_args[argIndex].value) // filter out the implicit arguments
5802 {
5803 ArgLog(oss, argIndex, m_args[argIndex]);
5804 }
5805 }
5806
5807 return oss.str();
5808 }
5809
ArgLog(std::ostringstream & oss,uint32_t index,CM_ARG arg)5810 void CmKernelRT::ArgLog(std::ostringstream &oss, uint32_t index, CM_ARG arg)
5811 {
5812
5813 oss << "[" << index << "] th Argument"
5814 << " Type :" << arg.unitKind
5815 << " Count:" << arg.unitCount
5816 << " Size:" << arg.unitSize
5817 << " Surface Kind:" << (int)arg.surfaceKind
5818 << " OffsetInPayload:" << arg.unitOffsetInPayload
5819 << " OffsetInPayloadOrig:" << arg.unitOffsetInPayloadOrig << "";
5820
5821 CmLogger::LogDataArrayHex( oss, arg.value, arg.unitSize * arg.unitCount);
5822
5823 if (CHECK_SURFACE_TYPE(arg.unitKind,
5824 ARG_KIND_SURFACE_1D,
5825 ARG_KIND_SURFACE_2D,
5826 ARG_KIND_SURFACE_2D_UP,
5827 ARG_KIND_SURFACE_VME,
5828 ARG_KIND_SURFACE_SAMPLER,
5829 ARG_KIND_SURFACE_3D,
5830 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
5831 ARG_KIND_SURFACE_SAMPLER8X8_VA,
5832 ARG_KIND_SURFACE2DUP_SAMPLER))
5833 {
5834 uint16_t numSurfaces = arg.unitSize / sizeof(uint32_t);
5835 if (arg.unitKind == ARG_KIND_SURFACE_VME)
5836 {
5837 numSurfaces = (arg.unitSize - sizeof(CM_HAL_VME_ARG_VALUE) * arg.unitVmeArraySize) / sizeof(uint32_t) + arg.unitVmeArraySize;
5838 }
5839 for (uint16_t i = 0; i < numSurfaces; i++)
5840 {
5841 uint32_t surfaceIndex = *(uint16_t *)(arg.surfIndex + i);
5842
5843 if(surfaceIndex == CM_NULL_SURFACE)
5844 continue;
5845
5846 CmSurface *surf = nullptr;
5847 m_surfaceMgr->GetSurface(surfaceIndex, surf);
5848 if (surf == nullptr)
5849 {
5850 continue;
5851 }
5852 surf->Log(oss);
5853 }
5854 }
5855 }
5856 #endif
5857
SurfaceDump(uint32_t kernelNumber,int32_t taskId)5858 void CmKernelRT::SurfaceDump(uint32_t kernelNumber, int32_t taskId)
5859 {
5860 #if MDF_SURFACE_CONTENT_DUMP
5861 CM_ARG arg;
5862
5863 for (uint32_t argIndex = 0; argIndex< m_argCount; argIndex++)
5864 {
5865 arg = m_args[argIndex];
5866 if (CHECK_SURFACE_TYPE(arg.unitKind,
5867 ARG_KIND_SURFACE_1D,
5868 ARG_KIND_SURFACE_2D,
5869 ARG_KIND_SURFACE_2D_UP,
5870 ARG_KIND_SURFACE_VME,
5871 ARG_KIND_SURFACE_SAMPLER,
5872 ARG_KIND_SURFACE_3D,
5873 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
5874 ARG_KIND_SURFACE_SAMPLER8X8_VA,
5875 ARG_KIND_SURFACE2DUP_SAMPLER))
5876 {
5877 uint16_t numSurfaces = arg.unitSize / sizeof(uint32_t);
5878 if (arg.unitKind == ARG_KIND_SURFACE_VME)
5879 {
5880 numSurfaces = (arg.unitSize - sizeof(CM_HAL_VME_ARG_VALUE) * arg.unitVmeArraySize) / sizeof(uint32_t) + arg.unitVmeArraySize;
5881 }
5882
5883 for (uint16_t i = 0; i < numSurfaces; i++)
5884 {
5885 uint32_t surfaceIndex = *(uint16_t *)(arg.surfIndex + i);
5886 CmSurface *surf = nullptr;
5887 m_surfaceMgr->GetSurface(surfaceIndex, surf);
5888 if (surf == nullptr)
5889 {
5890 return;
5891 }
5892 surf->DumpContent(kernelNumber, m_kernelInfo->kernelName, taskId, argIndex, i);
5893 }
5894 }
5895 }
5896 #endif
5897 }
5898
//! Assigns a user-defined binding table index (BTI) to a sampler, after
//! validating that the new (samplerIndex, BTI) pair does not conflict with
//! entries already recorded in m_samplerBtiEntry.
//! Returns CM_SUCCESS, CM_NULL_POINTER on a null sampler,
//! CM_KERNELPAYLOAD_SAMPLER_INVALID_BTINDEX on an out-of-range BTI, or
//! CM_FAILURE on a conflict / table overflow.
CM_RT_API int32_t CmKernelRT::SetSamplerBTI(SamplerIndex* sampler, uint32_t nIndex)
{
    if (!sampler)
    {
        return CM_NULL_POINTER;
    }
    // BTI must be within the sampler binding range.
    if (CM_SAMPLER_MAX_BINDING_INDEX < nIndex)
    {
        return CM_KERNELPAYLOAD_SAMPLER_INVALID_BTINDEX;
    }

    uint32_t samplerIndex = sampler->get_data();
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;

    // Scan existing entries: stop early if this exact (index, BTI) pair is
    // already recorded; otherwise check each entry for conflicts.
    uint32_t i = 0;
    for (i = 0; i < m_samplerBtiCount; i++)
    {
        if ((m_samplerBtiEntry[i].samplerIndex == samplerIndex) && (m_samplerBtiEntry[i].samplerBTI == nIndex))
        {
            break;
        }
        // Conflict checks only apply while the sampler BTI data is dirty
        // (i.e. entries were added since the last kernel-data build).
        if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
        {
            // A different sampler already claims this BTI.
            if ((m_samplerBtiEntry[i].samplerIndex != samplerIndex) && (m_samplerBtiEntry[i].samplerBTI == nIndex))
            {
                if (cmHalState->useNewSamplerHeap)
                {
                    SamplerParam sampler1 = {};
                    SamplerParam sampler2 = {};
                    cmHalState->cmHalInterface->GetSamplerParamInfoForSamplerType(&cmHalState->samplerTable[m_samplerBtiEntry[i].samplerIndex], sampler1);
                    cmHalState->cmHalInterface->GetSamplerParamInfoForSamplerType(&cmHalState->samplerTable[samplerIndex], sampler2);

                    if (sampler1.elementType== sampler2.elementType)
                    {
                        // return failure only if the two samplers have the same type, because different type samplers are able to set to the same BTI
                        return CM_FAILURE;
                    }
                }
                else
                {
                    return CM_FAILURE;
                }
            }

            // AVS sampler8x8 states may need adjacent sampler indices in HW;
            // in that case a different sampler at BTI +/- 1 is also a conflict.
            CmSampler8x8State_RT *sampler8x8 = nullptr;
            CmSampler8x8State_RT *tmpSampler8x8 = nullptr;
            m_device->GetSampler8x8(samplerIndex, sampler8x8);
            m_device->GetSampler8x8(m_samplerBtiEntry[i].samplerIndex, tmpSampler8x8);

            if (sampler8x8 && tmpSampler8x8 && (sampler8x8->GetStateType() == CM_SAMPLER8X8_AVS)
                && (tmpSampler8x8->GetStateType() == CM_SAMPLER8X8_AVS) &&
                cmHalState->cmHalInterface->IsAdjacentSamplerIndexRequiredbyHw())
            {
                if ((m_samplerBtiEntry[i].samplerIndex != samplerIndex) &&
                    ((m_samplerBtiEntry[i].samplerBTI == nIndex + 1) || (m_samplerBtiEntry[i].samplerBTI == nIndex - 1)))
                    return CM_FAILURE;
            }
        }
    }

    if (i >= CM_MAX_SAMPLER_TABLE_SIZE)
    {
        CM_ASSERTMESSAGE("Error: Exceed maximum sampler table size.");
        return CM_FAILURE;
    }

    // i == count means no duplicate was found above: append a new entry and
    // mark the sampler BTI data dirty so kernel data gets rebuilt.
    if (i == m_samplerBtiCount)
    {
        m_samplerBtiEntry[i].samplerIndex = samplerIndex;
        m_samplerBtiEntry[i].samplerBTI = nIndex;

        m_samplerBtiCount = i + 1;

        m_dirty |= cMKERNELDATASAMPLERBTIDIRTY;
    }
    return CM_SUCCESS;
}
5976
GetBinary(std::vector<char> & binary)5977 CMRT_UMD_API int32_t CmKernelRT::GetBinary(std::vector<char>& binary)
5978 {
5979 binary.resize(m_binarySize);
5980
5981 CmSafeMemCopy((void *)&binary[0], (void *)m_binary, m_binarySize);
5982
5983 return CM_SUCCESS;
5984 }
5985
ReplaceBinary(std::vector<char> & binary)5986 CMRT_UMD_API int32_t CmKernelRT::ReplaceBinary(std::vector<char>& binary)
5987 {
5988 uint32_t size = binary.size();
5989
5990 if (size == 0)
5991 {
5992 return CM_INVALID_ARG_VALUE;
5993 }
5994
5995 if(m_binaryOrig == nullptr)
5996 {
5997 //Store the orignal binary once.
5998 m_binaryOrig = m_binary;
5999 m_binarySizeOrig = m_binarySize;
6000 }
6001
6002 m_binary = MOS_NewArray(char, size);
6003 CmSafeMemCopy((void *)m_binary, (void *)&binary[0], size);
6004
6005 m_binarySize = size;
6006
6007 return CM_SUCCESS;
6008 }
6009
ResetBinary()6010 CMRT_UMD_API int32_t CmKernelRT::ResetBinary()
6011 {
6012 if (m_binaryOrig == nullptr)
6013 {
6014 //ReplaceBinary is never called
6015 return CM_SUCCESS;
6016 }
6017 if(m_binary!= m_binaryOrig)
6018 {
6019 MosSafeDeleteArray(m_binary);
6020 }
6021 m_binary = m_binaryOrig;
6022 m_binarySize = m_binarySizeOrig;
6023
6024 return CM_SUCCESS;
6025 }
6026
//! Builds the kernel's sampler heap layout (new-sampler-heap platforms only).
//! Pass 1 inserts samplers with user-defined BTIs, sorted by element type then
//! BTI, validating the required spacing between them. Pass 2 walks all kernel
//! args, finds regular samplers, and packs them into the remaining heap slots
//! without disturbing the user-defined placements.
//! Returns CM_SUCCESS, or MOS_STATUS_INVALID_PARAMETER on a BTI conflict.
int CmKernelRT::UpdateSamplerHeap(CmKernelData *kernelData)
{
    // Get sampler bti & offset
    PCM_HAL_KERNEL_PARAM cmKernel = nullptr;
    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    PCM_HAL_STATE state = cmData->cmHalState;
    std::list<SamplerParam>::iterator iter;
    unsigned int heapOffset = 0;

    // Nothing to do on platforms using the legacy sampler heap.
    if (state->useNewSamplerHeap == false)
    {
        return CM_SUCCESS;
    }

    heapOffset = 0;
    cmKernel = kernelData->GetHalCmKernelData();
    std::list<SamplerParam> *sampler_heap = cmKernel->samplerHeap;

    // First pass, inserts sampler with user-defined BTI to the list. Sorts by element order low to high, then by BTI order low to high.
    for (unsigned int samplerElementType = MHW_Sampler1Element; samplerElementType < MHW_SamplerTotalElements; samplerElementType++)
    {
        for (unsigned int n = 0; n < cmKernel->samplerBTIParam.samplerCount; ++n)
        {
            SamplerParam sampler = {};
            sampler.samplerTableIndex = cmKernel->samplerBTIParam.samplerInfo[n].samplerIndex;

            if (state->samplerTable[sampler.samplerTableIndex].ElementType == samplerElementType)
            {
                sampler.bti = cmKernel->samplerBTIParam.samplerInfo[n].samplerBTI;
                sampler.userDefinedBti = true;
                state->cmHalInterface->GetSamplerParamInfoForSamplerType(&state->samplerTable[sampler.samplerTableIndex], sampler);

                // Guarantees each user-defined BTI has a spacing between each other user-defined BTIs larger than the stepping
                for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
                {
                    if (iter->elementType == sampler.elementType)
                    {
                        unsigned int diff = (iter->bti > sampler.bti) ? (iter->bti - sampler.bti) : (sampler.bti - iter->bti);
                        if (diff < sampler.btiStepping)
                        {
                            CM_ASSERTMESSAGE("Sampler BTI setting error. Confliction with other Sampler BTI.\n");
                            return MOS_STATUS_INVALID_PARAMETER;
                        }
                    }
                }

                // Inserts by the order
                for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
                {
                    if (iter->elementType > sampler.elementType)
                    {
                        break;
                    }
                    else if ((iter->elementType == sampler.elementType) && (iter->bti > sampler.bti))
                    {
                        break;
                    }
                }
                // Heap offset is fully determined by the user-chosen BTI.
                sampler.heapOffset = sampler.bti * sampler.btiMultiplier;
                sampler_heap->insert(iter, sampler);
            }
        }
    }

    // Second pass, loops over all kernel/thread args, find regular sampler and insert to sampler heap.
    // Follows the existing sorted order.
    for (unsigned int samplerElementType = MHW_Sampler1Element; samplerElementType < MHW_SamplerTotalElements; samplerElementType++)
    {
        for (unsigned int index = 0; index < cmKernel->numArgs; index++)
        {
            PCM_HAL_KERNEL_ARG_PARAM argParam = &cmKernel->argParams[index];
            if (argParam->isNull)
            {
                continue;
            }

            for (unsigned int threadIndex = 0; threadIndex < argParam->unitCount; threadIndex++)
            {
                if (argParam->kind == CM_ARGUMENT_SAMPLER)
                {
                    // The arg value holds the sampler table index for this thread.
                    unsigned char *arg = argParam->firstValue + (threadIndex * argParam->unitSize);
                    unsigned int samplerTableIndex = *((uint32_t *)arg);

                    SamplerParam sampler = {};
                    sampler.samplerTableIndex = samplerTableIndex;
                    state->cmHalInterface->GetSamplerParamInfoForSamplerType(&state->samplerTable[sampler.samplerTableIndex], sampler);
                    sampler.regularBti = true;

                    if (sampler.elementType != samplerElementType)
                    {
                        continue;
                    }

                    // if the sampler is already in the heap, skip
                    bool isDuplicate = false;
                    for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
                    {
                        if (iter->samplerTableIndex == sampler.samplerTableIndex)
                        {
                            isDuplicate = true;
                            iter->regularBti = true;
                            break;
                        }
                    }
                    if (isDuplicate == true)
                    {
                        continue;
                    }

                    // insert the new sampler to the heap
                    heapOffset = 0;
                    for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
                    {
                        if (iter->elementType == sampler.elementType)
                        {
                            // Needs to keep the inserted sampler's correctness, so do not insert before same element regular sampler
                            // Only insert before user-defined BTI
                            if (iter->userDefinedBti == true)
                            {
                                unsigned int curOffset = iter->heapOffset;
                                if (heapOffset > curOffset)
                                {
                                    // Conflict: the sampler heap for a smaller
                                    // element type has already passed the slot
                                    // reserved for this user-defined BTI
                                    // sampler. User needs to set the BTI to a
                                    // larger value.
                                    CM_ASSERTMESSAGE("Sampler BTI setting error. Confliction with other Sampler BTI.\n");
                                    return MOS_STATUS_INVALID_PARAMETER;
                                }
                                else
                                {
                                    if (curOffset - heapOffset >= sampler.btiStepping * sampler.btiMultiplier)
                                    {
                                        // Enough room before this user-defined
                                        // sampler: place the new one here.
                                        break;
                                    }
                                    else
                                    {
                                        heapOffset = curOffset + iter->btiStepping * iter->btiMultiplier;
                                    }
                                }
                            }
                            else
                            {
                                heapOffset += iter->btiStepping * iter->btiMultiplier;
                            }
                        }
                        else if (iter->elementType > sampler.elementType)
                        {
                            break;
                        }
                        else
                        {
                            heapOffset = iter->heapOffset + iter->size;
                            std::list<SamplerParam>::iterator iter_next = std::next(iter, 1);
                            if ((iter_next != sampler_heap->end()) && (iter_next->elementType > iter->elementType))
                            {
                                // Aligns heapOffset to next nearest multiple of sampler size if next sampler is a different element type
                                heapOffset = (heapOffset + iter_next->btiStepping * iter_next->btiMultiplier - 1) / (iter_next->btiStepping * iter_next->btiMultiplier) * (iter_next->btiStepping * iter_next->btiMultiplier);
                            }
                        }
                    }

                    if (iter == sampler_heap->end())
                    {
                        // Aligns heapOffset to next nearest multiple of sampler size if next sampler is a different element type
                        heapOffset = (heapOffset + sampler.btiStepping * sampler.btiMultiplier - 1) / (sampler.btiStepping * sampler.btiMultiplier) * (sampler.btiStepping * sampler.btiMultiplier);
                    }
                    sampler.heapOffset = heapOffset;

                    if (sampler.btiMultiplier != 0)
                    {
                        sampler.bti = sampler.heapOffset / sampler.btiMultiplier;
                    }
                    else
                    {
                        CM_ASSERTMESSAGE("Sampler BTI setting error. Multiplier cannot be zero!\n");
                        return MOS_STATUS_INVALID_PARAMETER;
                    }
                    sampler_heap->insert(iter, sampler);
                }
            }
        }
    }

    return CM_SUCCESS;
}
6213 }
6214