1 /*
2 * Copyright (c) 2007-2017, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 //!
23 //! \file      cm_kernel_rt.cpp
24 //! \brief     Contains CmKernelRT definitions.
25 //!
26 
27 #include "cm_kernel_rt.h"
28 
29 #include "cm_program.h"
30 #include "cm_device_rt.h"
31 #include "cm_surface_manager.h"
32 #include "cm_surface_2d_up_rt.h"
33 #include "cm_surface_3d_rt.h"
34 #include "cm_buffer_rt.h"
35 #include "cm_mov_inst.h"
36 #include "cm_kernel_data.h"
37 #include "cm_thread_space_rt.h"
38 #include "cm_state_buffer.h"
39 #include "cm_surface_vme.h"
40 #include "cm_debug.h"
41 #include "cm_surface_sampler8x8.h"
42 #include "cm_surface_sampler.h"
43 #include "cm_group_space.h"
44 #include "cm_surface_2d_rt.h"
45 #include "cm_sampler8x8_state_rt.h"
46 #include "cm_visa.h"
47 #include "cm_extension_creator.h"
48 #include "cm_execution_adv.h"
49 
// NOTE(review): feature switch with no value; the code that tests it is not
// visible in this part of the file.
50 #define GENERATE_GLOBAL_SURFACE_INDEX
51 
// Reads one value of `type` from the byte buffer `buf` at index `bytePosition`,
// stores it in `dst`, and advances `bytePosition` by sizeof(type).
// Relies on variables named `buf` and `bytePosition` being in scope where the
// macro is expanded. Expands to two statements, so it must not be used as the
// unbraced body of an if/else/loop.
52 #define READ_FIELD_FROM_BUF( dst, type ) \
53     dst = *((type *) &buf[bytePosition]); \
54     bytePosition += sizeof(type);
55 
// Sizes expressed in DWORDs (per the names); their users are not visible in
// this part of the file.
56 #define PER_ARG_SIZE_IN_DWORD 3
57 #define KERNEL_INFO_SIZE_IN_DWORD 4
58 
// Rounds `byte_address` up, in place, to the next multiple of 4 when it is not
// already a multiple of 4. Expands to a bare `if` statement and evaluates its
// argument several times, so pass a plain lvalue.
59 #define DW_ALIGNMENT( byte_address ) \
60     if( byte_address % 4 ) \
61     byte_address = ( byte_address / 4 + 1 ) * 4;
62 
// Same as DW_ALIGNMENT, but rounds up to the next multiple of 32.
63 #define GRF_ALIGNMENT( byte_address ) \
64     if( byte_address % 32 ) \
65     byte_address = ( byte_address / 32 + 1 ) * 32;
66 
67 // To check if surface type nType is equal to the surface type list in argument ...
// Appends the -1 terminator that _CheckSurfaceType uses to find the end of the list.
68 #define CHECK_SURFACE_TYPE( nType, ... )  ( _CheckSurfaceType( nType, __VA_ARGS__, -1 ) )
69 
// True when the argument's unitCount is exactly 1.
70 #define IsKernelArg(arg)    ((arg).unitCount == 1)
71 
72 // Warning : x must be uint32_t
// Packs memCtl.mem_ctrl (bits 8+), memCtl.mem_type (bits 4+) and memCtl.age
// into a 16-bit value, shifts it into the upper 16 bits and ORs it with the
// current value of x.
73 #define SET_MEMORY_OBJECT_CONTROL(x, memCtl) \
74            x = ((uint16_t)(memCtl.mem_ctrl<< 8 | memCtl.mem_type << 4 | memCtl.age)) << 16 | (x);
75 
// Stores `value` at vmeIndexArray[vmeIndexArrayPosition] and increments the
// position. Relies on both names being in scope at the expansion site.
76 #define   ADD_INTO_VME_INDEX_ARRAY(value)     \
77     vmeIndexArray[vmeIndexArrayPosition] = value ;                 \
78     vmeIndexArrayPosition ++;
79 
// Same as ADD_INTO_VME_INDEX_ARRAY for vmeCmIndexArray/vmeCmIndexArrayPosition.
// Note the ';' right after the parameter list: the expansion starts with an
// empty statement.
80 #define   ADD_INTO_VME_CM_INDEX_ARRAY(value)  ; \
81     vmeCmIndexArray[vmeCmIndexArrayPosition] = value ;                 \
82     vmeCmIndexArrayPosition ++;
83 
// Pointer-to-CM_ARG alias used by Partition/QuickSort below.
84 typedef CM_ARG* PCM_ARG;
85 
// Bit flags combined into CmKernelRT::m_dirty to record which parts of the
// kernel data have changed. CM_KERNEL_DATA_CLEAN (0) means nothing is dirty.
#define CM_KERNEL_DATA_CLEAN                   0         // kernel data clean
#define CM_KERNEL_DATA_KERNEL_ARG_DIRTY        1         // per kernel arg dirty
#define CM_KERNEL_DATA_THREAD_ARG_DIRTY        (1 << 1)  // per thread arg dirty
#define CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY      (1 << 2)  // indirect payload data dirty
#define CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY (1 << 3)  // indirect payload data size changes
#define CM_KERNEL_DATA_GLOBAL_SURFACE_DIRTY    (1 << 4)  // global surface dirty
#define CM_KERNEL_DATA_THREAD_COUNT_DIRTY      (1 << 5)  // thread count dirty, reset() be called
#define CM_KERNEL_DATA_SAMPLER_BTI_DIRTY       (1 << 6)  // sampler bti dirty
#define CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY      (1 << 7)       // threadgroupspace dirty

// Legacy spelling of CM_KERNEL_DATA_SAMPLER_BTI_DIRTY, kept as an alias so that
// code still using the old name keeps compiling. Prefer the name above.
#define cMKERNELDATASAMPLERBTIDIRTY            CM_KERNEL_DATA_SAMPLER_BTI_DIRTY
95 
Partition(PCM_ARG * args,int32_t p,int32_t r)96 int32_t Partition( PCM_ARG* args, int32_t p, int32_t r )
97 {
98     uint16_t x = args[p]->unitOffsetInPayload;
99     int32_t i = p - 1;
100     int32_t j = r + 1;
101     while( 1 )
102     {
103         do {
104             j --;
105         } while( args[j]->unitOffsetInPayload > x );
106 
107         do {
108             i ++;
109         } while( args[i]->unitOffsetInPayload < x );
110 
111         if( i < j )
112         {
113             PCM_ARG tmpP = args[i];
114             args[i] = args[j];
115             args[j] = tmpP;
116         }
117         else
118         {
119             return j;
120         }
121     }
122 }
123 
// Variadic helper behind the CHECK_SURFACE_TYPE macro; do not call it directly.
// Compares nType against each following int argument and stops at the first
// negative one. The macro appends -1 as that terminator; a direct caller that
// forgets the terminator reads past its arguments.
// Returns true if any listed value equals nType, false otherwise.
bool _CheckSurfaceType( int nType, ... )
{
    va_list candidates;
    va_start( candidates, nType );

    bool found = false;
    for( int candidate = va_arg( candidates, int );
         candidate >= 0;
         candidate = va_arg( candidates, int ) )
    {
        if( candidate == nType )
        {
            found = true;
            break;
        }
    }

    va_end( candidates );
    return found;
}
144 
QuickSort(PCM_ARG * args,int32_t p,int32_t r)145 void QuickSort( PCM_ARG* args, int32_t p, int32_t r )
146 {
147     if( p < r )
148     {
149         int32_t q = Partition( args, p, r );
150         QuickSort( args, p, q );
151         QuickSort( args, q + 1, r );
152     }
153 }
154 
155 namespace CMRT_UMD
156 {
// Calls CmExtensionCreator<CmMovInstConstructor>::RegisterClass for
// CmMovInstConstructor while this file's statics are initialized and keeps the
// returned flag. Initialize() later obtains its mov-instruction constructor
// through CmExtensionCreator<CmMovInstConstructor>::CreateClass().
157 static bool bCmMovInstRegistered = CmExtensionCreator<CmMovInstConstructor>::RegisterClass<CmMovInstConstructor>();
158 //*-----------------------------------------------------------------------------
159 //| Purpose:   Create object for mov instructions
160 //|            instructions will be copied into DstMem
161 //*-----------------------------------------------------------------------------
ConstructObjMovs(uint32_t dstOffset,uint32_t srcOffset,uint32_t size,CmDynamicArray & movInsts,uint32_t index,bool isBdw,bool isHwDebug)162 uint32_t CmMovInstConstructor::ConstructObjMovs(uint32_t dstOffset, uint32_t srcOffset, uint32_t size, CmDynamicArray &movInsts, uint32_t index, bool isBdw, bool isHwDebug)
163 {
164     return MovInst_RT::CreateMoves(dstOffset, srcOffset, size, movInsts, index, isBdw, isHwDebug);
165 }
166 
167 //*-----------------------------------------------------------------------------
168 //| Purpose:     Create CM Kernel
169 //| Arguments :
170 //|               device        [in]    Pointer to device
171 //|               program      [in]    Pointer to cm Program
172 //|               kernelName    [in]    Name of kernel
173 //|               kernelId      [in]    Kernel's ID
174 //|               kernel       [in/out]    Reference Pointer to CM Kernel
175 //|               options       [in]    jitter, or non-jitter
176 //| Returns:    Result of the operation.
177 //*-----------------------------------------------------------------------------
Create(CmDeviceRT * device,CmProgramRT * program,const char * kernelName,uint32_t kernelIndex,uint32_t kernelSeqNum,CmKernelRT * & kernel,const char * options)178 int32_t CmKernelRT::Create(CmDeviceRT *device,
179                            CmProgramRT *program,
180                            const char *kernelName,
181                            uint32_t kernelIndex,
182                            uint32_t kernelSeqNum,
183                            CmKernelRT* &kernel,
184                            const char *options)
185 {
186     int32_t result = CM_SUCCESS;
187     CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)device->GetAccelData())->cmHalState;
188 
189     if (state && state->advExecutor)
190     {
191         kernel = state->advExecutor->CreateKernelRT(device, program, kernelIndex, kernelSeqNum);
192     }
193     else
194     {
195         kernel = new (std::nothrow) CmKernelRT(device, program, kernelIndex, kernelSeqNum);
196     }
197 
198     if( kernel )
199     {
200         kernel->Acquire();
201         result = kernel->Initialize( kernelName, options );
202         if( result != CM_SUCCESS )
203         {
204             CmKernelRT::Destroy( kernel, program);
205             return result;
206         }
207     }
208     else
209     {
210         CM_ASSERTMESSAGE("Error: Failed to create CmKernel due to out of system memory.");
211         return CM_OUT_OF_HOST_MEMORY;
212     }
213     if (options)
214     {
215         if (strcmp(options, "PredefinedGPUCopyKernel") == 0)
216         {
217             kernel->m_blCreatingGPUCopyKernel = true;
218         }
219         else
220         {
221             kernel->m_blCreatingGPUCopyKernel = false;
222         }
223     }
224 
225 #if USE_EXTENSION_CODE
226     result = kernel->InitForGTPin(device, program, kernel);
227 #endif
228 
229     return result;
230 }
231 
232 //*-----------------------------------------------------------------------------
233 //| Purpose:    Destory Kernel
234 //| Returns:    Result of the operation.
235 //*-----------------------------------------------------------------------------
Destroy(CmKernelRT * & kernel,CmProgramRT * & program)236 int32_t CmKernelRT::Destroy( CmKernelRT* &kernel, CmProgramRT *&program )
237 {
238     uint32_t refCount = kernel->SafeRelease();
239     if (refCount == 0)
240     {
241         kernel = nullptr;
242     }
243 
244     refCount = program->SafeRelease();
245     if (refCount == 0)
246     {
247         program = nullptr;
248     }
249     return CM_SUCCESS;
250 }
251 
252 //*-----------------------------------------------------------------------------
253 //| Purpose:    Acuqire Kernel: increment refcount
254 //| Returns:    Result of the operation.
255 //*-----------------------------------------------------------------------------
Acquire(void)256 int32_t CmKernelRT::Acquire( void)
257 {
258     m_refcount ++;
259     return m_refcount;
260 }
261 
262 //*-----------------------------------------------------------------------------
263 //| Purpose:    SafeRelease Kernel: Delete the instance
264 //| Returns:    Result of the operation.
265 //*-----------------------------------------------------------------------------
SafeRelease(void)266 int32_t CmKernelRT::SafeRelease( void)
267 {
268     --m_refcount;
269     if (m_refcount == 0)
270     {
271         PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
272         PCM_HAL_STATE state = cmData->cmHalState;
273         if (state->dshEnabled)
274         {
275             state->pfnDSHUnregisterKernel(state, m_id);
276         }
277         delete this;
278         return 0;
279     }
280     return m_refcount;
281 }
282 
283 //*-----------------------------------------------------------------------------
284 //| Purpose:    Kernel constructor
285 //| Returns:    Result of the operation.
286 //*-----------------------------------------------------------------------------
CmKernelRT(CmDeviceRT * device,CmProgramRT * program,uint32_t kernelIndex,uint32_t kernelSeqNum)287 CmKernelRT::CmKernelRT(CmDeviceRT *device,
288                        CmProgramRT *program,
289                        uint32_t kernelIndex,
290                        uint32_t kernelSeqNum):
291     m_device( device ),
292     m_surfaceMgr( nullptr ),
293     m_program( program ),
294     m_options( nullptr ),
295     m_binary( nullptr ),
296     m_binaryOrig(nullptr),
297     m_binarySize(0),
298     m_binarySizeOrig(0),
299     m_threadCount( 0 ),
300     m_lastThreadCount( 0 ),
301     m_sizeInCurbe( 0 ),
302     m_sizeInPayload( 0 ),
303     m_argCount( 0 ),
304     m_args( nullptr ),
305     m_kernelInfo(nullptr),
306     m_kernelIndexInProgram( CM_INVALID_KERNEL_INDEX ),
307     m_curbeEnabled( true ),
308     m_nonstallingScoreboardEnabled(false),
309     m_dirty( CM_KERNEL_DATA_CLEAN ),
310     m_lastKernelData( nullptr ),
311     m_lastKernelDataSize( 0 ),
312     m_indexInTask(0),
313     m_threadSpaceAssociated(false),
314     m_perThreadArgExists(false),
315     m_perKernelArgExists( false ),
316     m_threadSpace( nullptr ),
317     m_adjustScoreboardY( 0 ),
318     m_lastAdjustScoreboardY( 0 ),
319     m_blCreatingGPUCopyKernel( false),
320     m_usKernelPayloadDataSize( 0 ),
321     m_kernelPayloadData( nullptr ),
322     m_usKernelPayloadSurfaceCount( 0 ),
323     m_samplerBtiCount( 0 ),
324     m_refcount(0),
325     m_halMaxValues( nullptr ),
326     m_halMaxValuesEx( nullptr ),
327     m_surfaceArray(nullptr),
328     m_threadGroupSpace( nullptr ),
329     m_vmeSurfaceCount( 0 ),
330     m_maxSurfaceIndexAllocated(0),
331     m_barrierMode(CM_LOCAL_BARRIER),
332     m_isClonedKernel(false),
333     m_cloneKernelID(0),
334     m_hasClones( false ),
335     m_stateBufferBounded( CM_STATE_BUFFER_NONE ),
336     m_movInstConstructor(nullptr)
337 {
338     program->Acquire();
339     m_program = program;
340 
341     device->GetSurfaceManager(m_surfaceMgr);
342 
343     m_id = kernelSeqNum; // Unique number for each kernel. This ID is used in Batch buffer.
344     m_id <<= 32;
345     m_kernelIndex = kernelIndex;
346 
347     for (int i = 0; i < CM_GLOBAL_SURFACE_NUMBER; i++)
348     {
349         m_globalSurfaces[i] = nullptr;
350         m_globalCmIndex[i] = 0;
351     }
352 
353     m_blhwDebugEnable = program->IsHwDebugEnabled();
354 
355     CmSafeMemSet(m_pKernelPayloadSurfaceArray, 0, sizeof(m_pKernelPayloadSurfaceArray));
356     CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, sizeof(m_IndirectSurfaceInfoArray));
357     CmSafeMemSet( m_samplerBtiEntry, 0, sizeof( m_samplerBtiEntry ) );
358 
359     if (m_samplerBtiCount > 0)
360     {
361         CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
362         m_samplerBtiCount = 0;
363     }
364 
365     ResetKernelSurfaces();
366 }
367 
368 //*-----------------------------------------------------------------------------
369 //| Purpose:    Destructor of Class CmKernel
370 //| Returns:    None.
371 //*-----------------------------------------------------------------------------
~CmKernelRT(void)372 CmKernelRT::~CmKernelRT( void )
373 {
374     MosSafeDeleteArray(m_options);
375 
376     DestroyArgs();
377 
378     if(m_lastKernelData)
379     {
380         CmKernelData::Destroy( m_lastKernelData );
381     }
382 
383     if( m_device->CheckGTPinEnabled() && !m_blCreatingGPUCopyKernel)
384     {
385         MosSafeDeleteArray(m_binary);
386     }
387 
388     if( CM_INVALID_KERNEL_INDEX != m_kernelIndexInProgram )
389     {
390         m_program->ReleaseKernelInfo(m_kernelIndexInProgram);
391     }
392 
393     for(int i=0; i< CM_GLOBAL_SURFACE_NUMBER; i++)
394     {
395         SurfaceIndex *surfIndex = m_globalSurfaces[i];
396         MosSafeDelete(surfIndex);
397     }
398 
399     MosSafeDeleteArray(m_kernelPayloadData);
400     MosSafeDeleteArray(m_surfaceArray);
401     MosSafeDelete(m_movInstConstructor);
402 }
403 
404 //*-----------------------------------------------------------------------------
405 //| Purpose:    Initialize CM kernel
406 //| Returns:    Result of the operation.
407 //*-----------------------------------------------------------------------------
Initialize(const char * kernelName,const char * options)408 int32_t CmKernelRT::Initialize( const char* kernelName, const char* options )
409 {
410     if( kernelName == nullptr )
411     {
412         CM_ASSERTMESSAGE("Error: Kernel name is null.");
413         return CM_NULL_POINTER;
414     }
415 
416     size_t length = strnlen( kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE );
417     if( length >= CM_MAX_KERNEL_NAME_SIZE_IN_BYTE  )
418     {
419         CM_ASSERTMESSAGE("Error: Kernel name size is too long.");
420         return CM_FAILURE;
421     }
422 
423     uint32_t kernelCount = 0;
424     m_program->GetKernelCount( kernelCount );
425 
426     CM_KERNEL_INFO* kernelInfo = nullptr;
427     uint32_t i = 0;
428     for( i = 0; i < kernelCount; i ++ )
429     {
430         m_program->GetKernelInfo( i, kernelInfo );
431         if( !kernelInfo )
432         {
433             CM_ASSERTMESSAGE("Error: Invalid kernel info.");
434             return CM_NULL_POINTER;
435         }
436         if( strcmp( kernelInfo->kernelName, kernelName ) == 0 )
437         {
438             break;
439         }
440     }
441 
442     if( i == kernelCount )
443     {
444         CM_ASSERTMESSAGE("Error: Invalid kernel count.");
445         return CM_FAILURE;
446     }
447 
448     m_device->GetHalMaxValues( m_halMaxValues, m_halMaxValuesEx);
449 
450     m_program->AcquireKernelInfo(i);
451     m_kernelInfo = kernelInfo;
452     m_kernelIndexInProgram = i;
453 
454     if( options )
455     {
456         size_t length = strnlen( options, CM_MAX_OPTION_SIZE_IN_BYTE );
457         if(length >= CM_MAX_OPTION_SIZE_IN_BYTE)
458         {
459             CM_ASSERTMESSAGE("Error: Option string is too long.");
460             return CM_INVALID_ARG_VALUE;
461         }
462         else
463         {
464             m_options = MOS_NewArray(char, (length+1));
465             if( !m_options )
466             {
467                 CM_ASSERTMESSAGE("Error: Out of system memory.");
468                 return CM_OUT_OF_HOST_MEMORY;
469 
470             }
471             CmSafeMemCopy( m_options, options, length);
472             m_options[ length ] = '\0';
473 
474             char* tmp = strstr( m_options, "nocurbe" );
475             if( tmp )
476             {
477                 m_curbeEnabled = false;
478             }
479         }
480     }
481 
482     m_nonstallingScoreboardEnabled = true;
483 
484     void* commonISACode = nullptr;
485     uint32_t commonISACodeSize = 0;
486     m_program->GetCommonISACode(commonISACode, commonISACodeSize);
487     if ((commonISACode == nullptr) || (commonISACodeSize <= 0))
488     {
489         CM_ASSERTMESSAGE("Error: Invalid VISA.");
490         return CM_INVALID_COMMON_ISA;
491     }
492 
493     bool useVisaApi = true;
494     vISA::ISAfile *isaFile = nullptr;
495     vISA::KernelBody *kernelBody = nullptr;
496 
497     auto getVersionAsInt = [](int major, int minor) { return major * 100 + minor; };
498     if (getVersionAsInt(m_program->m_cisaMajorVersion, m_program->m_cisaMinorVersion) < getVersionAsInt(3, 2))
499     {
500         useVisaApi = false;
501     }
502     else
503     {
504         isaFile = m_program->getISAfile();
505         if (!isaFile)
506         {
507             CM_ASSERTMESSAGE("Error: Invalid VISA.");
508             return CM_INVALID_COMMON_ISA;
509         }
510         kernelBody = isaFile->getKernelsData().at(m_kernelIndexInProgram);
511     }
512 
513     uint8_t *buf = (uint8_t*)commonISACode;
514     uint32_t bytePosition = m_kernelInfo->kernelIsaOffset;
515 
516     uint32_t kernelInfoRefCount = 0;
517     m_program->GetKernelInfoRefCount(m_kernelIndexInProgram, kernelInfoRefCount);
518     if (kernelInfoRefCount <= 2)    //Only read for 1st time Kernel creation, later we reuse them
519     {
520         if (useVisaApi)
521         {
522             m_kernelInfo->globalStringCount = kernelBody->getStringCount();
523         }
524         {
525             READ_FIELD_FROM_BUF(m_kernelInfo->globalStringCount, unsigned short);
526         }
527 
528         m_kernelInfo->globalStrings = (const char**) malloc( m_kernelInfo->globalStringCount * sizeof(char*) );
529         if(m_kernelInfo->globalStrings  == nullptr)
530         {
531             CM_ASSERTMESSAGE("Error: Out of system memory.");
532             return CM_OUT_OF_HOST_MEMORY;
533         }
534         CmSafeMemSet(m_kernelInfo->globalStrings, 0, m_kernelInfo->globalStringCount * sizeof(char*) );
535 
536         if (useVisaApi)
537         {
538             int i = 0;
539             for (vISA::StringPool *globalString : kernelBody->getStringPool())
540             {
541                 size_t stringLength = std::strlen(globalString->getString());
542                 char *string = (char*)malloc(stringLength + 1);
543                 if (string == nullptr)
544                 {
545                     CM_ASSERTMESSAGE("Error: Out of system memory.");
546                     return CM_OUT_OF_HOST_MEMORY;
547                 }
548                 CmSafeMemCopy(string, globalString->getString(), stringLength);
549                 string[stringLength] = '\0';
550                 m_kernelInfo->globalStrings[i] = string;
551                 i++;
552             }
553         }
554         else
555         {
556             for (int i = 0; i < (int)m_kernelInfo->globalStringCount; i++)
557             {
558                 char* string = (char*)malloc(CM_MAX_KERNEL_STRING_IN_BYTE + 1);
559                 if (string == nullptr)
560                 {
561                     CM_ASSERTMESSAGE("Error: Out of system memory.");
562                     return CM_OUT_OF_HOST_MEMORY;
563                 }
564                 int j = 0;
565                 while (buf[bytePosition] != '\0' && j < CM_MAX_KERNEL_STRING_IN_BYTE) {
566                     string[j++] = buf[bytePosition++];
567                 }
568                 string[j] = '\0';
569                 bytePosition++;
570                 m_kernelInfo->globalStrings[i] = string;
571             }
572         }
573     }
574 
575     uint32_t count = 0;
576     if (useVisaApi)
577     {
578         count = kernelBody->getNumInputs();
579     }
580     else
581     {
582         bytePosition = m_kernelInfo->inputCountOffset;
583 
584         uint8_t countTemp = 0;
585         READ_FIELD_FROM_BUF(countTemp, uint8_t);
586         count = countTemp;
587     }
588 
589     if( count > m_halMaxValues->maxArgsPerKernel )
590     {
591         CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
592         return CM_EXCEED_KERNEL_ARG_AMOUNT;
593     }
594 
595     m_args = MOS_NewArray(CM_ARG, count);
596     if( (!m_args) && (count != 0) )
597     {
598         CM_ASSERTMESSAGE("Error: Out of system memory.");
599         MosSafeDeleteArray(m_options);
600         return CM_OUT_OF_HOST_MEMORY;
601     }
602     CmSafeMemSet(m_args, 0, sizeof(CM_ARG) * count);
603     m_argCount  = count;
604 
605     uint32_t preDefinedSurfNum;
606     if ( (m_program->m_cisaMajorVersion > 3) || ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion >=1)) )  //CISA 3.1 +
607     {
608         preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_3_1;
609     }
610     else if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion == 0))
611     {
612         preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_2_1;
613     }
614     else //CISA 2.0
615     {
616         preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_2;
617     }
618 
619     uint32_t argSize = 0;
620 
621     for (uint32_t i = 0; i < m_argCount; i++)
622     {
623         vISA::InputInfo *inputInfo = nullptr;
624         uint8_t kind = 0;
625 
626         if (useVisaApi)
627         {
628             inputInfo = kernelBody->getInputInfo()[i];
629             kind = inputInfo->getKind();
630         }
631         else
632         {
633             READ_FIELD_FROM_BUF(kind, uint8_t);
634         }
635 
636         if (kind == 0x2) // compiler value for surface
637         {
638             kind = ARG_KIND_SURFACE;
639                 // runtime value for surface. surface will be further classified to 1D/2D/3D
640         }
641         else if (kind == 0x3) // compiler value for vme index
642         {
643             kind = ARG_KIND_VME_INDEX;
644         }
645         else if (kind == 0x8)
646         {
647             kind = ARG_KIND_IMPLICT_LOCALSIZE;
648             m_args[i].isSet = true;
649             m_args[i].unitCount = 1;
650         }
651         else if (kind == 0x10) {
652             kind = ARG_KIND_IMPLICT_GROUPSIZE;
653             m_args[i].isSet = true;
654             m_args[i].unitCount = 1;
655         }
656         else if (kind == 0x18) {
657             kind = ARG_KIND_IMPLICIT_LOCALID;
658             m_args[i].isSet = true;
659             m_args[i].unitCount = 1;
660             m_perKernelArgExists = true;  //only VISA3.3+, can come here, so, no matter it is there any explicit arg, implicit arg exits
661         }
662         else if (kind == 0x2A) {
663             kind = ARG_KIND_SURFACE_2D_SCOREBOARD;
664         }
665         else if (kind == 0x20) {
666             kind = ARG_KIND_GENERAL_DEPVEC;
667         }
668         else if (kind == 0x30) {
669             kind = ARG_KIND_GENERAL_DEPCNT;
670         }
671         else if (kind == 0x80) {
672             // IMP_PSEUDO_INPUT = 0x80 is pseudo input. All inputs after this
673             // will be ignored by CMRT without checking and payload copied.
674             // This resizes the argument count to achieve this.
675             m_argCount = i;
676             break;
677         }
678 
679         m_args[i].unitKind = kind;
680         m_args[i].unitKindOrig = kind;
681 
682         if (kind == ARG_KIND_SURFACE && m_kernelInfo->surfaceCount)
683         {
684             m_args[i].surfaceKind = DATA_PORT_SURF;
685         }
686 
687         if (useVisaApi)
688         {
689             m_args[i].unitOffsetInPayload = inputInfo->getOffset();
690             m_args[i].unitOffsetInPayloadOrig = inputInfo->getOffset();
691 
692             m_args[i].unitSize = inputInfo->getSize();
693             m_args[i].unitSizeOrig = inputInfo->getSize();
694         }
695         else
696         {
697             uint32_t varID;
698             READ_FIELD_FROM_BUF(varID, uint16_t);
699 
700             uint16_t tmpW;
701             READ_FIELD_FROM_BUF(tmpW, uint16_t);
702             m_args[i].unitOffsetInPayload = tmpW;
703             m_args[i].unitOffsetInPayloadOrig = tmpW;
704 
705             READ_FIELD_FROM_BUF(tmpW, uint16_t);
706             m_args[i].unitSize = tmpW;
707             m_args[i].unitSizeOrig = tmpW;
708         }
709 
710         argSize += m_args[i].unitSize;
711     }
712     //////////////////////////////////////////////////////////////////////////
713 
714     if (kernelInfoRefCount <= 2)    //Only read for 1st time Kernel creation, later we reuse them
715     {
716         uint16_t attributeCount = 0;
717         if (useVisaApi)
718         {
719             attributeCount = kernelBody->getAttributeCount();
720         }
721         else
722         {
723             /////////////////////////////////////////////////////////////////////////
724             // Get pre-defined kernel attributes, Start
725             //skipping size and entry
726             bytePosition += 8;
727 
728             READ_FIELD_FROM_BUF(attributeCount, uint16_t);
729         }
730 
731         for (int i = 0; i < attributeCount; i++)
732         {
733             vISA::AttributeInfo *attribute = nullptr;
734             uint32_t nameIndex = 0;
735             uint8_t size = 0;
736 
737             if (useVisaApi)
738             {
739                 attribute = kernelBody->getAttributeInfo()[i];
740                 nameIndex = attribute->getName();
741                 size = attribute->getSize();
742             }
743             else
744             {
745                 READ_FIELD_FROM_BUF(nameIndex, uint16_t);
746                 READ_FIELD_FROM_BUF(size, uint8_t);
747             }
748 
749             if( strcmp( m_kernelInfo->globalStrings[nameIndex], "AsmName" ) == 0 )
750             {
751                 if (useVisaApi)
752                 {
753                     CmSafeMemCopy(m_kernelInfo->kernelASMName, attribute->getValue(), size);
754                 }
755                 else
756                 {
757                     CmSafeMemCopy(m_kernelInfo->kernelASMName, &buf[bytePosition], size);
758                     bytePosition += size;
759                 }
760             }
761             else if (strcmp( m_kernelInfo->globalStrings[nameIndex], "SLMSize" ) == 0)
762             {
763                 if (useVisaApi)
764                 {
765                     m_kernelInfo->kernelSLMSize = attribute->getValue()[0];
766                 }
767                 else
768                 {
769                     READ_FIELD_FROM_BUF(m_kernelInfo->kernelSLMSize, uint8_t);
770                 }
771 
772                 /* Notes by Stony@2014-04-09
773                  * <=CISA3.1: the size is number of 4KB
774                  * > CISA3.1: the size is number of 1KB
775                  * Here convert it to the number of 1KB if <=CISA 3.1
776                  */
777                 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion <= 1))
778                 {
779                     m_kernelInfo->kernelSLMSize = m_kernelInfo->kernelSLMSize * 4;
780                 }
781 
782                 // align to power of 2
783                 uint32_t v = m_kernelInfo->kernelSLMSize;
784                 v--;
785                 v |= v >> 1;
786                 v |= v >> 2;
787                 v |= v >> 4;
788                 v |= v >> 8;
789                 v |= v >> 16;
790                 v++;
791                 m_kernelInfo->kernelSLMSize = ( uint8_t )v;
792             }
793             else if (strcmp(m_kernelInfo->globalStrings[nameIndex], "NoBarrier") == 0)
794             {
795                 m_kernelInfo->blNoBarrier = true;
796                 if (!useVisaApi)
797                 {
798                     bytePosition += size;
799                 }
800             }
801             else
802             {
803                 if (!useVisaApi)
804                 {
805                     bytePosition += size;
806                 }
807             }
808         }
809     }
810 
811     if(argSize > m_halMaxValues->maxArgByteSizePerKernel)
812     {
813         CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
814         return CM_EXCEED_KERNEL_ARG_SIZE_IN_BYTE;
815     }
816 
817     buf = (uint8_t*)commonISACode;
818 
819     if(m_program->IsJitterEnabled())
820     {
821         //m_JitterEnable = true;
822         char *programOptions;
823         m_program->GetKernelOptions(programOptions);
824         //if no options or same options, copy load program's binary. else re-jitter
825         {
826             m_binary = (char *)m_kernelInfo->jitBinaryCode;
827             m_binarySize = m_kernelInfo->jitBinarySize;
828             m_kernelInfo->origBinary = m_kernelInfo->jitBinaryCode;
829             m_kernelInfo->origBinarySize = m_kernelInfo->jitBinarySize;
830         }
831     }
832     else
833     {
834         char* binary = (char*)(buf + m_kernelInfo->genxBinaryOffset );
835 
836         //No copy, point to the binary offset in CISA code.
837         m_binary = binary;
838         m_binarySize = m_kernelInfo->genxBinarySize;
839 
840         m_kernelInfo->origBinary = binary;
841         m_kernelInfo->origBinarySize = m_kernelInfo->genxBinarySize;
842     }
843 
844     if (m_kernelInfo->blNoBarrier)
845     {
846         m_barrierMode = CM_NO_BARRIER;
847     }
848 
849     m_movInstConstructor = CmExtensionCreator<CmMovInstConstructor>::CreateClass();
850     if (m_movInstConstructor == nullptr)
851     {
852         CM_ASSERTMESSAGE("Error: Failed to allocate movInstConstructor due to out of system memory.");
853         return CM_OUT_OF_HOST_MEMORY;
854     }
855 
856     CmNotifierGroup *notifiers = m_device->GetNotifiers();
857     if (notifiers != nullptr)
858     {
859         notifiers->NotifyKernelCreated(this);
860     }
861 
862     return CM_SUCCESS;
863 }
864 
865 //*-----------------------------------------------------------------------------
866 //! A CmKernel can run in multiple threads concurrently. This
867 //! fucntion is to set the number of threads.
868 //! INPUT:
869 //!     number of threads
870 //! OUTPUT:
871 //!     CM_SUCCESS or
872 //!     CM_INVALID_ARG_VALUE if the number is larger than CmKernel's capacity
873 //*-----------------------------------------------------------------------------
SetThreadCount(uint32_t count)874 CM_RT_API int32_t CmKernelRT::SetThreadCount(uint32_t count )
875 {
876     INSERT_API_CALL_LOG();
877     // Check per kernel, per task check will be at enqueue time
878     if ((int)count <= 0)
879         return CM_INVALID_ARG_VALUE;
880 
881     if (m_threadSpace == nullptr)
882     {
883         if (m_threadCount)
884         {
885             // Setting threadCount twice with different values will cause reset of kernels
886             if (m_threadCount != count)
887             {
888                 Reset();
889                 m_threadCount = count;
890                 m_dirty |= CM_KERNEL_DATA_THREAD_COUNT_DIRTY;
891             }
892         }
893         else // first time
894         {
895             m_threadCount = count;
896         }
897     }
898     return CM_SUCCESS;
899 }
900 
GetThreadCount(uint32_t & count)901 int32_t CmKernelRT::GetThreadCount(uint32_t& count )
902 {
903     count = m_threadCount;
904     return CM_SUCCESS;
905 }
906 
GetKernelSurfaces(bool * & surfArray)907 int32_t CmKernelRT::GetKernelSurfaces(bool  *&surfArray)
908 {
909     surfArray = m_surfaceArray;
910     return CM_SUCCESS;
911 }
912 
ResetKernelSurfaces()913 int32_t CmKernelRT::ResetKernelSurfaces()
914 {
915     uint32_t surfacePoolSize = m_surfaceMgr->GetSurfacePoolSize();
916     if (!m_surfaceArray)
917     {
918         m_surfaceArray = MOS_NewArray(bool, surfacePoolSize);
919         if (!m_surfaceArray)
920         {
921             CM_ASSERTMESSAGE("Error: Failed to rest kernel surfaces due to out of system memory.");
922             return CM_OUT_OF_HOST_MEMORY;
923         }
924     }
925     CmSafeMemSet( m_surfaceArray, 0, surfacePoolSize * sizeof( bool ) );
926 
927     return CM_SUCCESS;
928 }
929 
930 //*-----------------------------------------------------------------------------
931 //| Purpose:    Get CmSurface from surface manager.
932 //|             Use "value + indexSurfaceArray" to locate its surfaceIndex
933 //| Returns:    CmSurface. Null if not found
934 //*-----------------------------------------------------------------------------
GetSurfaceFromSurfaceArray(SurfaceIndex * value,uint32_t indexSurfaceArray)935 CmSurface* CmKernelRT::GetSurfaceFromSurfaceArray( SurfaceIndex* value, uint32_t indexSurfaceArray)
936 {
937     int32_t hr                          = CM_SUCCESS;
938     CmSurface *surface           = nullptr;
939     SurfaceIndex* surfaceIndex     = nullptr;
940 
941     surfaceIndex = value + indexSurfaceArray;
942     CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceIndex);
943 
944     if (surfaceIndex->get_data() == CM_NULL_SURFACE
945         || surfaceIndex->get_data() == 0)
946     {
947         surface = (CmSurface *)CM_NULL_SURFACE;
948         goto finish;
949     }
950 
951     m_surfaceMgr->GetSurface(surfaceIndex->get_data(), surface);
952 
953 finish:
954     if(hr != CM_SUCCESS)
955     {
956         surface = nullptr;
957     }
958 
959     return surface;
960 }
961 
962 //*-----------------------------------------------------------------------------
963 //| Purpose:    Set kernel arg for single vme surface or multiple vme surfaces
964 //|             in surface array. So far, don't support vme surface array in thread arg.
965 //| Returns:    Result of the operation.
966 //*-----------------------------------------------------------------------------
SetArgsVme(CM_KERNEL_INTERNAL_ARG_TYPE nArgType,uint32_t argIndex,const void * value,uint32_t nThreadID)967 int32_t CmKernelRT::SetArgsVme(CM_KERNEL_INTERNAL_ARG_TYPE nArgType, uint32_t argIndex, const void *value, uint32_t nThreadID)
968 {
969     uint32_t elementNum = 0;
970     CM_ARG& arg        = m_args[ argIndex ];
971     uint32_t totalVmeArgValueSize       = 0;
972     uint32_t totalSurfacesInVme         = 0;
973     uint32_t tempVmeArgValueSize        = 0;
974     uint32_t vmeArgValueOffset          = 0;
975     uint32_t lastVmeSurfCount           = 0;
976     CmSurfaceVme* surfVme          = nullptr;
977     uint8_t *vmeArgValueArray         = nullptr;
978     uint16_t *vmeCmIndexArray          = nullptr;
979     int32_t hr = CM_SUCCESS;
980 
981     //Get Number of elements in surface array
982     if (arg.unitVmeArraySize == 0)
983     {  //First Time
984         elementNum = arg.unitSize / sizeof(uint32_t);
985     }
986     else
987     {
988         elementNum = arg.unitVmeArraySize;
989     }
990 
991     //Get Size of vmeIndexArray and vmeCmIndexArray.
992     for(uint32_t i=0; i< elementNum; i++)
993     {
994         if (((SurfaceIndex*)(value)+i)->get_data() == 0 || ((SurfaceIndex*)(value)+i)->get_data() == CM_NULL_SURFACE)
995         {
996             tempVmeArgValueSize = sizeof(CM_HAL_VME_ARG_VALUE);
997             totalVmeArgValueSize += tempVmeArgValueSize;
998             totalSurfacesInVme++;
999         }
1000         else
1001         {
1002             surfVme = static_cast<CmSurfaceVme*>(GetSurfaceFromSurfaceArray((SurfaceIndex*)value, i));
1003             CM_CHK_NULL_GOTOFINISH_CMERROR(surfVme);
1004             tempVmeArgValueSize = surfVme->GetVmeCmArgSize();
1005             totalVmeArgValueSize += tempVmeArgValueSize;
1006             totalSurfacesInVme += surfVme->GetTotalSurfacesCount();
1007         }
1008     }
1009 
1010     // Allocate and Zero Memory for arg.pValue and arg.surfIndex
1011     // arg.pValue    : an array of CM_HAL_VME_ARG_VALUE structure followed by an array of reference surfaces
1012     // arg.surfIndex : an array listing all the Cm surface indexes, in the order of current, fw surfaces, bw surfaces
1013 
1014     if (arg.unitSize < totalVmeArgValueSize) // need to re-allocate larger area)
1015     {
1016         if (arg.value)
1017         {
1018             MosSafeDeleteArray(arg.value);
1019         }
1020         arg.value = MOS_NewArray(uint8_t, totalVmeArgValueSize);
1021 
1022         if (arg.surfIndex)
1023         {
1024             MosSafeDeleteArray(arg.surfIndex);
1025         }
1026         arg.surfIndex = MOS_NewArray(uint16_t, totalSurfacesInVme);
1027     }
1028 
1029     CM_CHK_NULL_GOTOFINISH_CMERROR(arg.value);
1030     CmSafeMemSet(arg.value, 0, totalVmeArgValueSize);
1031     CM_CHK_NULL_GOTOFINISH_CMERROR(arg.surfIndex);
1032     CmSafeMemSet(arg.surfIndex, 0, totalSurfacesInVme * sizeof(uint16_t));
1033 
1034     //Set each Vme Surface
1035     for (uint32_t i = 0; i< elementNum; i++)
1036     {
1037         if (((SurfaceIndex*)(value)+i)->get_data() == 0 || ((SurfaceIndex*)(value)+i)->get_data() == CM_NULL_SURFACE)
1038         {
1039             PCM_HAL_VME_ARG_VALUE vmeArg = (PCM_HAL_VME_ARG_VALUE)(arg.value + vmeArgValueOffset);
1040             vmeArg->fwRefNum = 0;
1041             vmeArg->bwRefNum = 0;
1042             vmeArg->curSurface = CM_NULL_SURFACE;
1043             tempVmeArgValueSize = sizeof(CM_HAL_VME_ARG_VALUE);
1044             vmeArgValueOffset += tempVmeArgValueSize;
1045             arg.surfIndex[lastVmeSurfCount] = CM_NULL_SURFACE;
1046             lastVmeSurfCount++;
1047         }
1048         else
1049         {
1050             surfVme = static_cast<CmSurfaceVme*>(GetSurfaceFromSurfaceArray((SurfaceIndex*)value, i));
1051             CM_CHK_NULL_GOTOFINISH_CMERROR(surfVme);
1052             SetArgsSingleVme(surfVme, arg.value + vmeArgValueOffset, arg.surfIndex + lastVmeSurfCount);
1053             tempVmeArgValueSize = surfVme->GetVmeCmArgSize();
1054             vmeArgValueOffset += tempVmeArgValueSize;
1055             lastVmeSurfCount += surfVme->GetTotalSurfacesCount();
1056         }
1057     }
1058 
1059     if ( nArgType == CM_KERNEL_INTERNEL_ARG_PERKERNEL ) // per kernel arg
1060     {
1061         // First time set
1062         if( !arg.value )
1063         {   // Increment size kernel arguments will take up in CURBE
1064             m_sizeInCurbe += CM_ARGUMENT_SURFACE_SIZE * elementNum;
1065         }
1066 
1067         arg.unitCount = 1;
1068         arg.isDirty  = true;
1069         arg.isSet    = true;
1070         arg.unitKind  = ARG_KIND_SURFACE_VME;
1071         arg.unitSize = (uint16_t)totalVmeArgValueSize; // the unitSize can't represent surfaces count here
1072         arg.unitVmeArraySize = elementNum;
1073 
1074         m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1075         m_perKernelArgExists = true;
1076     }
1077     else
1078     {
1079         // Thread arg doesn't support VME surfaces as it is rarely used and it is complex to implement,
1080         // since each thread may has different surface number in its vme surface argment.
1081         hr = CM_THREAD_ARG_NOT_ALLOWED;
1082     }
1083 
1084 finish:
1085     if(hr != CM_SUCCESS)
1086     {
1087         MosSafeDeleteArray(arg.value);
1088         MosSafeDeleteArray(arg.surfIndex);
1089     }
1090     return hr;
1091 
1092 }
1093 
1094 //*-----------------------------------------------------------------------------
1095 //| Purpose:    Fill arg for a single vme surface.
1096 //|             vmeIndexArray points to arg.pValue
1097 //|             vmeCmIndexArray points to arg.surfIndex
1098 //| Returns:    Result of the operation.
1099 //*-----------------------------------------------------------------------------
SetArgsSingleVme(CmSurfaceVme * vmeSurface,uint8_t * vmeArgValueArray,uint16_t * cmSufacesArray)1100 int32_t CmKernelRT::SetArgsSingleVme(CmSurfaceVme* vmeSurface, uint8_t *vmeArgValueArray, uint16_t *cmSufacesArray)
1101 {
1102 
1103     int32_t hr = CM_SUCCESS;
1104     CM_SURFACE_MEM_OBJ_CTRL memCtl;
1105     uint32_t vmeBackwardSurfaceCount        = 0;
1106     uint32_t vmeForwardSurfaceCount         = 0;
1107     uint32_t vmeCurrentSurfaceIndex         = 0;
1108     uint16_t vmeCurrentCmIndex              = 0;
1109     int32_t vmeIndexArrayPosition          = 0; // Offset for vmeIndexArray
1110     int32_t vmeCmIndexArrayPosition        = 0; // Offset for vmeCmIndexArray
1111     uint32_t tempOutput                     = 0;
1112     uint32_t cmSurfArrayIdx                 = 0;
1113     uint32_t surfStateWidth                 = 0;
1114     uint32_t surfStateHeight                = 0;
1115 
1116     uint32_t *fArray       = nullptr;
1117     uint32_t *bArray       = nullptr;
1118     uint32_t *fCmIndex     = nullptr;
1119     uint32_t *bCmIndex     = nullptr;
1120 
1121     uint32_t *fwSurfInArg = nullptr;
1122     uint32_t *bwSurfInArg = nullptr;
1123 
1124     CmSurface *surface = nullptr;
1125     PCM_HAL_VME_ARG_VALUE vmeArg = (PCM_HAL_VME_ARG_VALUE)vmeArgValueArray;
1126 
1127     CM_CHK_NULL_GOTOFINISH_CMERROR(vmeSurface);
1128     CM_CHK_NULL_GOTOFINISH_CMERROR(vmeArg);
1129     CM_CHK_NULL_GOTOFINISH_CMERROR(cmSufacesArray);
1130 
1131     if(vmeSurface == (CmSurfaceVme *)CM_NULL_SURFACE)
1132     {
1133         vmeArg->fwRefNum = 0;
1134         vmeArg->bwRefNum = 0;
1135         vmeArg->curSurface = CM_NULL_SURFACE;
1136         cmSufacesArray[cmSurfArrayIdx] =  CM_NULL_SURFACE;
1137         return hr;
1138     }
1139 
1140     // Get Vme Backward Forward Surface Count
1141     vmeSurface->GetIndexBackwardCount(vmeBackwardSurfaceCount);
1142     vmeSurface->GetIndexForwardCount(vmeForwardSurfaceCount);
1143 
1144     vmeArg->fwRefNum = vmeForwardSurfaceCount;
1145     vmeArg->bwRefNum = vmeBackwardSurfaceCount; // these two numbers must be set before any other operations
1146 
1147     vmeSurface->GetSurfaceStateResolution(vmeArg->surfStateParam.surfaceStateWidth, vmeArg->surfStateParam.surfaceStateHeight);
1148 
1149     vmeSurface->GetIndexForwardArray(fArray);
1150     vmeSurface->GetIndexBackwardArray(bArray);
1151     vmeSurface->GetIndexCurrent(vmeCurrentSurfaceIndex);
1152 
1153     vmeSurface->GetCmIndexCurrent(vmeCurrentCmIndex);
1154     vmeSurface->GetCmIndexForwardArray(fCmIndex);
1155     vmeSurface->GetCmIndexBackwardArray(bCmIndex);
1156 
1157     cmSufacesArray[cmSurfArrayIdx++] = vmeCurrentCmIndex;
1158 
1159     // Set Current Vme Surface
1160     m_surfaceMgr->GetSurface(vmeCurrentCmIndex, surface);
1161     CM_CHK_NULL_GOTOFINISH_CMERROR(surface);
1162 
1163     vmeArg->curSurface = vmeCurrentSurfaceIndex;
1164 
1165     //Set Forward Vme Surfaces
1166     fwSurfInArg = findFwRefInVmeArg(vmeArg);
1167     for (uint32_t i = 0; i < vmeForwardSurfaceCount; i++)
1168     {
1169         GetVmeSurfaceIndex( fArray, fCmIndex, i, &tempOutput);
1170         fwSurfInArg[i] = tempOutput;
1171         cmSufacesArray[cmSurfArrayIdx++] = (uint16_t)fCmIndex[i];
1172     }
1173 
1174     //Set Backward Vme Surfaces
1175     bwSurfInArg = findBwRefInVmeArg(vmeArg);
1176     for (uint32_t i = 0; i < vmeBackwardSurfaceCount; i++)
1177     {
1178         GetVmeSurfaceIndex( bArray, bCmIndex, i, &tempOutput);
1179         bwSurfInArg[i] = tempOutput;
1180         cmSufacesArray[cmSurfArrayIdx++] = (uint16_t)bCmIndex[i];
1181     }
1182 
1183 finish:
1184     return hr;
1185 }
1186 
1187 //*-----------------------------------------------------------------------------
1188 //| Purpose:    Get Vme Surface Index with memory object setting .
1189 //|             Output value will be filled into arg.pValue
1190 //| Returns:    Result of the operation.
1191 //*-----------------------------------------------------------------------------
GetVmeSurfaceIndex(uint32_t * vmeIndexArray,uint32_t * vmeCmIndexArray,uint32_t index,uint32_t * outputValue)1192 int32_t CmKernelRT::GetVmeSurfaceIndex(
1193     uint32_t *vmeIndexArray,
1194     uint32_t *vmeCmIndexArray,
1195     uint32_t index,
1196     uint32_t *outputValue)
1197 {
1198     int32_t hr = CM_SUCCESS;
1199     uint32_t value = vmeIndexArray[index];
1200 
1201     if (vmeIndexArray[index] == CM_INVALID_VME_SURFACE)
1202     {
1203         value = CM_NULL_SURFACE;
1204     }
1205 
1206     *outputValue = value;
1207 
1208     return hr;
1209 }
1210 
1211 //*-----------------------------------------------------------------------------
1212 //| Purpose:    Set arguments for function SetKernelArg().
1213 //|             Kernel argument is surface array.
1214 //! INPUT:
1215 //!             1) Current index in surface array
1216 //!             2) Index of kernel argument
1217 //!             3) Surface count in surface array
1218 //!             4) Pointer to current surface in surface array.
1219 //!             5) Current surface  index
1220 //!             6) Pointer to argument value
1221 //!             7) value of surface handle combined with memory object control
1222 //!             8) Original surface index for each surface in array
1223 //| Returns:    Result of the operation.
1224 //*-----------------------------------------------------------------------------
SetArgsInternalSurfArray(int32_t offset,uint32_t kernelArgIndex,int32_t surfCount,CmSurface * currentSurface,uint32_t currentSurfIndex,SurfaceIndex * value,uint32_t surfValue[],uint16_t origSurfIndex[])1225 int32_t CmKernelRT::SetArgsInternalSurfArray(
1226     int32_t offset,uint32_t kernelArgIndex,
1227     int32_t surfCount, CmSurface* currentSurface,
1228     uint32_t currentSurfIndex, SurfaceIndex* value,
1229     uint32_t surfValue[], uint16_t origSurfIndex[])
1230 {
1231     CM_SURFACE_MEM_OBJ_CTRL memCtl;
1232     uint32_t                surfRegTableIndex = 0;
1233     uint32_t                handle = 0;
1234     uint32_t                samplerIndex;
1235     uint16_t                samplerCmIndex;
1236     uint32_t                surfaceArraySize = 0;
1237 
1238     m_surfaceMgr->GetSurfaceArraySize(surfaceArraySize);
1239     MosSafeDeleteArray(m_args[kernelArgIndex].surfArrayArg); // delete it if it was allcated
1240     m_args[kernelArgIndex].surfArrayArg = MOS_NewArray(SURFACE_ARRAY_ARG, surfCount);
1241     if (!m_args[kernelArgIndex].surfArrayArg)
1242     {
1243         CM_ASSERTMESSAGE("Error: Out of system memory.");
1244         return CM_OUT_OF_HOST_MEMORY;
1245     }
1246     CmSafeMemSet((void *)m_args[kernelArgIndex].surfArrayArg, 0,  sizeof(SURFACE_ARRAY_ARG) * surfCount);
1247     while (offset < surfCount)
1248     {
1249         switch (currentSurface->Type())
1250         {
1251           case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
1252           {
1253              CmSurface2DRT* surf2D = static_cast<CmSurface2DRT*>(currentSurface);
1254 
1255              uint32_t numAliases = 0;
1256              surf2D->GetNumAliases(numAliases);
1257              if (numAliases)
1258              {
1259                  m_args[kernelArgIndex].aliasCreated = true;
1260              }
1261              else
1262              {
1263                  m_args[kernelArgIndex].aliasCreated = false;
1264              }
1265 
1266              //set memory object control
1267              surf2D->GetIndexFor2D(surfRegTableIndex);
1268 
1269              surfValue[offset] = surfRegTableIndex;
1270              origSurfIndex[offset] = (uint16_t)currentSurfIndex;
1271 
1272              m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_2D;
1273              m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_2D;
1274 
1275              break;
1276          }
1277          case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
1278          {
1279              CmBuffer_RT* surf1D = static_cast<CmBuffer_RT*>(currentSurface);
1280 
1281              uint32_t numAliases = 0;
1282              surf1D->GetNumAliases(numAliases);
1283              if (numAliases)
1284              {
1285                  m_args[kernelArgIndex].aliasCreated = true;
1286              }
1287              else
1288              {
1289                  m_args[kernelArgIndex].aliasCreated = false;
1290              }
1291 
1292              //set memory object control
1293              surf1D->GetHandle(handle);
1294 
1295              surfValue[offset] = handle;
1296              origSurfIndex[offset] = (uint16_t)currentSurfIndex;
1297 
1298              m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_1D;
1299              m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_1D;
1300              break;
1301          }
1302          case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
1303          {
1304              CmSurface2DUPRT* surf2DUP = static_cast<CmSurface2DUPRT*>(currentSurface);
1305 
1306              //set memory object
1307              surf2DUP->GetHandle(handle);
1308 
1309              surfValue[offset] = handle;
1310              origSurfIndex[offset] = (uint16_t)currentSurfIndex;
1311 
1312              m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_2D_UP;
1313              m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_2D_UP;
1314              break;
1315          }
1316          case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
1317          {
1318              CmSurface3DRT* surf3D = static_cast<CmSurface3DRT*>(currentSurface);
1319 
1320              surf3D->GetHandle(handle);
1321 
1322              surfValue[offset] = handle;
1323              origSurfIndex[offset] = (uint16_t)currentSurfIndex;
1324 
1325              m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_3D;
1326              m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_3D;
1327 
1328              break;
1329          }
1330 
1331          case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER:
1332          {
1333              CmStateBuffer* stateBuffer = static_cast< CmStateBuffer* >( currentSurface );
1334              stateBuffer->GetHandle( handle );
1335 
1336              surfValue[ offset ] = handle;
1337              origSurfIndex[ offset ] = ( uint16_t )currentSurfIndex;
1338 
1339              m_args[ kernelArgIndex ].surfArrayArg[ offset ].argKindForArray = ARG_KIND_STATE_BUFFER;
1340              m_args[ kernelArgIndex ].unitKind = ARG_KIND_STATE_BUFFER;
1341 
1342              break;
1343          }
1344 
1345          //sampler surface
1346          case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
1347          {
1348              CmSurfaceSampler* surfSampler = static_cast <CmSurfaceSampler *> (currentSurface);
1349              surfSampler->GetHandle(samplerIndex);
1350              surfSampler->GetCmIndexCurrent(samplerCmIndex);
1351 
1352              m_surfaceMgr->GetSurface(samplerCmIndex, currentSurface);
1353              if (!currentSurface)
1354              {
1355                  CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
1356                  return CM_NULL_POINTER;
1357              }
1358 
1359              surfValue[offset] = samplerIndex;
1360              origSurfIndex[offset] = (uint16_t)samplerCmIndex;
1361 
1362              SAMPLER_SURFACE_TYPE type;
1363              surfSampler->GetSurfaceType(type);
1364              if (type == SAMPLER_SURFACE_TYPE_2D)
1365              {
1366                  m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER;
1367                  m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER;
1368              }
1369              else if (type == SAMPLER_SURFACE_TYPE_2DUP)
1370              {
1371                  m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE2DUP_SAMPLER;
1372                  m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE2DUP_SAMPLER;
1373              }
1374              else if(type == SAMPLER_SURFACE_TYPE_3D)
1375              {
1376                  m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_3D;
1377                  m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_3D;
1378              }
1379              else
1380              {
1381                  CM_ASSERTMESSAGE("Error: Assign a Sampler surface to the arg which is previously 2D/3D surface.");
1382                  return CM_FAILURE;
1383              }
1384              break;
1385          }
1386          //sampler8x8surface
1387          case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
1388          {
1389              CmSurfaceSampler8x8* surfSampler8x8 = static_cast <CmSurfaceSampler8x8 *> (currentSurface);
1390              surfSampler8x8->GetIndexCurrent(samplerIndex);
1391              surfSampler8x8->GetCmIndex(samplerCmIndex);
1392 
1393              m_surfaceMgr->GetSurface(samplerCmIndex, currentSurface);
1394              if (!currentSurface)
1395              {
1396                  CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
1397                  return CM_FAILURE;
1398              }
1399 
1400              surfValue[offset] = samplerIndex;
1401              origSurfIndex[offset] = (uint16_t)samplerCmIndex;
1402 
1403              CM_SAMPLER8x8_SURFACE type;
1404              type = surfSampler8x8->GetSampler8x8SurfaceType();
1405              if (type == CM_VA_SURFACE)
1406              {
1407                  m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
1408                  m_args[kernelArgIndex].surfArrayArg[offset].addressModeForArray = surfSampler8x8->GetAddressControlMode();
1409                  m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER8X8_VA;
1410              }
1411              else if(type == CM_AVS_SURFACE)
1412              {
1413                  m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
1414                  m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
1415              }
1416              else
1417              {
1418                  CM_ASSERTMESSAGE("Error: Assign a Sampler8x8 surface to the arg which is previously 2D surface.");
1419                  return CM_FAILURE;
1420              }
1421              break;
1422          }
1423          default:
1424          {
1425              CM_ASSERTMESSAGE("Error: No matched surface for surface array");
1426              return CM_INVALID_ARG_VALUE;
1427          }
1428        }
1429        offset++;
1430        if (offset < surfCount)
1431        {
1432            currentSurfIndex = value[offset].get_data();
1433 
1434            while ((!currentSurfIndex && (offset < surfCount))||(currentSurfIndex == CM_NULL_SURFACE))
1435            {
1436                surfValue[offset] = CM_NULL_SURFACE;
1437                origSurfIndex[offset] = 0;
1438                offset++;
1439                if (offset >= surfCount)
1440                    break;
1441                currentSurfIndex = value[offset].get_data();
1442            }
1443 
1444            if(surfaceArraySize == 0)
1445            {
1446                CM_ASSERTMESSAGE("Error: No surface in surface array");
1447                return CM_NO_AVAILABLE_SURFACE;
1448            }
1449            if (currentSurfIndex > surfaceArraySize)
1450            {
1451                currentSurfIndex = currentSurfIndex % surfaceArraySize;
1452            }
1453        }
1454        if (offset < surfCount)
1455        {
1456            m_surfaceMgr->GetSurface(currentSurfIndex, currentSurface);
1457            if (nullptr == currentSurface)
1458            {
1459                CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
1460                return CM_FAILURE;
1461            }
1462        }
1463     }
1464     return CM_SUCCESS;
1465 }
1466 //*-----------------------------------------------------------------------------
1467 // Set arguments for function SetKernelArg() and SetThreadArg()
1468 // Set parameter nArgType to CM_KERNEL_INTERNEL_ARG_KERNEL to set a kernel
1469 // argument; set parameter nArgType to CM_KERNEL_INTERNEL_ARG_THREAD to set
1470 // a thread argument
1471 //*-----------------------------------------------------------------------------
SetArgsInternal(CM_KERNEL_INTERNAL_ARG_TYPE nArgType,uint32_t index,size_t size,const void * value,uint32_t nThreadID)1472 int32_t CmKernelRT::SetArgsInternal( CM_KERNEL_INTERNAL_ARG_TYPE nArgType, uint32_t index, size_t size, const void *value, uint32_t nThreadID )
1473 {
1474     uint32_t surfRegTableIndex = 0; // for 2D surf
1475     uint32_t handle = 0; // for 1D surf
1476 
1477     uint32_t samplerIndex;
1478     uint16_t samplerCmIndex;
1479     uint32_t samplerIdx = 0;
1480     uint32_t vmeIdx = 0;
1481     uint16_t *surfIndexValue =  nullptr;
1482     uint32_t surfaces[CM_MAX_ARGS_PER_KERNEL];
1483     uint16_t surfIndexArray[CM_MAX_ARGS_PER_KERNEL];
1484     std::vector< int > sampler_index_array;
1485 
1486     //Clear "set" flag in case user call API to set the same one argument multiple times.
1487     m_args[index].isSet = false;
1488     if( m_args[ index ].unitKind == ARG_KIND_GENERAL || (m_args[index].unitKind == ARG_KIND_GENERAL_DEPVEC) || (m_args[index].unitKind == ARG_KIND_GENERAL_DEPCNT))
1489     {
1490         if( size != m_args[ index ].unitSize )
1491         {
1492             CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
1493             return CM_INVALID_ARG_SIZE;
1494         }
1495     }
1496     //For surface type
1497     else if (CHECK_SURFACE_TYPE(m_args[index].unitKind,
1498         ARG_KIND_SURFACE,
1499         ARG_KIND_SURFACE_1D,
1500         ARG_KIND_SURFACE_2D,
1501         ARG_KIND_SURFACE_2D_UP,
1502         ARG_KIND_SURFACE_3D,
1503         ARG_KIND_SURFACE_SAMPLER,
1504         ARG_KIND_SURFACE2DUP_SAMPLER,
1505         ARG_KIND_SURFACE_VME,
1506         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
1507         ARG_KIND_SURFACE_SAMPLER8X8_VA,
1508         ARG_KIND_SURFACE_2D_SCOREBOARD,
1509         ARG_KIND_STATE_BUFFER
1510         ))
1511     {
1512 
1513         // this code is to convert SurfaceIndex object to index of type uint32_t,
1514         // which is expected by commonISA/genBinary
1515         // index is the index of the surface in surface registration table of CM device
1516         // in driver
1517 
1518         int signatureSize = m_args[index].unitSize;
1519         int numSurfaces = signatureSize / sizeof(int);
1520         SurfaceIndex* surfIndex = (SurfaceIndex*)value;
1521         if (surfIndex == (SurfaceIndex*)CM_NULL_SURFACE)
1522         {
1523             m_args[index].isSet = true;
1524             m_args[index].unitCount = 1; // per kernel arg
1525             m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1526             m_perKernelArgExists = true;
1527             m_args[index].isDirty = true;
1528             m_args[index].isNull = true;
1529             return CM_SUCCESS;
1530         }
1531         else
1532         {
1533             // In case that CM_NULL_SURFACE was set at last time and will
1534             // set a read surface index this time. So need set isDirty as
1535             // well to indicate update kernel data.
1536             if (m_args[index].isNull == true)
1537             {
1538                 m_args[index].isDirty = true;
1539                 m_args[index].isNull = false;
1540             }
1541         }
1542 
1543         m_args[index].isNull = false;
1544         CM_SURFACE_MEM_OBJ_CTRL memCtl;
1545 
1546         if (m_args[index].unitKind != ARG_KIND_SURFACE_VME)
1547         {
1548             if (size != sizeof(SurfaceIndex)* numSurfaces)
1549             {
1550                 CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
1551                 return CM_INVALID_ARG_SIZE;
1552             }
1553         }
1554 
1555         uint32_t surfIndexData = surfIndex->get_data();
1556         int i = 0;
1557         uint32_t surfaceArraySize = 0;
1558         m_surfaceMgr->GetSurfaceArraySize(surfaceArraySize);
1559 
1560         if (surfIndexData > surfaceArraySize)
1561         {
1562             if (m_args[index].aliasIndex != surfIndexData)
1563             {
1564                 m_args[index].isDirty = true;
1565                 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1566                 m_args[index].aliasIndex = surfIndexData;
1567             }
1568 
1569             surfIndexData = surfIndexData % surfaceArraySize;
1570         }
1571         else
1572         {
1573             m_args[index].aliasIndex = 0;
1574         }
1575 
1576         while (!surfIndexData && (i < numSurfaces))
1577         {
1578             surfaces[i] = CM_NULL_SURFACE;
1579             surfIndexArray[i] = 0;
1580             i++;
1581             if (i >= numSurfaces)
1582                 break;
1583             surfIndexData = surfIndex[i].get_data();
1584         }
1585 
1586         if (i >= numSurfaces)
1587         {
1588             m_args[index].unitKind = ARG_KIND_SURFACE_2D;
1589             value = surfaces;
1590             size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1591             m_args[index].unitSize = (uint16_t)size;
1592             goto finish;
1593         }
1594         CmSurface* surface = nullptr;
1595         m_surfaceMgr->GetSurface(surfIndexData, surface);
1596         if (nullptr == surface)
1597         {
1598             CM_ASSERTMESSAGE("Error: Invalid surface.");
1599             return CM_FAILURE;
1600         }
1601 
1602         if (SurfTypeToArgKind(surface->Type()) != m_args[index].unitKind)
1603         {   // if surface type changes i.e 2D <-> 2DUP  Need to set bIsDrity as true
1604             m_args[index].isDirty = true;
1605             m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1606         }
1607 
1608         uint32_t cisaMajorVersion, cisaMinorVersion;
1609         m_program->GetCISAVersion(cisaMajorVersion, cisaMinorVersion);
1610 
1611         //This path is for surface array, including 1D, 2D, 3D,samplersurface, samplersurface8x8
1612         if ((numSurfaces > 1) && (surface->Type() != CM_ENUM_CLASS_TYPE_CMSURFACEVME))
1613         {
1614             int32_t hr = SetArgsInternalSurfArray(i,index, numSurfaces, surface, surfIndexData, surfIndex,surfaces, surfIndexArray);
1615             if (hr != CM_SUCCESS)
1616             {
1617                 CM_ASSERTMESSAGE("Error: SetArgsInternal for surface array failed!\n");
1618                 return CM_INVALID_ARG_VALUE;
1619             }
1620             value = surfaces;
1621             surfIndexValue = surfIndexArray;
1622             size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1623             m_args[index].unitSize = (uint16_t)size;
1624         }
1625         else
1626         {   //This is for single surface and surface array for VME surface
1627             switch (surface->Type())
1628             {
1629                  case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
1630                  {
1631                      CmSurface2DRT* surf2D = static_cast<CmSurface2DRT*>(surface);
1632 
1633                      uint32_t numAliases = 0;
1634                      surf2D->GetNumAliases(numAliases);
1635                      if (numAliases)
1636                      {
1637                          m_args[index].aliasCreated = true;
1638                      }
1639                      else
1640                      {
1641                          m_args[index].aliasCreated = false;
1642                      }
1643 
1644                      //set memory object control
1645                      surf2D->GetIndexFor2D(surfRegTableIndex);
1646 
1647                      surfaces[i] = surfRegTableIndex;
1648                      surfIndexArray[i] = (uint16_t)surfIndexData;
1649 
1650                      value = surfaces;
1651                      surfIndexValue = surfIndexArray;
1652 
1653                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1654                      m_args[index].unitSize = (uint16_t)size;
1655 
1656                      if ((m_args[index].unitKind == ARG_KIND_SURFACE) || (m_args[index].unitKind == ARG_KIND_SURFACE_2D_UP)) // first time or last time is set to 2DUP
1657                      {
1658                          m_args[index].unitKind = ARG_KIND_SURFACE_2D;
1659                          if (m_args[index].surfaceKind == SAMPLER_SURF)
1660                              m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER;
1661                      }
1662                      else if (m_args[index].unitKind != ARG_KIND_SURFACE_2D &&
1663                          m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER &&
1664                          m_args[index].unitKind != ARG_KIND_SURFACE2DUP_SAMPLER &&
1665                          m_args[index].unitKind != ARG_KIND_SURFACE_2D_SCOREBOARD)
1666                      {
1667                          CM_ASSERTMESSAGE("Error: Assign a 2D surface to the arg which is previously assigned 1D surface, 3D surface, or VME surface.");
1668                          return CM_INVALID_ARG_VALUE;
1669                      }
1670                      break;
1671                  }
1672                  case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
1673                  {
1674                      CmBuffer_RT* surf1D = static_cast<CmBuffer_RT*>(surface);
1675 
1676                      uint32_t numAliases = 0;
1677                      surf1D->GetNumAliases(numAliases);
1678                      if (numAliases)
1679                      {
1680                          m_args[index].aliasCreated = true;
1681                      }
1682                      else
1683                      {
1684                          m_args[index].aliasCreated = false;
1685                      }
1686 
1687                      //set memory object control
1688                      surf1D->GetHandle(handle);
1689 
1690                      surfaces[i] = handle;
1691                      surfIndexArray[i] = (uint16_t)surfIndexData;
1692 
1693                      value = surfaces;
1694                      surfIndexValue = surfIndexArray;
1695 
1696                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1697                      m_args[index].unitSize = (uint16_t)size;
1698 
1699                      if (m_args[index].unitKind == ARG_KIND_SURFACE)
1700                      {
1701                          m_args[index].unitKind = ARG_KIND_SURFACE_1D;
1702                      }
1703                      else if (m_args[index].unitKind != ARG_KIND_SURFACE_1D)
1704                      {
1705                          CM_ASSERTMESSAGE("Error: Assign a 1D surface to the arg which is previously assigned 2D surface, 3D surface, or VME surface.");
1706                          return CM_INVALID_ARG_VALUE;
1707                      }
1708                      break;
1709                  }
1710                  case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
1711                  {
1712                      CmSurface2DUPRT* surf2DUP = static_cast<CmSurface2DUPRT*>(surface);
1713 
1714                      //set memory object
1715                      surf2DUP->GetHandle(handle);
1716 
1717                      surfaces[i] = handle;
1718                      surfIndexArray[i] = (uint16_t)surfIndexData;
1719 
1720                      value = surfaces;
1721                      surfIndexValue = surfIndexArray;
1722 
1723                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1724                      m_args[index].unitSize = (uint16_t)size;
1725 
1726                      if ((m_args[index].unitKind == ARG_KIND_SURFACE) || (m_args[index].unitKind == ARG_KIND_SURFACE_2D)) // first time or last time is set to 2D
1727                      {
1728                          m_args[index].unitKind = ARG_KIND_SURFACE_2D_UP;
1729                      }
1730                      else if (m_args[index].unitKind != ARG_KIND_SURFACE_2D_UP)
1731                      {
1732                          CM_ASSERTMESSAGE("Error: Assign a 2D surface UP to the arg which is previously assigned other surfaces.");
1733                          return CM_INVALID_ARG_VALUE;
1734                      }
1735 
1736                      break;
1737                  }
1738                  case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
1739                  {
1740                      CmSurface3DRT* surf3D = static_cast<CmSurface3DRT*>(surface);
1741 
1742                      surf3D->GetHandle(handle);
1743 
1744                      surfaces[i] = handle;
1745                      surfIndexArray[i] = (uint16_t)surfIndexData;
1746 
1747                      value = surfaces;
1748                      surfIndexValue = surfIndexArray;
1749 
1750                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1751                      m_args[index].unitSize = (uint16_t)size;
1752 
1753                      if (m_args[index].unitKind == ARG_KIND_SURFACE) // first time
1754                      {
1755                          m_args[index].unitKind = ARG_KIND_SURFACE_3D;
1756                      }
1757                      else if (m_args[index].unitKind != ARG_KIND_SURFACE_3D)
1758                      {
1759                          CM_ASSERTMESSAGE("Error: Assign a 3D surface to the arg which is previously assigned 1D surface, 2D surface or VME surface");
1760                          return CM_INVALID_ARG_VALUE;
1761                      }
1762                      break;
1763                  }
1764 
1765                  case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER:
1766                  {
1767                      CmStateBuffer* stateBuffer = static_cast< CmStateBuffer* >( surface );
1768                      stateBuffer->GetHandle( handle );
1769 
1770                      surfaces[ i ] = handle;
1771                      surfIndexArray[ i ] = ( uint16_t )surfIndexData;
1772 
1773                      value = surfaces;
1774                      surfIndexValue = surfIndexArray;
1775 
1776                      size = ( size / sizeof( SurfaceIndex ) ) * sizeof( uint32_t );
1777                      m_args[ index ].unitSize = ( uint16_t )size;
1778 
1779                      if ( m_args[ index ].unitKind == ARG_KIND_SURFACE ) // first time
1780                      {
1781                          m_args[ index ].unitKind = ARG_KIND_STATE_BUFFER;
1782                      }
1783                      else if ( m_args[ index ].unitKind != ARG_KIND_STATE_BUFFER )
1784                      {
1785                          CM_ASSERTMESSAGE( "Error: Assign a state buffer to the arg which is previously assigned 1D surface, 2D surface, 3D surface or VME surface" );
1786                          return CM_INVALID_ARG_VALUE;
1787                      }
1788                      break;
1789                  }
1790 
1791                  case CM_ENUM_CLASS_TYPE_CMSURFACEVME:
1792                  {
1793                      return SetArgsVme(nArgType, index, value, nThreadID);
1794                  }
1795                  case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
1796                  {
1797                      CmSurfaceSampler8x8* surfSampler8x8 = static_cast <CmSurfaceSampler8x8 *> (surface);
1798                      surfSampler8x8->GetIndexCurrent(samplerIndex);
1799                      surfSampler8x8->GetCmIndex(samplerCmIndex);
1800                      if (samplerCmIndex > surfaceArraySize)
1801                      {
1802                          m_args[index].aliasIndex = samplerCmIndex;
1803                          m_args[index].aliasCreated = true;
1804                          samplerCmIndex %= surfaceArraySize;
1805                      }
1806 
1807                      m_surfaceMgr->GetSurface(samplerCmIndex, surface);
1808                      if (!surface)
1809                      {
1810                          CM_ASSERTMESSAGE("Error: Invalid sampler8x8 surface.");
1811                          return CM_FAILURE;
1812                      }
1813 
1814                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1815                      m_args[index].unitSize = (uint16_t)size;
1816 
1817                      value = &samplerIndex;
1818                      surfIndexValue = &samplerCmIndex;
1819 
1820                      if (m_args[index].unitKind == ARG_KIND_SURFACE)
1821                      {
1822                          if (surfSampler8x8->GetSampler8x8SurfaceType() == CM_VA_SURFACE)
1823                          {
1824                              m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
1825                              m_args[index].nCustomValue = surfSampler8x8->GetAddressControlMode();
1826                          }
1827                          else
1828                          {
1829                              m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
1830                          }
1831                      }
1832                      else if (m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER8X8_AVS &&
1833                          m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER8X8_VA)
1834                      {
1835                          CM_ASSERTMESSAGE("Error: Assign a Sampler8x8 surface to the arg which is previously 2D surface.");
1836                          return CM_FAILURE;
1837                      }
1838                      break;
1839                  }
1840                  case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
1841                  {
1842                      CmSurfaceSampler* surfSampler = static_cast <CmSurfaceSampler *> (surface);
1843                      surfSampler->GetHandle(samplerIndex);
1844                      surfSampler->GetCmIndexCurrent(samplerCmIndex);
1845 
1846                      m_surfaceMgr->GetSurface(samplerCmIndex, surface);
1847                      if (!surface)
1848                      {
1849                          CM_ASSERTMESSAGE("Error: Invalid sampler surface.");
1850                          return CM_FAILURE;
1851                      }
1852 
1853                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1854                      m_args[index].unitSize = (uint16_t)size;
1855 
1856                      value = &samplerIndex;
1857                      surfIndexValue = &samplerCmIndex;
1858 
1859                      if (m_args[index].unitKind == ARG_KIND_SURFACE)
1860                      {   // first time
1861                          SAMPLER_SURFACE_TYPE type;
1862                          surfSampler->GetSurfaceType(type);
1863                          if (type == SAMPLER_SURFACE_TYPE_2D)
1864                          {
1865                              m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER;
1866                          }
1867                          else if (type == SAMPLER_SURFACE_TYPE_2DUP)
1868                          {
1869                              m_args[index].unitKind = ARG_KIND_SURFACE2DUP_SAMPLER;
1870                          }
1871                          else
1872                          {
1873                              m_args[index].unitKind = ARG_KIND_SURFACE_3D;
1874                          }
1875 
1876                      }
1877                      else if ((m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER) &&
1878                          m_args[index].unitKind != ARG_KIND_SURFACE2DUP_SAMPLER &&
1879                          (m_args[index].unitKind != ARG_KIND_SURFACE_3D))
1880                      {
1881                          CM_ASSERTMESSAGE("Error: Assign a Sampler surface to the arg which is previously 2D/3D surface.");
1882                          return CM_FAILURE;
1883                      }
1884                      break;
1885                  }
1886                  default:
1887                  {
1888                      CM_ASSERTMESSAGE("Error: Invalid surface type.");
1889                      return CM_INVALID_ARG_VALUE;
1890                  }
1891             }
1892         }
1893     }
1894     else if (m_args[index].unitKind == ARG_KIND_SAMPLER)
1895     {
1896         unsigned int numSamplers = m_args[index].unitSize / sizeof(int);
1897 
1898         if (numSamplers > 1)
1899         {
1900             size = numSamplers * sizeof(unsigned int);
1901 
1902             for (unsigned int i = 0; i < numSamplers; i++)
1903             {
1904                 SamplerIndex* samplerIndex = (SamplerIndex*)value + i;
1905                 samplerIdx = samplerIndex->get_data();
1906                 sampler_index_array.push_back(samplerIdx);
1907             }
1908         }
1909         else
1910         {
1911             SamplerIndex* samplerIndex = (SamplerIndex*)value;
1912             samplerIdx = ((SamplerIndex*)value)->get_data();
1913             size = sizeof(unsigned int);
1914             m_args[index].unitSize = (uint16_t)size;
1915             value = &samplerIdx;
1916         }
1917     }
1918 
1919 finish:
1920     if ( nArgType == CM_KERNEL_INTERNEL_ARG_PERKERNEL ) // per kernel arg
1921     {
1922         CM_ARG& arg = m_args[ index ];
1923 
1924         // Assume from now on, size is valid, i.e. confirmed with function signature
1925         if( !arg.value )
1926         {
1927             //Increment size kernel arguments will take up in CURBE
1928             uint32_t tempUnitSize = m_args[ index ].unitSize;
1929             if( (m_args[index].unitKind == ARG_KIND_SURFACE_VME ) ||
1930                 (m_args[index].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
1931                 (m_args[index].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ))
1932             {
1933                 tempUnitSize = CM_ARGUMENT_SURFACE_SIZE;
1934             }
1935 
1936             // first setKernelArg or first setKernelArg after each enqueue
1937             arg.value = MOS_NewArray(uint8_t,size);
1938             if( !arg.value )
1939             {
1940                 CM_ASSERTMESSAGE("Error: Out of system memory.");
1941                 return CM_OUT_OF_HOST_MEMORY;
1942             }
1943 
1944             arg.unitCount = 1;
1945 
1946             CmSafeMemCopy((void *)arg.value, value, size);
1947 
1948             if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
1949                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
1950                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
1951                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
1952                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
1953                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
1954                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
1955                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
1956                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
1957                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
1958                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
1959                  ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
1960             {
1961                 arg.surfIndex = MOS_NewArray(uint16_t, (size / sizeof(int32_t)));
1962                 if (!arg.surfIndex)
1963                 {
1964                     CM_ASSERTMESSAGE("Error: Out of system memory.");
1965                     MosSafeDeleteArray(arg.value);
1966                     return CM_OUT_OF_HOST_MEMORY;
1967                 }
1968                 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(int32_t) * sizeof(uint16_t));
1969                 if( surfIndexValue == nullptr )
1970                 {
1971                     CM_ASSERTMESSAGE("Error: Pointer to surface index value is null.");
1972                     return CM_NULL_POINTER;
1973                 }
1974                 CmSafeMemCopy((void *)arg.surfIndex, surfIndexValue, size / sizeof(int32_t) * sizeof(uint16_t));
1975             }
1976 
1977             if (m_args[index].unitKind == ARG_KIND_SAMPLER)
1978             {
1979                 for (unsigned int samplerIndex = 0; samplerIndex < sampler_index_array.size(); samplerIndex++)
1980                 {
1981                     *( (int *)arg.value + samplerIndex) = sampler_index_array[samplerIndex];
1982                 }
1983             }
1984 
1985             m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1986             arg.isDirty = true;
1987         }
1988         else
1989         {
1990             if( arg.unitCount != 1 )
1991             {
1992                 CM_ASSERTMESSAGE("Error: Invalid arg count.");
1993                 return CM_FAILURE;
1994             }
1995             if( memcmp( (void *)arg.value, value, size ) != 0 )
1996             {
1997                 CmSafeMemCopy((void *)arg.value, value, size);
1998                 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1999                 arg.isDirty = true;
2000             }
2001             if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2002              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2003              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2004              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2005              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2006              ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2007              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2008              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2009              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2010              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2011              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2012              ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2013             {
2014                 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(int32_t) * sizeof(uint16_t));
2015                 if( surfIndexValue == nullptr )
2016                 {
2017                     CM_ASSERTMESSAGE("Error: Pointer to surface index value is null.");
2018                     return CM_NULL_POINTER;
2019                 }
2020                 CmSafeMemCopy((void *)arg.surfIndex, surfIndexValue, size/sizeof(int32_t) * sizeof(uint16_t));
2021             }
2022 
2023             if (m_args[index].unitKind == ARG_KIND_SAMPLER)
2024             {
2025                 for (unsigned int samplerIndex = 0; samplerIndex < sampler_index_array.size(); samplerIndex++)
2026                 {
2027                     *((int *)arg.value + samplerIndex) = sampler_index_array[samplerIndex];
2028                 }
2029             }
2030         }
2031 
2032         m_perKernelArgExists = true;
2033     }
2034     else //per thread arg
2035     {
2036         CM_ARG& arg = m_args[ index ];
2037 
2038         // Assume from now on, size is valid, i.e. confirmed with function signature
2039         if( !arg.value )
2040         {
2041             //Increment size per-thread arguments will take up in payload of media object or media object walker commands
2042             m_sizeInPayload += arg.unitSize;
2043             DW_ALIGNMENT(m_sizeInPayload);
2044 
2045             // first setThreadArg or first setThreadArg after each enqueue
2046             arg.value = MOS_NewArray(uint8_t, (size * m_threadCount));
2047             if( !arg.value )
2048             {
2049                 CM_ASSERTMESSAGE("Error: Out of system memory.");
2050                 return CM_OUT_OF_HOST_MEMORY;
2051 
2052             }
2053             arg.unitCount = m_threadCount;
2054 
2055             uint32_t offset = size * nThreadID;
2056             uint8_t *threadValue = ( uint8_t *)arg.value;
2057             threadValue += offset;
2058 
2059             CmSafeMemCopy(threadValue, value, size);
2060             if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2061                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2062                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2063                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2064                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2065                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2066                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2067                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2068                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2069                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2070                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2071                  ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2072             {
2073                 arg.surfIndex = MOS_NewArray(uint16_t, (size / sizeof(uint32_t) * m_threadCount));
2074                 if( !arg.surfIndex )
2075                 {
2076                     CM_ASSERTMESSAGE("Error: Out of system memory.");
2077                     MosSafeDeleteArray(arg.value);
2078                     return CM_OUT_OF_HOST_MEMORY;
2079                 }
2080                 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(uint32_t) * sizeof(uint16_t) * m_threadCount);
2081                 if( surfIndexValue == nullptr )
2082                 {
2083                     CM_ASSERTMESSAGE("Error: Pointer to surface index value is null.");
2084                     return CM_NULL_POINTER;
2085                 }
2086                 CmSafeMemCopy((void *)(arg.surfIndex + size/sizeof(uint32_t)  * nThreadID), surfIndexValue, size/sizeof(uint32_t) * sizeof(uint16_t));
2087             }
2088             m_perThreadArgExists = true;
2089         }
2090         else
2091         {
2092             if( arg.unitCount != m_threadCount )
2093             {
2094                 CM_ASSERTMESSAGE("Error: arg count is not matched with thread count.");
2095                 return CM_FAILURE;
2096 
2097             }
2098             uint32_t offset = size * nThreadID;
2099             uint8_t *threadValue = ( uint8_t *)arg.value;
2100             threadValue += offset;
2101 
2102             if( memcmp( threadValue, value, size ) != 0 )
2103             {
2104                 CmSafeMemCopy(threadValue, value, size);
2105                 m_dirty |= CM_KERNEL_DATA_THREAD_ARG_DIRTY;
2106                 arg.isDirty = true;
2107             }
2108             if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2109                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2110                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2111                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2112                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2113                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2114                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2115                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2116                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2117                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2118                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2119                  ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2120             {
2121                 if( surfIndexValue == nullptr )
2122                 {
2123                     CM_ASSERTMESSAGE("Error: Pointer to surface index value is null.");
2124                     return CM_NULL_POINTER;
2125                 }
2126                 CmSafeMemCopy((void *)(arg.surfIndex + size/sizeof(uint32_t)  * nThreadID), surfIndexValue, size/sizeof(uint32_t) * sizeof(uint16_t));
2127             }
2128         }
2129     }
2130 
2131     m_args[index].isSet = true;
2132 
2133     return CM_SUCCESS;
2134 }
2135 
2136 //*-----------------------------------------------------------------------------
2137 //! Set per kernel arguments. The total size of all per kernel arguments and per thread
2138 //! arguments should be less than or equal to 256 Bytes (CM_MAX_ARG_SIZE_IN_BYTE).
2139 //! The life time of all per kernel arguments and per thread lasts untill the next enqueue
2140 //! i.e. after enqueue, ALL arguments need to be reset.
2141 //! INPUT:
2142 //!     1) Index of argument in CM kernel function (genx_main). The index is
2143 //!        global for per kernel arguments and per thread arguments.
2144 //!     2) Size of the argument.
2145 //!     3) Pointer to argument value.
2146 //! OUTPUT:
2147 //!     CM_SUCCESS or
2148 //!     CM_INVALID_ARG_INDEX if index is invalid;
2149 //!     CM_INVALID_ARG_SIZE if size is invalid;
2150 //!     CM_INVALID_ARG_VALUE if value is NULL.
2151 //*-----------------------------------------------------------------------------
SetKernelArg(uint32_t index,size_t size,const void * value)2152 CM_RT_API int32_t CmKernelRT::SetKernelArg(uint32_t index, size_t size, const void * value )
2153 {
2154     INSERT_API_CALL_LOG();
2155     //It should be mutual exclusive with Indirect Data
2156     if(m_kernelPayloadData)
2157     {
2158         CM_ASSERTMESSAGE("Error: SetKernelArg should be mutual exclusive with indirect data.");
2159         return CM_KERNELPAYLOAD_PERKERNELARG_MUTEX_FAIL;
2160     }
2161 
2162     if( index >= m_argCount )
2163     {
2164         CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
2165         return CM_INVALID_ARG_INDEX;
2166 
2167     }
2168 
2169     if( !value)
2170     {
2171         CM_ASSERTMESSAGE("Error: Invalid kernel arg value.");
2172         return CM_INVALID_ARG_VALUE;
2173     }
2174 
2175     if( size == 0)
2176     {
2177         CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
2178         return CM_INVALID_ARG_SIZE;
2179     }
2180 
2181     int32_t nRetVal = 0;
2182     if ( ( nRetVal = SetArgsInternal( CM_KERNEL_INTERNEL_ARG_PERKERNEL, index, size, value ) ) != CM_SUCCESS )
2183     {
2184         return nRetVal;
2185     }
2186 
2187     return CM_SUCCESS;
2188 }
2189 
SetKernelArgPointer(uint32_t index,size_t size,const void * value)2190 CM_RT_API int32_t CmKernelRT::SetKernelArgPointer(uint32_t index, size_t size, const void *value)
2191 {
2192     INSERT_API_CALL_LOG();
2193 
2194     //It should be mutual exclusive with Indirect Data
2195     if (m_kernelPayloadData)
2196     {
2197         CM_ASSERTMESSAGE("Error: SetKernelArg should be mutual exclusive with indirect data.");
2198         return CM_KERNELPAYLOAD_PERKERNELARG_MUTEX_FAIL;
2199     }
2200 
2201     if (index >= m_argCount)
2202     {
2203         CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
2204         return CM_INVALID_ARG_INDEX;
2205     }
2206 
2207     if (!value)
2208     {
2209         CM_ASSERTMESSAGE("Error: Invalid kernel arg value.");
2210         return CM_INVALID_ARG_VALUE;
2211     }
2212 
2213     uint64_t *argValue = MOS_NewArray(uint64_t, 1);
2214     if (!argValue)
2215     {
2216         CM_ASSERTMESSAGE("Error: Out of system memory.");
2217         return CM_OUT_OF_HOST_MEMORY;
2218     }
2219     CmSafeMemSet(argValue, 0, sizeof(uint64_t));
2220     CmSafeMemCopy(argValue, value, size);
2221 
2222     // Get the gfx start address of SVM/stateless buffer.
2223     uint64_t gfxAddress = *(argValue);
2224     MosSafeDeleteArray(argValue);
2225 
2226     // Check the gfx start address is valid or not
2227     std::set<CmSurface *> statelessSurfArray = m_surfaceMgr->GetStatelessSurfaceArray();
2228     bool valid = false;
2229     for(auto surface : statelessSurfArray)
2230     {
2231         CmBuffer_RT *buffer = static_cast<CmBuffer_RT *>(surface);
2232         uint64_t startAddress = 0;
2233         buffer->GetGfxAddress(startAddress);
2234         size_t size = buffer->GetSize();
2235 
2236         if (gfxAddress >= startAddress
2237             && gfxAddress < (startAddress + size))
2238         {
2239             SurfaceIndex *surfIndex = nullptr;
2240             buffer->GetIndex(surfIndex);
2241             uint32_t surfIndexData = surfIndex->get_data();
2242             m_surfaceArray[surfIndexData] = true;
2243 
2244             m_args[index].isStatelessBuffer = true;
2245             m_args[index].index = (uint16_t)surfIndexData;
2246 
2247             valid = true;
2248             break;
2249         }
2250     }
2251     if (!valid)
2252     {
2253         CM_ASSERTMESSAGE("Error: the kernel arg pointer is not valid.");
2254         return CM_INVALID_KERNEL_ARG_POINTER;
2255     }
2256 
2257     int32_t nRetVal = SetArgsInternal(CM_KERNEL_INTERNEL_ARG_PERKERNEL,
2258                                       index,
2259                                       size,
2260                                       value);
2261     if (nRetVal != CM_SUCCESS)
2262     {
2263         return nRetVal;
2264     }
2265 
2266     return CM_SUCCESS;
2267 }
2268 
2269 //*-----------------------------------------------------------------------------
2270 //| Purpose:   Set Static Buffer
2271 //| Return :   The result of operation
2272 //*-----------------------------------------------------------------------------
SetStaticBuffer(uint32_t index,const void * value)2273 CM_RT_API int32_t CmKernelRT::SetStaticBuffer(uint32_t index, const void * value )
2274 {
2275     INSERT_API_CALL_LOG();
2276     if(index >= CM_GLOBAL_SURFACE_NUMBER)
2277     {
2278         CM_ASSERTMESSAGE("Error: Surface Index exceeds max global surface number.");
2279         return CM_INVALID_GLOBAL_BUFFER_INDEX;
2280     }
2281 
2282     if(!value)
2283     {
2284         CM_ASSERTMESSAGE("Error: Invalid StaticBuffer arg value.");
2285         return CM_INVALID_BUFFER_HANDLER;
2286     }
2287 
2288     SurfaceIndex* surfIndex = (SurfaceIndex* )value;
2289     uint32_t surfIndexData = surfIndex->get_data();
2290     if (surfIndexData >= m_surfaceMgr->GetSurfacePoolSize())
2291     {
2292         CM_ASSERTMESSAGE("Error: StaticBuffer doesn't allow alias index.");
2293         return CM_INVALID_ARG_INDEX;
2294     }
2295 
2296     CmSurface* surface  = nullptr;
2297     m_surfaceMgr->GetSurface( surfIndexData, surface );
2298     if(surface == nullptr)
2299     {
2300         CM_ASSERTMESSAGE("Error: Invalid surface.");
2301         return CM_INVALID_BUFFER_HANDLER;
2302     }
2303 
2304     CmBuffer_RT* surf1D = nullptr;
2305     if ( surface->Type() == CM_ENUM_CLASS_TYPE_CMBUFFER_RT )
2306     {
2307         uint32_t handle = 0; // for 1D surf
2308 
2309         surf1D = static_cast< CmBuffer_RT* >( surface );
2310         surf1D->GetHandle( handle );
2311 
2312         if (m_globalSurfaces[index] == nullptr)
2313         {
2314             m_globalSurfaces[index] = MOS_New(SurfaceIndex,0);
2315             if( !m_globalSurfaces[index] )
2316             {
2317                 CM_ASSERTMESSAGE("Error: Out of system memory.");
2318                 return CM_OUT_OF_HOST_MEMORY;
2319             }
2320         }
2321         *m_globalSurfaces[index] = handle;
2322         m_globalCmIndex[index] = surfIndexData;
2323         m_dirty |= CM_KERNEL_DATA_GLOBAL_SURFACE_DIRTY;
2324     }
2325     else
2326     {
2327         CM_ASSERTMESSAGE("Error: StaticBuffer only supports CmBuffer type.");
2328          return CM_INVALID_BUFFER_HANDLER;
2329     }
2330     return CM_SUCCESS;
2331 }
2332 
2333 //*-----------------------------------------------------------------------------
2334 //! Set per thread arguments. The total size of all per kernel arguments and per thread
2335 //! arguments should be less than or equal to 256 Bytes
2336 //! The life time of all per kernel arguments and per thread lasts untill the next enqueue
2337 //! i.e. after enqueue, ALL arguments need to be reset.
2338 //! INPUT:
2339 //!     1) Thread index.
2340 //!     2) Index of argument in CM kernel function (genx_main). The index is
2341 //!        global for per kernel arguments and per thread arguments.
2342 //!     3) Size of the argument.
2343 //!     4) Pointer to argument .
2344 //! OUTPUT:
2345 //!     CM_SUCCESS or
2346 //!     CM_INVALID_ARG_INDEX if index is invalid
2347 //!     CM_INVALID_ARG_SIZE if size is invalid
2348 //!     CM_INVALID_ARG_VALUE if value is nullptr
2349 //*-----------------------------------------------------------------------------
SetThreadArg(uint32_t threadId,uint32_t index,size_t size,const void * value)2350 CM_RT_API int32_t CmKernelRT::SetThreadArg(uint32_t threadId, uint32_t index, size_t size, const void * value )
2351 {
2352     INSERT_API_CALL_LOG();
2353 
2354     //It should be mutual exclusive with Indirect Data
2355     if(m_kernelPayloadData)
2356     {
2357         CM_ASSERTMESSAGE("Error: SetThredArg should be mutual exclusive with indirect data.");
2358         return CM_KERNELPAYLOAD_PERTHREADARG_MUTEX_FAIL;
2359     }
2360 
2361     if(m_threadCount > m_halMaxValues->maxUserThreadsPerTask || m_threadCount <=0)
2362     {
2363         CM_ASSERTMESSAGE("Error: Minimum or Maximum number of threads exceeded.");
2364         return CM_FAILURE;
2365     }
2366 
2367     if( index >= m_argCount )
2368     {
2369         CM_ASSERTMESSAGE("Error: Invalid thread arg count.");
2370         return CM_INVALID_ARG_INDEX;
2371 
2372     }
2373 
2374     if( threadId >= m_threadCount )
2375     {
2376         CM_ASSERTMESSAGE("Error: thread id exceeds the threadcount.");
2377         return CM_INVALID_THREAD_INDEX;
2378 
2379     }
2380 
2381     if( !value)
2382     {
2383         CM_ASSERTMESSAGE("Error: Invalid thread arg value.");
2384         return CM_INVALID_ARG_VALUE;
2385     }
2386 
2387     if( size == 0)
2388     {
2389         CM_ASSERTMESSAGE("Error: Invalid thread arg size.");
2390         return CM_INVALID_ARG_SIZE;
2391     }
2392 
2393     int32_t nRetVal = 0;
2394     if ( ( nRetVal = SetArgsInternal( CM_KERNEL_INTERNEL_ARG_PERTHREAD, index, size, value, threadId ) ) != CM_SUCCESS )
2395     {
2396         return nRetVal;
2397     }
2398 
2399     return CM_SUCCESS;
2400 }
2401 
2402 //*-----------------------------------------------------------------------------
2403 //| Purpose:  Calculate the total size of kernel data
2404 //*-----------------------------------------------------------------------------
CalcKernelDataSize(uint32_t movInstNum,uint32_t numArgs,uint32_t argSize,uint32_t & totalKernelDataSize)2405 int32_t CmKernelRT::CalcKernelDataSize(
2406                 uint32_t movInstNum,                 // [in] the number of move instructions
2407                 uint32_t numArgs,                   // [in] number of args , surface array count
2408                 uint32_t argSize,                   // [in] Size of arguments
2409                 uint32_t & totalKernelDataSize)      // [out] total size of kernel data
2410 {
2411     int32_t hr             = CM_SUCCESS;
2412 
2413     uint32_t headSize = ( KERNEL_INFO_SIZE_IN_DWORD + numArgs * PER_ARG_SIZE_IN_DWORD ) * sizeof( uint32_t );
2414     uint32_t totalSize =  headSize + movInstNum * CM_MOVE_INSTRUCTION_SIZE + m_binarySize + argSize;
2415 
2416     totalSize += 4; // one dword for flag. the first bit is curbe on/off
2417     totalSize += 8; //sizeof( uint64_t ) for id
2418 
2419     totalSize += 16; // static buffer indices
2420     totalSize += 12; // GT Pin buffer indices
2421 
2422     ////////////////////////////////////////////////////////////////////////////
2423     // Calculate indirect data size (start)
2424     ////////////////////////////////////////////////////////////////////////////
2425     // Memory layout for indirect data:
2426     // Indirect Data Size -------------------- 2 bytes (must present)
2427     // Below area is present only if above value is not ZERO
2428     // Indirect Data Buffer ------------------ Size indicated above
2429     totalSize += sizeof(uint16_t);  //field for indirect data size
2430     if(m_usKernelPayloadDataSize)
2431     {
2432         totalSize += m_usKernelPayloadDataSize;
2433     }
2434     // Memory layout for indirect surface:
2435     // Indirect Surface Count ----------------- 2 bytes (must present)
2436     // Below are present only if the above value is not ZERO
2437     // Kind of Indirect Surface 0 ------------- 2 Bytes
2438     // Handle of Indirect Surface 0 ----------- 2 Bytes
2439     // Surface Index of Indirect Surface 0 ---- 2 Bytes
2440     // ..........
2441     // Kind of Indirect Surface n-1 ----------- 2 Bytes
2442     // Handle of Indirect Surface n-1---------- 2 Bytes
2443     // Surface Index of Indirect Surface n-1 -- 2 Bytes
2444     totalSize +=  sizeof(uint16_t); //field for indirect surface count
2445     if(m_usKernelPayloadSurfaceCount)
2446     {
2447         totalSize +=  m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO);
2448     }
2449 
2450     totalKernelDataSize = totalSize;
2451 
2452     return hr;
2453 }
2454 
2455 //*-----------------------------------------------------------------------------
2456 //| Purpose:   Create mov instructions
2457 //|            instructions will be copied into DstMem
2458 //*-----------------------------------------------------------------------------
CreateMovInstructions(uint32_t & movInstNum,uint8_t * & codeDst,CM_ARG * tempArgs,uint32_t numArgs)2459 int32_t CmKernelRT::CreateMovInstructions( uint32_t &movInstNum, uint8_t *&codeDst, CM_ARG* tempArgs, uint32_t numArgs)
2460 {
2461     //Create Mov Instruction
2462     CmDynamicArray      movInsts( numArgs );
2463     uint32_t renderGen = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState->platform.eRenderCoreFamily;
2464     CM_RETURN_CODE ret = m_movInstConstructor->SetInstDistanceConfig(movInsts.GetMaxSize(), renderGen);
2465     if (ret != CM_SUCCESS && ret != CM_NOT_IMPLEMENTED)
2466     {
2467         return ret;
2468     }
2469 
2470     movInstNum = 0;
2471 
2472     //Note: if no thread arg and no per kernel arg, no need move instrcutions at all.
2473     if( m_curbeEnabled && (m_perThreadArgExists || m_perKernelArgExists))
2474     {
2475         if( ( m_argCount > 0 ) && ( m_threadCount > 1) )
2476         {
2477             PCM_ARG* sortedArgs = MOS_NewArray(PCM_ARG,numArgs);
2478             if( !sortedArgs )
2479             {
2480                 CM_ASSERTMESSAGE("Error: Out of system memory.");
2481                 return CM_OUT_OF_HOST_MEMORY;
2482             }
2483             for( uint32_t j = 0; j < numArgs; j++ )
2484             {
2485                 sortedArgs[ j ] = tempArgs + j;
2486             }
2487             // sort arg to sortedArgs accorind to offsetinPayload
2488             QuickSort( sortedArgs, 0, numArgs - 1 );
2489 
2490             // record compiler generated offset, used as move dst later
2491             uint16_t *unitOffsetInPayloadSorted = MOS_NewArray(uint16_t, numArgs);
2492             if( !unitOffsetInPayloadSorted )
2493             {
2494                 CM_ASSERTMESSAGE("Error: Out of system memory.");
2495                 MosSafeDeleteArray(sortedArgs);
2496                 return CM_OUT_OF_HOST_MEMORY;
2497             }
2498             for( uint32_t j = 0; j < numArgs; j++ )
2499             {
2500                 unitOffsetInPayloadSorted[j] = sortedArgs[j]->unitOffsetInPayload;
2501             }
2502 
2503             uint16_t kernelArgEnd = 32;
2504             bool beforeFirstThreadArg = true;
2505             for( uint32_t j = 0; j < numArgs; j++ )
2506             {
2507                 if( sortedArgs[j]->unitCount == 1 )
2508                     // consider m_threadCount = 1 case later, where all args are treated as per thread arg
2509                 {
2510                     if( beforeFirstThreadArg )
2511                     {
2512                         kernelArgEnd = sortedArgs[j]->unitOffsetInPayload + sortedArgs[j]->unitSize;
2513                     }
2514                     else
2515                     {
2516                         DW_ALIGNMENT( kernelArgEnd ); // necessary ?
2517                         sortedArgs[j]->unitOffsetInPayload = kernelArgEnd;
2518                         kernelArgEnd += sortedArgs[j]->unitSize;
2519                     }
2520                 }
2521                 else // per thread
2522                 {
2523                     if( beforeFirstThreadArg )
2524                     {
2525                         beforeFirstThreadArg = false;
2526                     }
2527                 }
2528             }
2529 
2530             GRF_ALIGNMENT(kernelArgEnd); // offset of thread arg start related to R0
2531             uint32_t threadArgStart = kernelArgEnd;
2532 
2533             for (uint32_t j = 0; j < numArgs; j++)
2534             {
2535                 if (sortedArgs[j]->unitCount > 1) // per thread
2536                 {
2537                     sortedArgs[j]->unitOffsetInPayload = (uint16_t)threadArgStart;
2538                     threadArgStart += sortedArgs[j]->unitSize;
2539                     DW_ALIGNMENT(threadArgStart);
2540                 }
2541             }
2542 
2543             bool needMovInstructions = false;
2544             for( uint32_t j = 0; j < numArgs; j++ )
2545             {
2546                 if ( unitOffsetInPayloadSorted[j] != sortedArgs[j]->unitOffsetInPayload )
2547                 {
2548                     needMovInstructions = true;
2549                     break;
2550                 }
2551             }
2552 
2553             if (needMovInstructions)
2554             {
2555                 // Add move
2556                 GRF_ALIGNMENT(threadArgStart);
2557                 uint32_t threadArgEnd = threadArgStart;
2558                 uint32_t size = threadArgEnd - 32;
2559                 CM_ASSERT((size % 32) == 0);
2560 
2561                 // move all arguments starting from R1 (32 ) through threadArgEnd to R64 (R0 reserved for media dispatch)
2562                 uint32_t nextIndex = 0;
2563                 nextIndex += m_movInstConstructor->ConstructObjMovs(R64_OFFSET, 32, size, movInsts, nextIndex, true, m_blhwDebugEnable);
2564 
2565                 beforeFirstThreadArg = true;
2566                 for (uint32_t j = 0; j < numArgs; j++)
2567                 {
2568                     if (sortedArgs[j]->unitCount == 1)
2569                         // consider m_threadCount = 1 case later, where all args are treated as per thread arg
2570                     {
2571                         if (beforeFirstThreadArg == false)
2572                         {
2573                             // add move inst to move from sortedArgs[j]->unitOffsetInPayload + R64 to unitOffsetInPayloadSorted[j]
2574                             nextIndex += m_movInstConstructor->ConstructObjMovs(unitOffsetInPayloadSorted[j],
2575                                 R64_OFFSET + sortedArgs[j]->unitOffsetInPayload - 32,
2576                                 sortedArgs[j]->unitSize, movInsts, nextIndex, true, m_blhwDebugEnable);
2577                         }
2578                     }
2579                     else // per thread
2580                     {
2581                         if (beforeFirstThreadArg)
2582                         {
2583                             beforeFirstThreadArg = false;
2584                         }
2585 
2586                         // add move inst to move from sortedArgs[j]->unitOffsetInPayload + R64 to unitOffsetInPayloadSorted[j]
2587                         nextIndex += m_movInstConstructor->ConstructObjMovs(unitOffsetInPayloadSorted[j],
2588                             R64_OFFSET + sortedArgs[j]->unitOffsetInPayload - CM_PAYLOAD_OFFSET,
2589                             sortedArgs[j]->unitSize, movInsts, nextIndex, true, m_blhwDebugEnable);
2590                     }
2591                 }
2592 
2593                 movInstNum = nextIndex;
2594             }
2595 
2596             MosSafeDeleteArray(sortedArgs);
2597             MosSafeDeleteArray(unitOffsetInPayloadSorted);
2598         }
2599     }// End of if( m_curbeEnabled && m_ThreadArgExists)
2600 
2601     uint32_t addInstDW[4];
2602     MOS_ZeroMemory(addInstDW, CM_MOVE_INSTRUCTION_SIZE);
2603     uint32_t addInstNum =0;
2604 
2605     if(m_threadSpace && m_adjustScoreboardY)
2606     {
2607         addInstNum = 1;
2608 
2609         addInstDW[0] = CM_BDW_ADJUST_Y_SCOREBOARD_DW0;
2610         addInstDW[1] = CM_BDW_ADJUST_Y_SCOREBOARD_DW1;
2611         addInstDW[2] = CM_BDW_ADJUST_Y_SCOREBOARD_DW2;
2612 
2613         // constant word needs high 16 bits to be same as low 16 bits
2614         uint16_t tmp = - (int32_t)(m_adjustScoreboardY);
2615         addInstDW[3] = (tmp << 16) + tmp;
2616 
2617     }
2618 
2619     if (movInstNum || addInstNum)
2620     {
2621         codeDst = MOS_NewArray(uint8_t, ((movInstNum + addInstNum)  * CM_MOVE_INSTRUCTION_SIZE));
2622         if (!codeDst)
2623         {
2624             return CM_OUT_OF_HOST_MEMORY;
2625         }
2626     }
2627 
2628     for( uint32_t j = 0; j < movInstNum; j ++ )
2629     {
2630         MovInst_RT* movInst = (MovInst_RT*)movInsts.GetElement( j );
2631         if (!movInst)
2632         {
2633             CM_ASSERTMESSAGE("Error: Invalid move instructions.");
2634             MosSafeDeleteArray(codeDst);
2635             return CM_FAILURE;
2636         }
2637         if (j != 0)
2638         {
2639             movInst->ClearDebug();
2640         }
2641         CmSafeMemCopy(codeDst + j * CM_MOVE_INSTRUCTION_SIZE, movInst->GetBinary(), CM_MOVE_INSTRUCTION_SIZE);
2642         CmSafeDelete(movInst); // delete each element in movInsts
2643     }
2644     movInsts.Delete();
2645 
2646     if(addInstNum != 0)
2647     {
2648        CmSafeMemCopy(codeDst + movInstNum * CM_MOVE_INSTRUCTION_SIZE, addInstDW, CM_MOVE_INSTRUCTION_SIZE);
2649 
2650        movInstNum += addInstNum; // take add Y instruction into consideration
2651     }
2652 
2653     return CM_SUCCESS;
2654 }
2655 
CreateKernelArgDataGroup(uint8_t * & data,uint32_t value)2656 int32_t CmKernelRT::CreateKernelArgDataGroup(
2657     uint8_t   *&data,
2658     uint32_t   value)
2659 {
2660     if (data == nullptr)
2661     {
2662         data = MOS_NewArray(uint8_t, sizeof(uint32_t));
2663         if(!data)
2664         {
2665             return CM_OUT_OF_HOST_MEMORY;
2666         }
2667     }
2668     *(uint32_t *)data = value;
2669     return CM_SUCCESS;
2670 }
2671 
CreateKernelImplicitArgDataGroup(uint8_t * & data,uint32_t size)2672 int32_t CmKernelRT::CreateKernelImplicitArgDataGroup(
2673     uint8_t   *&data,
2674     uint32_t   size)
2675 {
2676     data = MOS_NewArray(uint8_t, (size * sizeof(uint32_t)));
2677     if (!data)
2678     {
2679         return CM_OUT_OF_HOST_MEMORY;
2680     }
2681     *(uint32_t *)data = 0;
2682     return CM_SUCCESS;
2683 }
2684 
2685 //*-----------------------------------------------------------------------------
2686 //| Purpose:   Create mov instructions
2687 //|            instructions will be copied into DstMem
2688 //*-----------------------------------------------------------------------------
CreateThreadArgData(PCM_HAL_KERNEL_ARG_PARAM kernelArg,uint32_t threadArgIndex,CmThreadSpaceRT * threadSpace,CM_ARG * cmArgs)2689 int32_t CmKernelRT::CreateThreadArgData(
2690     PCM_HAL_KERNEL_ARG_PARAM    kernelArg,
2691     uint32_t                    threadArgIndex,
2692     CmThreadSpaceRT*              threadSpace,
2693     CM_ARG*                     cmArgs )
2694 {
2695     int32_t         hr              = CM_SUCCESS;
2696     uint32_t        threadArgCount  = cmArgs[ threadArgIndex].unitCount;
2697     uint32_t        threadArgSize   = cmArgs[ threadArgIndex ].unitSize;
2698 
2699     if (CHECK_SURFACE_TYPE(cmArgs->unitKind,  ARG_KIND_SURFACE_VME))
2700     {
2701         // reallocate the memory since the number of surfaces in a vme surface could vary
2702         MosSafeDeleteArray(kernelArg->firstValue);
2703     }
2704 
2705     if( kernelArg->firstValue  == nullptr)
2706     {
2707         // if firstValue = nullptr, then create a new one, otherwise, update the exisitng one
2708         kernelArg->firstValue = MOS_NewArray(uint8_t, (cmArgs[threadArgIndex].unitCount * cmArgs[threadArgIndex].unitSize));
2709         if( !kernelArg->firstValue )
2710         {
2711             hr = CM_OUT_OF_HOST_MEMORY;
2712             goto finish;
2713         }
2714     }
2715 
2716     if(kernelArg->unitCount == 1 ) // kernel arg
2717     {
2718         if (cmArgs[threadArgIndex].value)
2719         {
2720             CmSafeMemCopy(kernelArg->firstValue, cmArgs[threadArgIndex].value, threadArgCount * threadArgSize);
2721         }
2722         goto finish;
2723     }
2724 
2725     if( threadSpace != nullptr )
2726     {
2727         CM_DEPENDENCY_PATTERN dependencyPatternType = CM_NONE_DEPENDENCY;
2728         threadSpace->GetDependencyPatternType(dependencyPatternType);
2729 
2730         if ((m_threadSpaceAssociated == true) &&  (dependencyPatternType != CM_NONE_DEPENDENCY))
2731         {
2732             CM_THREAD_SPACE_UNIT *threadSpaceUnit = nullptr;
2733             threadSpace->GetThreadSpaceUnit(threadSpaceUnit);
2734 
2735             uint32_t *boardOrder = nullptr;
2736             threadSpace->GetBoardOrder(boardOrder);
2737 
2738             for (uint32_t index = 0; index < threadArgCount; index++)
2739             {
2740                 uint32_t offset = threadSpaceUnit[boardOrder[index]].threadId;
2741                 uint8_t *argSrc = (uint8_t*)cmArgs[threadArgIndex].value + offset * threadArgSize;
2742                 uint8_t *argDst = kernelArg->firstValue + index * threadArgSize;
2743                 CmSafeMemCopy(argDst, argSrc, threadArgSize);
2744             }
2745         }
2746         else
2747         {
2748            CmSafeMemCopy(kernelArg->firstValue, cmArgs[ threadArgIndex ].value, threadArgCount * threadArgSize);
2749         }
2750     }
2751     else
2752     {
2753         CmSafeMemCopy(kernelArg->firstValue, cmArgs[ threadArgIndex ].value, threadArgCount * threadArgSize);
2754     }
2755 
2756 finish:
2757     return hr;
2758 }
2759 
2760 //*-----------------------------------------------------------------------------
2761 //| Purpose:   Sort thread space for scorboarding
2762 //*-----------------------------------------------------------------------------
SortThreadSpace(CmThreadSpaceRT * threadSpace)2763 int32_t CmKernelRT::SortThreadSpace( CmThreadSpaceRT*  threadSpace )
2764 {
2765     int32_t                   hr = CM_SUCCESS;
2766     CM_DEPENDENCY_PATTERN dependencyPatternType = CM_NONE_DEPENDENCY;
2767 
2768     CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpace);
2769 
2770     threadSpace->GetDependencyPatternType(dependencyPatternType);
2771 
2772     if(!threadSpace->IsThreadAssociated())
2773     {//Skip Sort if it is media walker
2774         return CM_SUCCESS;
2775     }
2776 
2777     if (threadSpace->CheckDependencyVectorsSet())
2778     {
2779         threadSpace->WavefrontDependencyVectors();
2780     }
2781     else
2782     {
2783         switch (dependencyPatternType)
2784         {
2785             case CM_WAVEFRONT:
2786                 threadSpace->Wavefront45Sequence();
2787                 break;
2788 
2789             case CM_WAVEFRONT26:
2790                 threadSpace->Wavefront26Sequence();
2791                 break;
2792 
2793             case CM_WAVEFRONT26Z:
2794                 threadSpace->Wavefront26ZSequence();
2795                 break;
2796 
2797             case CM_WAVEFRONT26ZI:
2798                 CM_26ZI_DISPATCH_PATTERN dispatchPattern;
2799                 threadSpace->Get26ZIDispatchPattern(dispatchPattern);
2800                 switch (dispatchPattern)
2801                 {
2802                 case VVERTICAL_HVERTICAL_26:
2803                     threadSpace->Wavefront26ZISeqVVHV26();
2804                     break;
2805                 case VVERTICAL_HHORIZONTAL_26:
2806                     threadSpace->Wavefront26ZISeqVVHH26();
2807                     break;
2808                 case VVERTICAL26_HHORIZONTAL26:
2809                     threadSpace->Wavefront26ZISeqVV26HH26();
2810                     break;
2811                 case VVERTICAL1X26_HHORIZONTAL1X26:
2812                     threadSpace->Wavefront26ZISeqVV1x26HH1x26();
2813                     break;
2814                 default:
2815                     threadSpace->Wavefront26ZISeqVVHV26();
2816                     break;
2817                 }
2818                 break;
2819 
2820             case CM_HORIZONTAL_WAVE:
2821                 threadSpace->HorizentalSequence();
2822                 break;
2823 
2824             case CM_VERTICAL_WAVE:
2825                 threadSpace->VerticalSequence();
2826                 break;
2827 
2828             case CM_NONE_DEPENDENCY:
2829             case CM_WAVEFRONT26X:
2830             case CM_WAVEFRONT26ZIG:
2831                 break;
2832 
2833             default:
2834                 CM_ASSERTMESSAGE("Error: Invalid thread dependency type.");
2835                 hr = CM_FAILURE;
2836                 break;
2837         }
2838     }
2839 
2840 finish:
2841     return hr;
2842 }
2843 
2844 //*-----------------------------------------------------------------------------
2845 //| Purpose:   Create temp args array with surface array broken down
2846 //|            instructions will be copied into DstMem
2847 //*-----------------------------------------------------------------------------
CreateTempArgs(uint32_t numArgs,CM_ARG * & tempArgs)2848 int32_t CmKernelRT::CreateTempArgs(
2849     uint32_t     numArgs,
2850     CM_ARG*      &tempArgs)
2851 {
2852     int32_t     hr              = CM_SUCCESS;
2853     int32_t     numSurfaces    = 0;
2854     int32_t     increasedArgs  = 0;
2855 
2856     if( numArgs < m_argCount || tempArgs != nullptr )
2857     {
2858         CM_ASSERTMESSAGE("Error: Invalid arg number or arg value.");
2859         hr = CM_FAILURE;
2860         goto finish;
2861     }
2862 
2863     tempArgs = MOS_NewArray(CM_ARG, numArgs);
2864     CM_CHK_NULL_GOTOFINISH(tempArgs, CM_OUT_OF_HOST_MEMORY);
2865     CmSafeMemSet(tempArgs, 0, numArgs* sizeof(CM_ARG) );
2866 
2867     for( uint32_t j = 0; j < m_argCount; j++ )
2868     {
2869         if ( CHECK_SURFACE_TYPE( m_args[ j ].unitKind, // first time
2870                                 ARG_KIND_SURFACE,
2871                                 ARG_KIND_SURFACE_1D,
2872                                 ARG_KIND_SURFACE_2D,
2873                                 ARG_KIND_SURFACE_2D_UP,
2874                                 ARG_KIND_SURFACE_SAMPLER,
2875                                 ARG_KIND_SURFACE2DUP_SAMPLER,
2876                                 ARG_KIND_SURFACE_3D,
2877                                 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
2878                                 ARG_KIND_SURFACE_SAMPLER8X8_VA,
2879                                 ARG_KIND_SURFACE_2D_SCOREBOARD,
2880                                 ARG_KIND_STATE_BUFFER ) )
2881         {
2882             numSurfaces = m_args[j].unitSize/sizeof(int);
2883 
2884             if (numSurfaces > 1)
2885             {
2886                 if (m_args[j].unitCount == 1)
2887                 { //Kernel arg
2888                     for (int32_t k = 0; k < numSurfaces; k++)
2889                     {
2890                         tempArgs[j + increasedArgs + k] = m_args[j];
2891                         tempArgs[j + increasedArgs + k].unitSize = sizeof(int32_t);
2892                         tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int32_t);
2893                         tempArgs[j + increasedArgs + k].value = (uint8_t *)((uint32_t *)m_args[j].value + k);
2894                         tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
2895                         tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;
2896                         //For each surface kind and custom value  in surface array
2897                         if (!m_args[j].surfIndex[k])
2898                         {
2899                             //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
2900                             //This is for special usage if there is empty element in surface array.
2901                             tempArgs[j + increasedArgs + k].unitKind = CM_ARGUMENT_SURFACE2D;
2902                             continue;
2903                         }
2904                         tempArgs[j + increasedArgs + k].unitKind = m_args[j].surfArrayArg[k].argKindForArray;
2905                         tempArgs[j + increasedArgs + k].nCustomValue = m_args[j].surfArrayArg[k].addressModeForArray;
2906                     }
2907                 }
2908                 else
2909                 {
2910                     uint32_t *surfaces = (uint32_t *)MOS_NewArray(uint8_t, ((sizeof(int32_t) * m_args[j].unitCount)));
2911                     CM_CHK_NULL_GOTOFINISH(surfaces, CM_OUT_OF_HOST_MEMORY);
2912                     for (int32_t k = 0; k < numSurfaces; k++)
2913                     {
2914                         tempArgs[j + increasedArgs + k] = m_args[j];
2915                         tempArgs[j + increasedArgs + k].unitSize = sizeof(int32_t);
2916                         tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int32_t);
2917                         tempArgs[j + increasedArgs + k].value = MOS_NewArray(uint8_t, ((sizeof(int32_t) * m_args[j].unitCount)));
2918                         if(tempArgs[j + increasedArgs + k].value == nullptr)
2919                         {
2920                             CM_ASSERTMESSAGE("Error: Out of system memory.");
2921                             hr = CM_OUT_OF_HOST_MEMORY;
2922                             MosSafeDeleteArray(surfaces);
2923                             goto finish;
2924                         }
2925                         for (uint32_t s = 0; s < m_args[j].unitCount; s++)
2926                         {
2927                             surfaces[s] = *(uint32_t *)((uint32_t *)m_args[j].value + k + numSurfaces * s);
2928                         }
2929                         CmSafeMemCopy(tempArgs[j + increasedArgs + k].value, surfaces, sizeof(int32_t) * m_args[j].unitCount);
2930                         tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
2931                         tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = (uint16_t)-1;
2932                     }
2933                     MosSafeDeleteArray(surfaces);
2934                 }
2935                 increasedArgs += numSurfaces - 1;
2936             }
2937             else
2938             {
2939                 tempArgs[j + increasedArgs] = m_args[j];
2940             }
2941         }
2942         else if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
2943         {
2944             numSurfaces = m_args[ j ].unitVmeArraySize;
2945             if(numSurfaces == 1)
2946             {  // single vme surface
2947                tempArgs[j + increasedArgs] = m_args[j];
2948             }
2949             else
2950             {  // multiple vme surfaces in surface array
2951                 if (m_args[j].unitCount == 1) { //Kernel arg
2952                     uint32_t vmeSurfOffset = 0;
2953 
2954                     for (int32_t k = 0; k < numSurfaces; k++)
2955                     {
2956                         uint16_t vmeSize = (uint16_t)getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[j].value + vmeSurfOffset));
2957 
2958                         tempArgs[j + increasedArgs + k] = m_args[j];
2959                         tempArgs[j + increasedArgs + k].unitSize = vmeSize;
2960                         tempArgs[j + increasedArgs + k].unitSizeOrig = vmeSize;
2961                         tempArgs[j + increasedArgs + k].value = (uint8_t *)(m_args[j].value + vmeSurfOffset);
2962                         tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + k*4;
2963                         tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;
2964 
2965                         vmeSurfOffset += vmeSize;
2966                     }
2967                 }
2968              }
2969             increasedArgs += numSurfaces - 1;
2970         }
2971         else if (m_args[j].unitKind == ARG_KIND_SAMPLER)
2972         {
2973             unsigned int numSamplers = m_args[j].unitSize / sizeof(int);
2974 
2975             if (numSamplers > 1)
2976             {
2977                 if (m_args[j].unitCount == 1)
2978                 {
2979                     //Kernel arg
2980                     for (unsigned int k = 0; k < numSamplers; k++)
2981                     {
2982                         tempArgs[j + increasedArgs + k] = m_args[j];
2983                         tempArgs[j + increasedArgs + k].unitSize = sizeof(int);
2984                         tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int);
2985                         tempArgs[j + increasedArgs + k].value = (unsigned char *)((unsigned int *)m_args[j].value + k);
2986                         tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
2987                         tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;
2988                         tempArgs[j + increasedArgs + k].unitKind = CM_ARGUMENT_SAMPLER;
2989                     }
2990                 }
2991                 else
2992                 {
2993                     // Use sampler index array as thread arg.
2994                     // Not implemented yet.
2995                     return CM_NOT_IMPLEMENTED;
2996                 }
2997                 increasedArgs += numSamplers - 1;
2998             }
2999             else
3000             {
3001                 tempArgs[j + increasedArgs] = m_args[j];
3002             }
3003         }
3004         else
3005         {
3006             tempArgs[j + increasedArgs] = m_args[j];
3007         }
3008     }
3009 
3010 finish:
3011     if(hr == CM_OUT_OF_HOST_MEMORY)
3012     {
3013         if(tempArgs)
3014         {
3015             for (uint32_t j = 0; j < numArgs; j++)
3016             {
3017                 MosSafeDeleteArray(tempArgs[j].value);
3018             }
3019         }
3020         MosSafeDeleteArray( tempArgs );
3021     }
3022     return hr;
3023 }
3024 
3025 //*-----------------------------------------------------------------------------
3026 //| Purpose:   Get the number of args includes the num of surfaces in surface array
3027 //*-----------------------------------------------------------------------------
GetArgCountPlusSurfArray(uint32_t & argSize,uint32_t & argCountPlus)3028 int32_t CmKernelRT::GetArgCountPlusSurfArray(uint32_t &argSize, uint32_t & argCountPlus)
3029 {
3030     argCountPlus = m_argCount;
3031     argSize      = 0;
3032 
3033     if(m_usKernelPayloadDataSize)
3034     { // if payload data exists, the number of args is zero
3035         argCountPlus  = 0;
3036         argSize       = 0;
3037         return CM_SUCCESS;
3038     }
3039 
3040     if( m_argCount != 0 )   //Need pass the arg either by arguments area, or by indirect payload area
3041     {
3042          //Sanity check for argument setting
3043         if((m_perThreadArgExists == false) && (m_perKernelArgExists == false) && (m_usKernelPayloadDataSize == 0))
3044         {
3045             if ( m_stateBufferBounded == CM_STATE_BUFFER_NONE )
3046             {
3047                 CM_ASSERTMESSAGE( "Error: Kernel arguments are not set." );
3048                 return CM_NOT_SET_KERNEL_ARGUMENT;
3049             }
3050         }
3051 
3052         if(m_perThreadArgExists || m_perKernelArgExists)
3053         {
3054             unsigned int extraArgs = 0;
3055 
3056             for( uint32_t j = 0; j < m_argCount; j ++ )
3057             {
3058                 //Sanity checking for every argument setting
3059                 if ( !m_args[j].isSet )
3060                 {
3061                     CM_ASSERTMESSAGE("Error: One Kernel argument is not set.");
3062                     return CM_KERNEL_ARG_SETTING_FAILED;
3063                 }
3064 
3065                 argSize += m_args[j].unitSize * m_args[j].unitCount;
3066 
3067                 if ( CHECK_SURFACE_TYPE( m_args[ j ].unitKind,
3068                                         ARG_KIND_SURFACE,
3069                                         ARG_KIND_SURFACE_1D,
3070                                         ARG_KIND_SURFACE_2D,
3071                                         ARG_KIND_SURFACE_2D_UP,
3072                                         ARG_KIND_SURFACE_SAMPLER,
3073                                         ARG_KIND_SURFACE2DUP_SAMPLER,
3074                                         ARG_KIND_SURFACE_3D,
3075                                         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
3076                                         ARG_KIND_SURFACE_SAMPLER8X8_VA,
3077                                         ARG_KIND_SURFACE_2D_SCOREBOARD,
3078                                         ARG_KIND_STATE_BUFFER ) )
3079                 {
3080                      int numSurfaces = m_args[j].unitSize/sizeof(int);
3081                      if (numSurfaces > 1) {
3082                            extraArgs += numSurfaces - 1;
3083                      }
3084                 }
3085                 else if (CHECK_SURFACE_TYPE(m_args[j].unitKind, ARG_KIND_SURFACE_VME))
3086                 {
3087                     int numSurfaces = m_args[j].unitVmeArraySize;
3088                     if (numSurfaces > 1) {
3089                         extraArgs += numSurfaces - 1;
3090                     }
3091                 }
3092                 else if (m_args[j].unitKind == ARG_KIND_SAMPLER)
3093                 {
3094                     int numSamplers = m_args[j].unitSize / sizeof(int);
3095                     if (numSamplers > 1)
3096                     {
3097                         extraArgs += (numSamplers - 1);
3098                     }
3099                 }
3100             }
3101 
3102             argCountPlus = m_argCount + extraArgs;
3103         }
3104     }
3105     return CM_SUCCESS;
3106 }
3107 
3108 //*-----------------------------------------------------------------------------
3109 //| Purpose:   Create Thread Space Param
3110 //*-----------------------------------------------------------------------------
CreateThreadSpaceParam(PCM_HAL_KERNEL_THREADSPACE_PARAM kernelThreadSpaceParam,CmThreadSpaceRT * threadSpace)3111 int32_t CmKernelRT::CreateThreadSpaceParam(
3112     PCM_HAL_KERNEL_THREADSPACE_PARAM kernelThreadSpaceParam,
3113     CmThreadSpaceRT*                   threadSpace     )
3114 {
3115     int32_t                      hr = CM_SUCCESS;
3116     CM_HAL_DEPENDENCY*           dependency = nullptr;
3117     uint32_t                     threadSpaceWidth = 0;
3118     uint32_t                     threadSpaceHeight =0;
3119     CM_THREAD_SPACE_UNIT         *threadSpaceUnit = nullptr;
3120     CM_THREAD_SPACE_DIRTY_STATUS dirtyStatus = CM_THREAD_SPACE_CLEAN;
3121 
3122     if (kernelThreadSpaceParam == nullptr || threadSpace == nullptr)
3123     {
3124         CM_ASSERTMESSAGE("Error: Pointer to CmKernelThreadSpaceParam or thread space is null.");
3125         hr = CM_NULL_POINTER;
3126         goto finish;
3127     }
3128 
3129     threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
3130     kernelThreadSpaceParam->threadSpaceWidth =  (uint16_t)threadSpaceWidth;
3131     kernelThreadSpaceParam->threadSpaceHeight = (uint16_t)threadSpaceHeight;
3132 
3133     threadSpace->GetDependencyPatternType(kernelThreadSpaceParam->patternType);
3134     threadSpace->GetWalkingPattern(kernelThreadSpaceParam->walkingPattern);
3135     threadSpace->GetDependency( dependency);
3136 
3137     if(dependency != nullptr)
3138     {
3139         CmSafeMemCopy(&kernelThreadSpaceParam->dependencyInfo, dependency, sizeof(CM_HAL_DEPENDENCY));
3140     }
3141 
3142     if( threadSpace->CheckWalkingParametersSet( ) )
3143     {
3144         kernelThreadSpaceParam->walkingParamsValid = 1;
3145         CM_CHK_CMSTATUS_GOTOFINISH(threadSpace->GetWalkingParameters(kernelThreadSpaceParam->walkingParams));
3146     }
3147     else
3148     {
3149         kernelThreadSpaceParam->walkingParamsValid = 0;
3150     }
3151 
3152     if( threadSpace->CheckDependencyVectorsSet( ) )
3153     {
3154         kernelThreadSpaceParam->dependencyVectorsValid = 1;
3155         CM_CHK_CMSTATUS_GOTOFINISH(threadSpace->GetDependencyVectors(kernelThreadSpaceParam->dependencyVectors));
3156     }
3157     else
3158     {
3159         kernelThreadSpaceParam->dependencyVectorsValid = 0;
3160     }
3161 
3162     threadSpace->GetThreadSpaceUnit(threadSpaceUnit);
3163 
3164     if(threadSpaceUnit)
3165     {
3166         kernelThreadSpaceParam->threadCoordinates = MOS_NewArray(CM_HAL_SCOREBOARD, (threadSpaceWidth * threadSpaceHeight));
3167         CM_CHK_NULL_GOTOFINISH(kernelThreadSpaceParam->threadCoordinates , CM_OUT_OF_HOST_MEMORY);
3168         CmSafeMemSet(kernelThreadSpaceParam->threadCoordinates, 0, threadSpaceHeight * threadSpaceWidth * sizeof(CM_HAL_SCOREBOARD));
3169 
3170         uint32_t *boardOrder = nullptr;
3171         threadSpace->GetBoardOrder(boardOrder);
3172         CM_CHK_NULL_GOTOFINISH_CMERROR(boardOrder);
3173 
3174         kernelThreadSpaceParam->reuseBBUpdateMask  = 0;
3175         for(uint32_t i=0; i< threadSpaceWidth * threadSpaceHeight ; i++)
3176         {
3177             kernelThreadSpaceParam->threadCoordinates[i].x = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.x;
3178             kernelThreadSpaceParam->threadCoordinates[i].y = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.y;
3179             kernelThreadSpaceParam->threadCoordinates[i].mask = threadSpaceUnit[boardOrder[i]].dependencyMask;
3180             kernelThreadSpaceParam->threadCoordinates[i].resetMask= threadSpaceUnit[boardOrder[i]].reset;
3181             kernelThreadSpaceParam->threadCoordinates[i].color = threadSpaceUnit[boardOrder[i]].scoreboardColor;
3182             kernelThreadSpaceParam->threadCoordinates[i].sliceSelect = threadSpaceUnit[boardOrder[i]].sliceDestinationSelect;
3183             kernelThreadSpaceParam->threadCoordinates[i].subSliceSelect = threadSpaceUnit[boardOrder[i]].subSliceDestinationSelect;
3184             kernelThreadSpaceParam->reuseBBUpdateMask |= threadSpaceUnit[boardOrder[i]].reset;
3185         }
3186 
3187         if( kernelThreadSpaceParam->patternType == CM_WAVEFRONT26Z )
3188         {
3189             CM_HAL_WAVEFRONT26Z_DISPATCH_INFO dispatchInfo;
3190             threadSpace->GetWavefront26ZDispatchInfo(dispatchInfo);
3191 
3192             kernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
3193             kernelThreadSpaceParam->dispatchInfo.numThreadsInWave = MOS_NewArray(uint32_t, dispatchInfo.numWaves);
3194             CM_CHK_NULL_GOTOFINISH(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave, CM_OUT_OF_HOST_MEMORY);
3195             CmSafeMemCopy(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave,
3196                 dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));
3197 
3198          }
3199     }
3200 
3201     //Get group select setting information
3202     threadSpace->GetMediaWalkerGroupSelect(kernelThreadSpaceParam->groupSelect);
3203 
3204     //Get color count
3205     threadSpace->GetColorCountMinusOne(kernelThreadSpaceParam->colorCountMinusOne);
3206 
3207     dirtyStatus = threadSpace->GetDirtyStatus();
3208     switch (dirtyStatus)
3209     {
3210     case CM_THREAD_SPACE_CLEAN:
3211         kernelThreadSpaceParam->bbDirtyStatus = CM_HAL_BB_CLEAN;
3212         break;
3213     default:
3214         kernelThreadSpaceParam->bbDirtyStatus = CM_HAL_BB_DIRTY;
3215         break;
3216     }
3217 
3218 finish:
3219     if( hr == CM_OUT_OF_HOST_MEMORY)
3220     {
3221         if( kernelThreadSpaceParam )
3222         {
3223             MosSafeDeleteArray(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave);
3224             MosSafeDeleteArray(kernelThreadSpaceParam->threadCoordinates);
3225         }
3226     }
3227 
3228     return hr;
3229 }
3230 
3231 //*-----------------------------------------------------------------------------
3232 //| Purpose:   Delete the args array
3233 //*-----------------------------------------------------------------------------
DestroyArgs(void)3234 int32_t CmKernelRT::DestroyArgs( void )
3235 {
3236     for( uint32_t i =0 ; i < m_argCount; i ++ )
3237     {
3238         CM_ARG& arg = m_args[ i ];
3239         MosSafeDeleteArray( arg.value );
3240         MosSafeDeleteArray(arg.surfIndex);
3241         MosSafeDeleteArray(arg.surfArrayArg);
3242         arg.unitCount = 0;
3243         arg.unitSize = 0;
3244         arg.unitKind = 0;
3245         arg.unitOffsetInPayload = 0;
3246         arg.isDirty = true;
3247         arg.isSet = false;
3248     }
3249 
3250     MosSafeDeleteArray( m_args );
3251 
3252     m_threadSpaceAssociated        = false;
3253     m_threadSpace          = nullptr;
3254 
3255     m_perThreadArgExists  = false;
3256     m_perKernelArgExists  = false;
3257 
3258     m_sizeInCurbe = 0;
3259     m_curbeEnabled = true;
3260 
3261     m_sizeInPayload = 0;
3262     m_adjustScoreboardY = 0;
3263 
3264     ResetKernelSurfaces();
3265 
3266     return CM_SUCCESS;
3267 }
3268 
3269 //*-----------------------------------------------------------------------------
3270 // Calling reset makes it possible to change the per kernel or per thread
3271 // property of the argurments b/c it reset releases the memory for arguments
3272 //*-----------------------------------------------------------------------------
Reset(void)3273 int32_t CmKernelRT::Reset( void )
3274 {
3275     for( uint32_t i =0 ; i < m_argCount; i ++ )
3276     {
3277         CM_ARG& arg = m_args[ i ];
3278         MosSafeDeleteArray( arg.value );
3279         MosSafeDeleteArray( arg.surfIndex);
3280         MosSafeDeleteArray(arg.surfArrayArg);
3281         arg.value = nullptr;
3282         arg.unitCount = 0;
3283 
3284         arg.unitSize = arg.unitSizeOrig;
3285         arg.unitKind = arg.unitKindOrig;
3286         arg.unitOffsetInPayload = arg.unitOffsetInPayloadOrig;
3287 
3288         arg.isDirty = true;
3289         arg.isSet = false;
3290         arg.unitVmeArraySize = 0;
3291 
3292         arg.isStatelessBuffer = false;
3293         arg.index = 0;
3294     }
3295 
3296     m_threadCount = 0;
3297 
3298     m_indexInTask = 0;
3299 
3300     m_perThreadArgExists = false;
3301     m_perKernelArgExists = false;
3302 
3303     m_sizeInCurbe = 0;
3304     m_curbeEnabled = true;
3305 
3306     m_sizeInPayload = 0;
3307 
3308     m_threadSpaceAssociated = false;
3309     m_threadSpace = nullptr;
3310     m_adjustScoreboardY = 0;
3311 
3312     m_threadGroupSpace = nullptr;
3313 
3314     MosSafeDeleteArray(m_kernelPayloadData);
3315     m_usKernelPayloadDataSize = 0;
3316 
3317     if (m_usKernelPayloadSurfaceCount)
3318     {
3319         CmSafeMemSet(m_pKernelPayloadSurfaceArray, 0, m_usKernelPayloadSurfaceCount * sizeof(SurfaceIndex *));
3320         CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
3321         m_usKernelPayloadSurfaceCount = 0;
3322     }
3323 
3324     ResetKernelSurfaces();
3325 
3326     return CM_SUCCESS;
3327 }
3328 
3329 //*-----------------------------------------------------------------------------
3330 //| Purpose:   Get the pointer to arguments array
3331 //*-----------------------------------------------------------------------------
GetArgs(CM_ARG * & arg)3332 int32_t CmKernelRT::GetArgs( CM_ARG* & arg )
3333 {
3334     arg = m_args;
3335     return CM_SUCCESS;
3336 }
3337 
3338 //*-----------------------------------------------------------------------------
3339 //| Purpose:   Get the arguments' count
3340 //*-----------------------------------------------------------------------------
GetArgCount(uint32_t & argCount)3341 int32_t CmKernelRT::GetArgCount( uint32_t & argCount )
3342 {
3343     argCount = m_argCount;
3344     return CM_SUCCESS;
3345 }
3346 
3347 //*-----------------------------------------------------------------------------
3348 //| Purpose:    Get the value of member CurbeEnable
3349 //*-----------------------------------------------------------------------------
GetCurbeEnable(bool & b)3350 int32_t CmKernelRT::GetCurbeEnable( bool& b )
3351 {
3352     b = m_curbeEnabled;
3353     return CM_SUCCESS;
3354 }
3355 
3356 //*-----------------------------------------------------------------------------
3357 //| Purpose:    Set the CurbeEnable member
3358 //*-----------------------------------------------------------------------------
SetCurbeEnable(bool b)3359 int32_t CmKernelRT::SetCurbeEnable( bool b )
3360 {
3361     m_curbeEnabled = b;
3362     return CM_SUCCESS;
3363 }
3364 
3365 //*-----------------------------------------------------------------------------
3366 //| Purpose:   Get the kernel's size in Curbe
3367 //*-----------------------------------------------------------------------------
GetSizeInCurbe(uint32_t & size)3368 int32_t CmKernelRT::GetSizeInCurbe( uint32_t& size )
3369 {
3370     size = m_sizeInCurbe;
3371     return CM_SUCCESS;
3372 }
3373 
3374 //*-----------------------------------------------------------------------------
3375 //| Purpose:   Get the total size in payload of meida object or media walker
3376 //*-----------------------------------------------------------------------------
GetSizeInPayload(uint32_t & size)3377 int32_t CmKernelRT::GetSizeInPayload( uint32_t& size )
3378 {
3379     size = m_sizeInPayload;
3380     return CM_SUCCESS;
3381 }
3382 
3383 //*-----------------------------------------------------------------------------
3384 //| Purpose:    Get the pointer to CM device
3385 //*-----------------------------------------------------------------------------
GetCmDevice(CmDeviceRT * & device)3386 int32_t CmKernelRT::GetCmDevice(CmDeviceRT* &device)
3387 {
3388     device = m_device;
3389     return CM_SUCCESS;
3390 }
3391 
GetCmProgram(CmProgramRT * & program)3392 int32_t CmKernelRT::GetCmProgram( CmProgramRT* & program )
3393 {
3394     program = m_program;
3395     return CM_SUCCESS;
3396 }
3397 
CollectKernelSurface()3398 int32_t CmKernelRT::CollectKernelSurface()
3399 {
3400     m_vmeSurfaceCount = 0;
3401     m_maxSurfaceIndexAllocated = 0;
3402 
3403     for( uint32_t j = 0; j < m_argCount; j ++ )
3404     {
3405         if ((m_args[ j ].unitKind == ARG_KIND_SURFACE ) || // first time
3406              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_1D ) ||
3407              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D ) ||
3408              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
3409              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
3410              ( m_args[ j ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
3411              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_3D ) ||
3412              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
3413              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
3414              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_VME ) ||
3415              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
3416              ( m_args[ j ].unitKind == ARG_KIND_STATE_BUFFER ) )
3417         {
3418             int numSurfaces;
3419             int numValidSurfaces = 0;
3420 
3421             if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
3422             {
3423                 numSurfaces = getSurfNumFromArgArraySize(m_args[j].unitSize, m_args[j].unitVmeArraySize);
3424             }
3425             else
3426             {
3427                 numSurfaces = m_args[j].unitSize/sizeof(int);
3428             }
3429 
3430             for (uint32_t k = 0; k < numSurfaces * m_args[j].unitCount; k ++)
3431             {
3432                 uint16_t surfIndex = 0;
3433                 if (m_args[j].surfIndex)
3434                 {
3435                     surfIndex = m_args[j].surfIndex[k];
3436                 }
3437                 if (surfIndex != 0 && surfIndex != CM_NULL_SURFACE)
3438                 {
3439                     m_surfaceArray[surfIndex] = true;
3440                     numValidSurfaces ++;
3441                     m_maxSurfaceIndexAllocated = Max(m_maxSurfaceIndexAllocated, surfIndex);
3442                 }
3443             }
3444             if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
3445             {
3446                 m_vmeSurfaceCount += numValidSurfaces;
3447             }
3448         }
3449 
3450         if (m_args[ j ].isStatelessBuffer)
3451         {
3452             uint32_t surfIndex = m_args[j].index;
3453             m_surfaceArray[surfIndex] = true;
3454         }
3455     }
3456 
3457     for( int32_t i=0; i < CM_GLOBAL_SURFACE_NUMBER; ++i )
3458     {
3459         if( m_globalSurfaces[i] != nullptr )
3460         {
3461             uint32_t surfIndex = m_globalCmIndex[i];
3462             m_surfaceArray[surfIndex] = true;
3463         }
3464     }
3465 
3466     for (int32_t i = 0; i < m_usKernelPayloadSurfaceCount; i++)
3467     {
3468         if (m_pKernelPayloadSurfaceArray[i] != nullptr)
3469         {
3470             uint32_t surfIndex = m_pKernelPayloadSurfaceArray[i]->get_data();
3471             m_surfaceArray[surfIndex] = true;
3472         }
3473     }
3474 
3475     return CM_SUCCESS;
3476 }
3477 
IsKernelDataReusable(CmThreadSpaceRT * threadSpace)3478 int32_t CmKernelRT::IsKernelDataReusable( CmThreadSpaceRT* threadSpace)
3479 {
3480     if(threadSpace)
3481     {
3482         if(threadSpace->IsThreadAssociated() && (threadSpace->GetDirtyStatus()!= CM_THREAD_SPACE_CLEAN))
3483         {
3484             return false;
3485         }
3486     }
3487 
3488     if(m_threadSpace)
3489     {
3490         if(m_threadSpace->GetDirtyStatus()!= CM_THREAD_SPACE_CLEAN)
3491         {
3492             return  false;
3493         }
3494     }
3495 
3496     if(m_dirty !=  CM_KERNEL_DATA_CLEAN)
3497     {
3498         return false;
3499     }
3500 
3501     return true;
3502 }
3503 
3504 //*-----------------------------------------------------------------------------
3505 //| Purpose:    Prepare Kernel Data including thread args, kernel args
3506 //| Returns:    Result of the operation.
3507 //*-----------------------------------------------------------------------------
CreateKernelData(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadSpaceRT * threadSpace)3508 int32_t CmKernelRT::CreateKernelData(
3509     CmKernelData* & kernelData,  // out
3510     uint32_t& kernelDataSize,         // out
3511     const CmThreadSpaceRT* threadSpace )    // in
3512 {
3513     int32_t              hr              = CM_SUCCESS;
3514     PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;
3515 
3516     if( (threadSpace != nullptr) && (m_threadSpace != nullptr) )
3517     {
3518         // per-kernel threadspace and per-task threadspace cannot be set at the same time
3519         return CM_INVALID_THREAD_SPACE;
3520     }
3521 
3522     if(m_lastKernelData == nullptr)
3523     {
3524         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3525         CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3526         CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3527     }
3528     else
3529     {
3530         if(IsKernelDataReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
3531         {
3532             // nothing changed; Reuse m_lastKernelData
3533             kernelData = m_lastKernelData;
3534             CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3535             CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3536             kernelDataSize = kernelData->GetKernelDataSize();
3537 
3538             if (m_threadSpace)
3539             {
3540                 halKernelParam = kernelData->GetHalCmKernelData();
3541                 CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
3542                 // need to set to clean here because CmThreadSpaceParam.BBdirtyStatus is only set in CreateKernelDataInternal
3543                 // flag used to re-use batch buffer, don't care if BB is busy if it is "clean"
3544                 halKernelParam->kernelThreadSpaceParam.bbDirtyStatus = CM_HAL_BB_CLEAN;
3545             }
3546         }
3547         else
3548         {
3549             if(m_lastKernelData->IsInUse())
3550             { // Need to Create a new one , if the kernel data is in use
3551                 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3552                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3553                 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3554             }
3555             else if(threadSpace && threadSpace->IsThreadAssociated() && (threadSpace->GetDirtyStatus() != CM_THREAD_SPACE_CLEAN))
3556             { // if thread space is assocaited , don't support reuse
3557                 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3558                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3559                 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3560             }
3561             else if(m_dirty < CM_KERNEL_DATA_THREAD_COUNT_DIRTY || // Kernel arg or thread arg dirty
3562                 (m_threadSpace && m_threadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DEPENDENCY_MASK_DIRTY))
3563             {
3564                 CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelData(m_lastKernelData,threadSpace));
3565                 kernelData = m_lastKernelData;
3566                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3567                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3568                 kernelDataSize = kernelData->GetKernelDataSize();
3569 
3570             }
3571             else
3572             {
3573                CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3574                CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3575                CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3576             }
3577         }
3578     }
3579 
3580     CleanArgDirtyFlag();
3581     if(threadSpace)
3582     {
3583         threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
3584     }
3585     if (m_threadSpace)
3586     {
3587         m_threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
3588     }
3589 
3590 finish:
3591     return hr;
3592 }
3593 
GetName()3594 char* CmKernelRT::GetName() { return (char*)m_kernelInfo->kernelName; }
3595 
3596 //*-----------------------------------------------------------------------------
3597 //| Purpose:    Create Kernel Data
3598 //| Returns:    Result of the operation.
3599 //*-----------------------------------------------------------------------------
CreateKernelData(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadGroupSpace * threadGroupSpace)3600 int32_t CmKernelRT::CreateKernelData(
3601     CmKernelData* & kernelData,  // out
3602     uint32_t& kernelDataSize,         // out
3603     const CmThreadGroupSpace* threadGroupSpace )    // in
3604 {
3605     int32_t     hr   = CM_SUCCESS;
3606     CmThreadGroupSpace* usedThreadGroupSpace = nullptr;
3607 
3608     //If kernel has associated TGS, we will use it, instead of per-task TGS
3609     if (m_threadGroupSpace)
3610     {
3611         usedThreadGroupSpace = m_threadGroupSpace;
3612     }
3613     else
3614     {
3615         usedThreadGroupSpace = const_cast<CmThreadGroupSpace*>(threadGroupSpace);
3616     }
3617 
3618     if(m_lastKernelData == nullptr)
3619     {
3620         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, usedThreadGroupSpace));
3621         CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3622         CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3623     }
3624     else
3625     {
3626         if (!((m_dirty & CM_KERNEL_DATA_KERNEL_ARG_DIRTY) || (m_dirty & CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY)))
3627         {
3628             // nothing changed; Reuse m_lastKernelData
3629             kernelData = m_lastKernelData;
3630             CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3631             CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3632             kernelDataSize = kernelData->GetKernelDataSize();
3633         }
3634         else
3635         {
3636             if(m_lastKernelData->IsInUse())
3637             { // Need to Clone a new one
3638                 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, usedThreadGroupSpace));
3639                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3640                 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3641             }
3642             else
3643             {
3644                 // change happend -> Reuse m_lastKernelData but need to change its content accordingly
3645                 CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelData(m_lastKernelData, usedThreadGroupSpace));
3646                 kernelData = m_lastKernelData;
3647                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3648                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3649                 kernelDataSize = kernelData->GetKernelDataSize();
3650             }
3651         }
3652     }
3653 
3654     CleanArgDirtyFlag();
3655 
3656 finish:
3657     return hr;
3658 }
3659 
CleanArgDirtyFlag()3660 int32_t CmKernelRT::CleanArgDirtyFlag()
3661 {
3662 
3663     for(uint32_t i =0 ; i< m_argCount; i++)
3664     {
3665         m_args[i].isDirty = false;
3666     }
3667 
3668     if(m_threadSpace && m_threadSpace->GetDirtyStatus())
3669     {
3670         m_threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
3671     }
3672 
3673     m_dirty                 = CM_KERNEL_DATA_CLEAN;
3674 
3675     return CM_SUCCESS;
3676 }
3677 
3678 //*-----------------------------------------------------------------------------
3679 //| Purpose:    Update the global surface and gtpin surface info to kernel data
3680 //| Returns:    Result of the operation.
3681 //*-----------------------------------------------------------------------------
UpdateKernelDataGlobalSurfaceInfo(PCM_HAL_KERNEL_PARAM halKernelParam)3682 int32_t CmKernelRT::UpdateKernelDataGlobalSurfaceInfo( PCM_HAL_KERNEL_PARAM halKernelParam )
3683 {
3684     int32_t hr = CM_SUCCESS;
3685 
3686     //global surface
3687     for ( uint32_t j = 0; j < CM_GLOBAL_SURFACE_NUMBER; j++ )
3688     {
3689         if ( m_globalSurfaces[ j ] != nullptr )
3690         {
3691             halKernelParam->globalSurface[ j ] = m_globalSurfaces[ j ]->get_data();
3692             halKernelParam->globalSurfaceUsed = true;
3693         }
3694         else
3695         {
3696             halKernelParam->globalSurface[ j ] = CM_NULL_SURFACE;
3697         }
3698     }
3699 
3700     for ( uint32_t j = CM_GLOBAL_SURFACE_NUMBER; j < CM_MAX_GLOBAL_SURFACE_NUMBER; j++ )
3701     {
3702         halKernelParam->globalSurface[ j ] = CM_NULL_SURFACE;
3703     }
3704 #if USE_EXTENSION_CODE
3705     UpdateKernelDataGTPinSurfaceInfo(halKernelParam);
3706 #endif
3707 
3708     return hr;
3709 }
3710 
3711 //*-----------------------------------------------------------------------------
3712 //| Purpose:    Prepare Kernel Data including thread args, kernel args
3713 //| Returns:    Result of the operation.
3714 //*-----------------------------------------------------------------------------
CreateKernelDataInternal(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadGroupSpace * threadGroupSpace)3715 int32_t CmKernelRT::CreateKernelDataInternal(
3716     CmKernelData* & kernelData,  // out
3717     uint32_t& kernelDataSize,         // out
3718     const CmThreadGroupSpace* threadGroupSpace)    // in
3719 {
3720     PCM_HAL_KERNEL_PARAM  halKernelParam = nullptr;
3721     int32_t               hr = CM_SUCCESS;
3722     uint32_t              movInstNum = 0;
3723     uint32_t              kernelCurbeSize = 0;
3724     uint32_t              numArgs = 0;
3725     CM_ARG                *tempArgs = nullptr;
3726     uint32_t              argSize = 0;
3727     uint32_t              surfNum = 0; //Pass needed BT entry numbers to HAL CM
3728     CmKernelRT            *cmKernel = nullptr;
3729     uint32_t              minKernelPlayloadOffset = 0;
3730     bool                  adjustLocalIdPayloadOffset = false;
3731 
3732     CM_CHK_CMSTATUS_GOTOFINISH(CmKernelData::Create(this, kernelData));
3733     halKernelParam = kernelData->GetHalCmKernelData();
3734     CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
3735 
3736     //Get Num of args with surface array
3737     CM_CHK_CMSTATUS_GOTOFINISH(GetArgCountPlusSurfArray(argSize, numArgs));
3738 
3739     //Create Temp args
3740     CM_CHK_CMSTATUS_GOTOFINISH(CreateTempArgs(numArgs, tempArgs));
3741 
3742     //Create move instructions
3743     CM_CHK_CMSTATUS_GOTOFINISH(CreateMovInstructions(movInstNum, halKernelParam->movInsData, tempArgs, numArgs));
3744     CM_CHK_CMSTATUS_GOTOFINISH(CalcKernelDataSize(movInstNum, numArgs, argSize, kernelDataSize));
3745     CM_CHK_CMSTATUS_GOTOFINISH(kernelData->SetKernelDataSize(kernelDataSize));
3746 
3747     halKernelParam->clonedKernelParam.isClonedKernel = m_isClonedKernel;
3748     halKernelParam->clonedKernelParam.kernelID       = m_cloneKernelID;
3749     halKernelParam->clonedKernelParam.hasClones      = m_hasClones;
3750 
3751     halKernelParam->kernelId = m_id++;
3752     if ((m_program->m_cisaMajorVersion >= 3 && m_program->m_cisaMinorVersion >= 3))
3753         halKernelParam->numArgs = numArgs;
3754     else
3755         halKernelParam->numArgs = numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM;
3756     halKernelParam->numThreads = m_threadCount;
3757     halKernelParam->kernelBinarySize = m_binarySize + movInstNum * CM_MOVE_INSTRUCTION_SIZE;
3758     halKernelParam->kernelDataSize = kernelDataSize;
3759     halKernelParam->movInsDataSize = movInstNum * CM_MOVE_INSTRUCTION_SIZE;
3760     halKernelParam->kernelDebugEnabled = m_blhwDebugEnable;
3761 
3762     halKernelParam->cmFlags = m_curbeEnabled ? CM_FLAG_CURBE_ENABLED : 0;
3763     halKernelParam->cmFlags |= m_nonstallingScoreboardEnabled ? CM_FLAG_NONSTALLING_SCOREBOARD_ENABLED : 0;
3764 
3765     halKernelParam->kernelBinary = (uint8_t*)m_binary;
3766 
3767     CM_CHK_CMSTATUS_GOTOFINISH(kernelData->GetCmKernel(cmKernel));
3768     if (cmKernel == nullptr)
3769     {
3770         return CM_NULL_POINTER;
3771     }
3772     MOS_SecureStrcpy(halKernelParam->kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE, cmKernel->GetName());
3773 
3774     uint32_t thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth;
3775     threadGroupSpace->GetThreadGroupSpaceSize(thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth);
3776 
3777     for (uint32_t i = 0; i < numArgs; i++)
3778     {
3779         // get the min kernel payload offset
3780         if ((halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE) && IsKernelArg(tempArgs[i]))
3781         {
3782             if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3783             {
3784                 if (minKernelPlayloadOffset == 0 || minKernelPlayloadOffset > tempArgs[i].unitOffsetInPayload)
3785                 {
3786                     minKernelPlayloadOffset = tempArgs[i].unitOffsetInPayload;
3787                 }
3788             }
3789             else
3790             {
3791                 if ((minKernelPlayloadOffset == 0 || minKernelPlayloadOffset > tempArgs[i].unitOffsetInPayload) && (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID))
3792                 {
3793                     minKernelPlayloadOffset = tempArgs[i].unitOffsetInPayload;
3794                 }
3795             }
3796         }
3797     }
3798 
3799     for (uint32_t i = 0; i < numArgs; i++)
3800     {
3801         halKernelParam->argParams[i].unitCount = tempArgs[i].unitCount;
3802         halKernelParam->argParams[i].kind = (CM_HAL_KERNEL_ARG_KIND)(tempArgs[i].unitKind);
3803         halKernelParam->argParams[i].unitSize = tempArgs[i].unitSize;
3804         halKernelParam->argParams[i].payloadOffset = tempArgs[i].unitOffsetInPayload;
3805         halKernelParam->argParams[i].perThread = false;
3806         halKernelParam->argParams[i].nCustomValue = tempArgs[i].nCustomValue;
3807         halKernelParam->argParams[i].aliasIndex = tempArgs[i].aliasIndex;
3808         halKernelParam->argParams[i].aliasCreated = tempArgs[i].aliasCreated;
3809         halKernelParam->argParams[i].isNull = tempArgs[i].isNull;
3810 
3811         if (tempArgs[i].unitKind == CM_ARGUMENT_IMPLICT_LOCALSIZE) {
3812             CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3813             *(uint32_t *)halKernelParam->argParams[i].firstValue = thrdSpaceWidth;
3814             *(uint32_t *)(halKernelParam->argParams[i].firstValue + 4) = thrdSpaceHeight;
3815             *(uint32_t *)(halKernelParam->argParams[i].firstValue + 8) = thrdSpaceDepth;
3816         }
3817         else if (tempArgs[i].unitKind == CM_ARGUMENT_IMPLICT_GROUPSIZE) {
3818             CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3819             *(uint32_t *)halKernelParam->argParams[i].firstValue = grpSpaceWidth;
3820             *(uint32_t *)(halKernelParam->argParams[i].firstValue + 4) = grpSpaceHeight;
3821             *(uint32_t *)(halKernelParam->argParams[i].firstValue + 8) = grpSpaceDepth;
3822         }
3823         else if (tempArgs[i].unitKind == ARG_KIND_IMPLICIT_LOCALID) {
3824             CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3825             halKernelParam->localIdIndex = i;
3826         }
3827         else
3828             CreateThreadArgData(&halKernelParam->argParams[i], i, nullptr, tempArgs);
3829 
3830         if (halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE)
3831         {
3832             if (IsKernelArg(halKernelParam->argParams[i]))
3833             {
3834                 // Kernel arg : calculate curbe size & adjust payloadoffset
3835                 if (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID)
3836                 {
3837                     halKernelParam->argParams[i].payloadOffset -= minKernelPlayloadOffset;
3838                 }
3839                 else
3840                 {
3841                     // ARG_KIND_IMPLICIT_LOCALID is only for visa3.3+, need to adjust payloadOffset of local id for visa3.3+ later.
3842                     adjustLocalIdPayloadOffset = true;
3843                 }
3844 
3845                 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3846                     if ((halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize > kernelCurbeSize))
3847                     {  // The largest one
3848                         kernelCurbeSize = halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize;
3849                     }
3850                 }
3851                 else
3852                 {
3853                     if ((halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize > kernelCurbeSize) && (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID))
3854                     {  // The largest one
3855                         kernelCurbeSize = halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize;
3856                     }
3857                 }
3858             }
3859         }
3860     }
3861 
3862     if ( m_stateBufferBounded != CM_STATE_BUFFER_NONE )
3863     {
3864         PCM_CONTEXT_DATA cmData = ( PCM_CONTEXT_DATA )m_device->GetAccelData();
3865         PCM_HAL_STATE state = cmData->cmHalState;
3866         kernelCurbeSize = state->pfnGetStateBufferSizeForKernel( state, this );
3867         halKernelParam->stateBufferType = state->pfnGetStateBufferTypeForKernel( state, this );
3868     }
3869 
3870     if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3871     {
3872         // GPGPU walker - implicit args
3873         for (uint32_t i = numArgs; i < numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM; i++)
3874         {
3875             halKernelParam->argParams[i].unitCount = 1;
3876             halKernelParam->argParams[i].kind = CM_ARGUMENT_GENERAL;
3877             halKernelParam->argParams[i].unitSize = 4;
3878             halKernelParam->argParams[i].payloadOffset = MOS_ALIGN_CEIL(kernelCurbeSize, 4) + (i - numArgs) * sizeof(uint32_t);
3879             halKernelParam->argParams[i].perThread = false;
3880         }
3881 
3882         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 0].firstValue, thrdSpaceWidth));
3883         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 1].firstValue, thrdSpaceHeight));
3884         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 2].firstValue, grpSpaceWidth));
3885         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 3].firstValue, grpSpaceHeight));
3886         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 4].firstValue, thrdSpaceWidth));
3887         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 5].firstValue, thrdSpaceHeight));
3888         halKernelParam->localIdIndex = halKernelParam->numArgs - 2;
3889     }
3890     halKernelParam->gpgpuWalkerParams.gpgpuEnabled = true;
3891     halKernelParam->gpgpuWalkerParams.groupWidth = grpSpaceWidth;
3892     halKernelParam->gpgpuWalkerParams.groupHeight = grpSpaceHeight;
3893     halKernelParam->gpgpuWalkerParams.groupDepth = grpSpaceDepth;
3894     halKernelParam->gpgpuWalkerParams.threadHeight = thrdSpaceHeight;
3895     halKernelParam->gpgpuWalkerParams.threadWidth = thrdSpaceWidth;
3896     halKernelParam->gpgpuWalkerParams.threadDepth = thrdSpaceDepth;
3897     //Get SLM size
3898     halKernelParam->slmSize = GetSLMSize();
3899 
3900     //Get spill area to adjust scratch space
3901     halKernelParam->spillSize = GetSpillMemUsed();
3902 
3903     //Set Barrier mode
3904     halKernelParam->barrierMode = m_barrierMode;
3905     halKernelParam->numberThreadsInGroup = thrdSpaceWidth * thrdSpaceHeight * thrdSpaceDepth;
3906     if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3907         kernelCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 4) + CM_GPUWALKER_IMPLICIT_ARG_NUM * sizeof(uint32_t);
3908     else
3909         kernelCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 4);
3910     if ((kernelCurbeSize % 32) == 4) //The per-thread data occupy 2 GRF.
3911     {
3912         halKernelParam->curbeSizePerThread = 64;
3913     }
3914     else
3915     {
3916         halKernelParam->curbeSizePerThread = 32;
3917     }
3918     if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3919         halKernelParam->totalCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 32) - halKernelParam->curbeSizePerThread + halKernelParam->curbeSizePerThread *
3920             thrdSpaceWidth * thrdSpaceHeight;
3921         //Since the CURBE is 32 bytes alignment, for GPGPU walker without the user specified thread argument, implicit per-thread id arguments will occupy at most 32 bytes
3922         halKernelParam->crossThreadConstDataLen = MOS_ALIGN_CEIL(kernelCurbeSize, 32) - halKernelParam->curbeSizePerThread;
3923     }
3924     else {
3925         halKernelParam->totalCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 32) + halKernelParam->curbeSizePerThread *
3926             thrdSpaceWidth * thrdSpaceHeight * thrdSpaceDepth;
3927         //Since the CURBE is 32 bytes alignment, for GPGPU walker without the user specified thread argument, implicit per-thread id arguments will occupy at most 32 bytes
3928         halKernelParam->crossThreadConstDataLen = MOS_ALIGN_CEIL(kernelCurbeSize, 32);
3929     }
3930     halKernelParam->payloadSize = 0; // no thread arg allowed
3931 
3932     // adjust payloadOffset of local id for visa3.3+
3933     if (adjustLocalIdPayloadOffset)
3934     {
3935         halKernelParam->argParams[halKernelParam->localIdIndex].payloadOffset = halKernelParam->crossThreadConstDataLen;
3936     }
3937 
3938     m_sizeInCurbe = GetAlignedCurbeSize(halKernelParam->totalCurbeSize);
3939 
3940     CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelIndirectData(&halKernelParam->indirectDataParam));
3941 
3942     if (m_samplerBtiCount != 0)
3943     {
3944         CmSafeMemCopy((void*)halKernelParam->samplerBTIParam.samplerInfo, (void*)m_samplerBtiEntry, sizeof(m_samplerBtiEntry));
3945         halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
3946 
3947         CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
3948         m_samplerBtiCount = 0;
3949     }
3950 
3951     CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces);
3952 
3953     UpdateKernelDataGlobalSurfaceInfo(halKernelParam);
3954 
3955     //Destroy Temp Args
3956     for (uint32_t j = 0; j < numArgs; j++)
3957     {
3958         if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
3959         {
3960             MosSafeDeleteArray(tempArgs[j].value);
3961         }
3962     }
3963     MosSafeDeleteArray(tempArgs);
3964 
3965     CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
3966 finish:
3967     if (hr != CM_SUCCESS)
3968     {
3969         //Clean allocated memory : need to count the implicit args
3970         if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3971 
3972             for (uint32_t i = 0; i < numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM; i++)
3973             {
3974                 if (halKernelParam)
3975                 {
3976                     if (halKernelParam->argParams[i].firstValue)
3977                     {
3978                         MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
3979                     }
3980                 }
3981             }
3982         }
3983         else
3984         {
3985             for (uint32_t i = 0; i < numArgs; i++)
3986             {
3987                 if (halKernelParam)
3988                 {
3989                     if (halKernelParam->argParams[i].firstValue)
3990                     {
3991                         MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
3992                     }
3993                 }
3994             }
3995         }
3996         //Destroy Temp Args in failing case
3997         if (tempArgs)
3998         {
3999             for (uint32_t j = 0; j < numArgs; j++)
4000             {
4001                 if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
4002                 {
4003                     MosSafeDeleteArray(tempArgs[j].value);
4004                 }
4005             }
4006             MosSafeDeleteArray(tempArgs);
4007         }
4008     }
4009     return hr;
4010 }
4011 
4012 //*-----------------------------------------------------------------------------
//| Purpose:    Checks whether the batch buffer can be reused for this kernel
//| Returns:    True if the batch buffer is reusable, false otherwise.
4015 //*-----------------------------------------------------------------------------
IsBatchBufferReusable(CmThreadSpaceRT * taskThreadSpace)4016 bool CmKernelRT::IsBatchBufferReusable( CmThreadSpaceRT * taskThreadSpace )
4017 {
4018     bool reusable = true;
4019     //Update m_id if the batch buffer is not reusable.
4020     if (m_dirty & CM_KERNEL_DATA_THREAD_ARG_DIRTY)
4021     {
4022         reusable = false; // if thread arg dirty
4023     }
4024     else if ((m_dirty & CM_KERNEL_DATA_KERNEL_ARG_DIRTY) && (m_curbeEnabled == false))
4025     {
4026         reusable = false; // if kernel arg dirty and curbe disabled
4027     }
4028     else if (m_dirty & CM_KERNEL_DATA_THREAD_COUNT_DIRTY)
4029     {
4030         reusable = false; // if thread count dirty
4031     }
4032     else if (m_threadSpace)
4033     {
4034        if (m_threadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DATA_DIRTY)
4035        {
4036           reusable = false; // if per kernel thread space exists and it is completely dirty
4037        }
4038     }
4039     else if (taskThreadSpace)
4040     {
4041        if (taskThreadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DATA_DIRTY)
4042        {
4043           reusable = false; // if per task thread space change and it is completely dirty
4044        }
4045     }
4046     return reusable;
4047 
4048 }
4049 
4050 //*-----------------------------------------------------------------------------
4051 //| Purpose:    Checks to see if kernel prologue has changed
4052 //| Returns:    Result of the operation.
4053 //*-----------------------------------------------------------------------------
IsPrologueDirty(void)4054 bool CmKernelRT::IsPrologueDirty( void )
4055 {
4056     bool prologueDirty = false;
4057 
4058     if( m_threadCount != m_lastThreadCount )
4059     {
4060         if( m_lastThreadCount )
4061         {
4062             if( m_threadCount == 1 || m_lastThreadCount == 1 )
4063             {
4064                 prologueDirty = true;
4065             }
4066         }
4067         m_lastThreadCount = m_threadCount;
4068     }
4069 
4070     if( m_adjustScoreboardY != m_lastAdjustScoreboardY )
4071     {
4072         if( m_lastAdjustScoreboardY )
4073         {
4074             prologueDirty = true;
4075         }
4076         m_lastAdjustScoreboardY = m_adjustScoreboardY;
4077     }
4078 
4079     return prologueDirty;
4080 }
4081 
4082 //*-----------------------------------------------------------------------------
4083 //| Purpose:    Prepare Kernel Data including thread args, kernel args
4084 //| Returns:    Result of the operation.
4085 //*-----------------------------------------------------------------------------
CreateKernelDataInternal(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadSpaceRT * threadSpace)4086 int32_t CmKernelRT::CreateKernelDataInternal(
4087     CmKernelData* & kernelData,  // out
4088     uint32_t& kernelDataSize,         // out
4089     const CmThreadSpaceRT* threadSpace )    // in
4090 {
4091     PCM_HAL_KERNEL_PARAM  halKernelParam       = nullptr;
4092     int32_t               hr                    = CM_SUCCESS;
4093     uint32_t              movInstNum            = 0;
4094     uint32_t              kernelCurbeSize          = 0;
4095     uint32_t              numArgs               = 0;
4096     uint32_t              bottomRange         = 1024;
4097     uint32_t              upRange             = 0;
4098     uint32_t              unitSize              = 0;
4099     bool                  hasThreadArg          = false;
4100     CmThreadSpaceRT         *cmThreadSpace       = nullptr;
4101     bool                  isKernelThreadSpace   = false;
4102     CM_ARG                *tempArgs            = nullptr;
4103     uint32_t              argSize               = 0;
4104     uint32_t              surfNum               = 0; //Pass needed BT entry numbers to HAL CM
4105     CmKernelRT             *cmKernel             = nullptr;
4106 
4107     if( threadSpace == nullptr && m_threadSpace!= nullptr)
4108     {
4109         cmThreadSpace = m_threadSpace;
4110         isKernelThreadSpace = true;
4111     }
4112     else
4113     {
4114         cmThreadSpace = const_cast<CmThreadSpaceRT*>(threadSpace);
4115     }
4116 
4117     CM_CHK_CMSTATUS_GOTOFINISH(CmKernelData::Create( this, kernelData ));
4118     halKernelParam = kernelData->GetHalCmKernelData();
4119     CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
4120 
4121     //Get Num of args with surface array
4122     CM_CHK_CMSTATUS_GOTOFINISH(GetArgCountPlusSurfArray(argSize, numArgs));
4123 
4124     if( numArgs > 0)
4125     {
4126         //Create Temp args
4127         CM_CHK_CMSTATUS_GOTOFINISH(CreateTempArgs(numArgs, tempArgs));
4128         //Create move instructions
4129         CM_CHK_CMSTATUS_GOTOFINISH(CreateMovInstructions(movInstNum,   halKernelParam->movInsData, tempArgs, numArgs));
4130     }
4131 
4132     CM_CHK_CMSTATUS_GOTOFINISH(CalcKernelDataSize(movInstNum, numArgs, argSize, kernelDataSize));
4133     CM_CHK_CMSTATUS_GOTOFINISH(kernelData->SetKernelDataSize(kernelDataSize));
4134 
4135     if(!IsBatchBufferReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
4136     {
4137         m_id ++;
4138     }
4139 
4140     if( IsPrologueDirty( ) )
4141     {
4142         // can't re-use kernel binary in GSH
4143         // just update upper 16 bits
4144         uint64_t tempID = m_id;
4145         tempID >>= 48;
4146         tempID++;
4147         tempID <<= 48;
4148         // get rid of old values in upper 16 bits
4149         m_id <<= 16;
4150         m_id >>= 16;
4151         m_id |= tempID;
4152     }
4153 
4154     halKernelParam->clonedKernelParam.isClonedKernel = m_isClonedKernel;
4155     halKernelParam->clonedKernelParam.kernelID       = m_cloneKernelID;
4156     halKernelParam->clonedKernelParam.hasClones      = m_hasClones;
4157     halKernelParam->kernelId           = m_id; // kernel id , high 32-bit is kernel id, low 32-bit is kernel data id for batch buffer reuse
4158     halKernelParam->numArgs             = numArgs;
4159     halKernelParam->numThreads          = m_threadCount;
4160     halKernelParam->kernelBinarySize    = m_binarySize + movInstNum * CM_MOVE_INSTRUCTION_SIZE;
4161     halKernelParam->kernelDataSize      = kernelDataSize;
4162     halKernelParam->movInsDataSize      = movInstNum * CM_MOVE_INSTRUCTION_SIZE;
4163 
4164     halKernelParam->cmFlags             = m_curbeEnabled ? CM_FLAG_CURBE_ENABLED : 0;
4165     halKernelParam->cmFlags            |= m_nonstallingScoreboardEnabled ? CM_FLAG_NONSTALLING_SCOREBOARD_ENABLED : 0;
4166     halKernelParam->kernelDebugEnabled  = m_blhwDebugEnable;
4167 
4168     halKernelParam->kernelBinary        = (uint8_t*)m_binary;
4169 
4170     CM_CHK_CMSTATUS_GOTOFINISH( kernelData->GetCmKernel( cmKernel ) );
4171     if ( cmKernel == nullptr )
4172     {
4173         return CM_NULL_POINTER;
4174     }
4175     MOS_SecureStrcpy( halKernelParam->kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE, cmKernel->GetName() );
4176 
4177     if ( cmThreadSpace )
4178     {// either from per kernel thread space or per task thread space
4179         CM_CHK_CMSTATUS_GOTOFINISH(SortThreadSpace(cmThreadSpace)); // must be called before CreateThreadArgData
4180     }
4181 
4182     for(uint32_t i =0 ; i< numArgs; i++)
4183     {
4184         halKernelParam->argParams[i].unitCount        = tempArgs[ i ].unitCount;
4185         halKernelParam->argParams[i].kind              = (CM_HAL_KERNEL_ARG_KIND)(tempArgs[ i ].unitKind);
4186         halKernelParam->argParams[i].unitSize         = tempArgs[ i ].unitSize;
4187         halKernelParam->argParams[i].payloadOffset    = tempArgs[ i ].unitOffsetInPayload;
4188         halKernelParam->argParams[i].perThread        = (tempArgs[ i ].unitCount > 1) ? true :false;
4189         halKernelParam->argParams[i].nCustomValue      = tempArgs[ i ].nCustomValue;
4190         halKernelParam->argParams[i].aliasIndex       = tempArgs[ i ].aliasIndex;
4191         halKernelParam->argParams[i].aliasCreated     = tempArgs[ i ].aliasCreated;
4192         halKernelParam->argParams[i].isNull           = tempArgs[ i ].isNull;
4193 
4194         CreateThreadArgData(&halKernelParam->argParams[i], i, cmThreadSpace, tempArgs);
4195 
4196         if(CHECK_SURFACE_TYPE ( halKernelParam->argParams[i].kind,
4197             ARG_KIND_SURFACE_VME,
4198             ARG_KIND_SURFACE_SAMPLER,
4199             ARG_KIND_SURFACE2DUP_SAMPLER))
4200         {
4201             unitSize = CM_ARGUMENT_SURFACE_SIZE;
4202         }
4203         else
4204         {
4205             unitSize = halKernelParam->argParams[i].unitSize;
4206         }
4207 
4208         if (halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE)
4209         {
4210             if(IsKernelArg(halKernelParam->argParams[i]))
4211             {
4212                 // Kernel arg : calculate curbe size & adjust payloadoffset
4213                 // Note: Here the payloadOffset may be different from original value
4214                 uint32_t offset = halKernelParam->argParams[i].payloadOffset - CM_PAYLOAD_OFFSET;
4215                 if (offset >= kernelCurbeSize)
4216                 {
4217                     kernelCurbeSize = offset + unitSize;
4218                 }
4219                 halKernelParam->argParams[i].payloadOffset -= CM_PAYLOAD_OFFSET;
4220             }
4221         }
4222 
4223         if(!IsKernelArg(halKernelParam->argParams[i]))
4224         {   //Thread arg : Calculate payload size & adjust payloadoffset
4225             hasThreadArg  = true;
4226             halKernelParam->argParams[i].payloadOffset -= CM_PAYLOAD_OFFSET;
4227 
4228             if(halKernelParam->argParams[i].payloadOffset < bottomRange)
4229             {
4230                bottomRange = halKernelParam->argParams[i].payloadOffset;
4231             }
4232             if(halKernelParam->argParams[i].payloadOffset >=  upRange)
4233             {
4234                upRange = halKernelParam->argParams[i].payloadOffset + unitSize;
4235             }
4236         }
4237     }
4238 
4239     if ( m_stateBufferBounded != CM_STATE_BUFFER_NONE )
4240     {
4241         PCM_CONTEXT_DATA cmData = ( PCM_CONTEXT_DATA )m_device->GetAccelData();
4242         PCM_HAL_STATE state = cmData->cmHalState;
4243         kernelCurbeSize = state->pfnGetStateBufferSizeForKernel( state, this );
4244         halKernelParam->stateBufferType = state->pfnGetStateBufferTypeForKernel( state, this );
4245     }
4246 
4247     halKernelParam->payloadSize         = hasThreadArg ? MOS_ALIGN_CEIL(upRange -  bottomRange, 4): 0;
4248     halKernelParam->totalCurbeSize      = MOS_ALIGN_CEIL(kernelCurbeSize, 32);
4249     halKernelParam->curbeSizePerThread  = halKernelParam->totalCurbeSize;
4250 
4251     halKernelParam->perThreadArgExisted = hasThreadArg;
4252 
4253     m_sizeInCurbe = GetAlignedCurbeSize( kernelCurbeSize );
4254 
4255     if ( halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE )
4256     {
4257         for(uint32_t i=0; i< numArgs; i++)
4258         {
4259             if(!IsKernelArg(halKernelParam->argParams[i]))
4260             {  // thread arg: need to minus curbe size
4261                 halKernelParam->argParams[i].payloadOffset -= halKernelParam->curbeSizePerThread;
4262             }
4263         }
4264     }
4265 
4266     //Create indirect data
4267     CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelIndirectData(&halKernelParam->indirectDataParam));
4268 
4269     if ( m_samplerBtiCount != 0 )
4270     {
4271         CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
4272         halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
4273 
4274         CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
4275         m_samplerBtiCount = 0;
4276     }
4277 
4278     CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces);
4279 
4280     //Create thread space param: only avaliable if per kernel ts exists
4281     if(m_threadSpace)
4282     {
4283         CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadSpaceParam(&halKernelParam->kernelThreadSpaceParam, m_threadSpace));
4284     }
4285 
4286     //Get SLM size
4287     halKernelParam->slmSize = GetSLMSize();
4288 
4289     //Get Spill mem used
4290     halKernelParam->spillSize = GetSpillMemUsed();
4291 
4292     //Set Barrier mode
4293     halKernelParam->barrierMode = m_barrierMode;
4294 
4295     CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));
4296 
4297     //Destroy Temp Args
4298     for (uint32_t j = 0; j < numArgs; j++)
4299     {
4300         if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
4301         {
4302             MosSafeDeleteArray(tempArgs[j].value);
4303         }
4304     }
4305     MosSafeDeleteArray( tempArgs );
4306 
4307     CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
4308 finish:
4309     if(hr != CM_SUCCESS)
4310     {
4311          if(halKernelParam)
4312          {
4313              //Clean allocated memory
4314              for(uint32_t i =0 ; i< numArgs; i++)
4315              {
4316                 if( halKernelParam->argParams[i].firstValue )
4317                 {
4318                     MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
4319                 }
4320              }
4321          }
4322 
4323          //Destroy Temp Args
4324          if (tempArgs)
4325          {
4326              for (uint32_t j = 0; j < numArgs; j++)
4327              {
4328                  if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
4329                  {
4330                      MosSafeDeleteArray(tempArgs[j].value);
4331                  }
4332              }
4333              MosSafeDeleteArray(tempArgs);
4334          }
4335     }
4336     return hr;
4337 }
4338 
4339 //*-----------------------------------------------------------------------------
4340 //| Purpose:    Update kernel data's kernel arg, thread arg, thread count
4341 //| Returns:    Result of the operation.
4342 //*-----------------------------------------------------------------------------
UpdateKernelData(CmKernelData * kernelData,const CmThreadSpaceRT * threadSpace)4343 int32_t CmKernelRT::UpdateKernelData(
4344     CmKernelData*   kernelData,  // in
4345     const CmThreadSpaceRT* threadSpace)
4346 {
4347     int32_t               hr                      = CM_SUCCESS;
4348     PCM_HAL_KERNEL_PARAM  halKernelParam         = nullptr;
4349     bool                  bbResuable             = true;
4350     CmThreadSpaceRT         *cmThreadSpace         = nullptr;
4351     bool                  isKernelThreadSpace     = false;
4352     uint32_t              argIndexStep            = 0;
4353     uint32_t              argIndex                = 0;
4354     uint32_t              surfNum                 = 0; //Update Number of surface used by kernel
4355 
4356     if( threadSpace == nullptr && m_threadSpace!= nullptr)
4357     {
4358         cmThreadSpace = m_threadSpace;
4359         isKernelThreadSpace = true;
4360     }
4361     else
4362     {
4363         cmThreadSpace = const_cast<CmThreadSpaceRT*>(threadSpace);
4364     }
4365 
4366     CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
4367     CM_ASSERT(kernelData->IsInUse() == false);
4368 
4369     halKernelParam = kernelData->GetHalCmKernelData();
4370     CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
4371 
4372     if(!IsBatchBufferReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
4373     {
4374         m_id ++;
4375         halKernelParam->kernelId = m_id;
4376     }
4377 
4378     //Update arguments
4379     for(uint32_t orgArgIndex =0 ; orgArgIndex< m_argCount; orgArgIndex++)
4380     {
4381         argIndexStep = 1;
4382 
4383         if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4384                         ARG_KIND_SURFACE,
4385                         ARG_KIND_SURFACE_1D,
4386                         ARG_KIND_SURFACE_2D,
4387                         ARG_KIND_SURFACE_2D_UP,
4388                         ARG_KIND_SURFACE_SAMPLER,
4389                         ARG_KIND_SURFACE2DUP_SAMPLER,
4390                         ARG_KIND_SURFACE_3D,
4391                         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4392                         ARG_KIND_SURFACE_SAMPLER8X8_VA,
4393                         ARG_KIND_SURFACE_2D_SCOREBOARD,
4394                         ARG_KIND_STATE_BUFFER ) )
4395         {
4396             argIndexStep = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array exists
4397         }
4398         else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind,  ARG_KIND_SURFACE_VME))
4399         {
4400             argIndexStep = m_args[orgArgIndex].unitVmeArraySize;
4401         }
4402 
4403         if(m_args[ orgArgIndex ].isDirty)
4404         {
4405             if(m_args[ orgArgIndex ].unitCount > 1)
4406             { // thread arg is dirty
4407                 bbResuable          = false;
4408             }
4409 
4410             if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4411                         ARG_KIND_SURFACE,
4412                         ARG_KIND_SURFACE_1D,
4413                         ARG_KIND_SURFACE_2D,
4414                         ARG_KIND_SURFACE_2D_UP,
4415                         ARG_KIND_SURFACE_SAMPLER,
4416                         ARG_KIND_SURFACE2DUP_SAMPLER,
4417                         ARG_KIND_SURFACE_3D,
4418                         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4419                         ARG_KIND_SURFACE_SAMPLER8X8_VA,
4420                         ARG_KIND_SURFACE_2D_SCOREBOARD,
4421                         ARG_KIND_STATE_BUFFER ) )
4422             {  // for surface args
4423 
4424                 uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
4425                 if(m_args[ orgArgIndex ].unitCount ==  1) // kernel arg
4426                 {
4427                     if (numSurfaces > 1)
4428                     {
4429                         for (uint32_t kk = 0; kk < numSurfaces; kk++)
4430                         {
4431                             CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
4432                             CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4433                                 m_args[orgArgIndex].value + kk*sizeof(uint32_t), sizeof(uint32_t));
4434                             halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4435                             halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4436                             halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4437 
4438                             if (!m_args[orgArgIndex].surfIndex[kk])
4439                             {
4440                                 //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
4441                                 //This is for special usage if there is empty element in surface array.
4442                                 halKernelParam->argParams[argIndex + kk].kind = CM_ARGUMENT_SURFACE2D;
4443                                 continue;
4444                             }
4445 
4446                             halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].surfArrayArg[kk].argKindForArray;
4447                             halKernelParam->argParams[argIndex + kk].nCustomValue = m_args[orgArgIndex].surfArrayArg[kk].addressModeForArray;
4448                         }
4449                     }
4450                     else
4451                     {
4452                         CM_ASSERT(halKernelParam->argParams[argIndex].firstValue != nullptr);
4453                         CmSafeMemCopy(halKernelParam->argParams[argIndex].firstValue,
4454                                 m_args[ orgArgIndex ].value, sizeof(uint32_t));
4455                         halKernelParam->argParams[argIndex].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[ orgArgIndex ].unitKind;
4456                         halKernelParam->argParams[argIndex].aliasIndex   = m_args[orgArgIndex].aliasIndex;
4457                         halKernelParam->argParams[argIndex].aliasCreated = m_args[orgArgIndex].aliasCreated;
4458                         halKernelParam->argParams[argIndex].isNull = m_args[orgArgIndex].isNull;
4459                     }
4460 
4461                  }
4462                  else // thread arg
4463                  {
4464                     uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
4465                     uint32_t *surfaces = (uint32_t *)MOS_NewArray(uint8_t, (sizeof(uint32_t) * m_args[orgArgIndex].unitCount));
4466                     CM_CHK_NULL_GOTOFINISH(surfaces, CM_OUT_OF_HOST_MEMORY);
4467                     for (uint32_t kk=0;  kk< numSurfaces ; kk++)
4468                     {
4469                         for (uint32_t s = 0; s < m_args[orgArgIndex].unitCount; s++)
4470                         {
4471                             surfaces[s] = *(uint32_t *)((uint32_t *)m_args[orgArgIndex].value + kk + numSurfaces * s);
4472                         }
4473                         CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4474                             surfaces, sizeof(uint32_t) * m_args[orgArgIndex].unitCount);
4475 
4476                         halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[ orgArgIndex ].unitKind;
4477 
4478                         halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4479                         halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4480                         halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4481 
4482                     }
4483                     MosSafeDeleteArray(surfaces);
4484                  }
4485 
4486             }
4487             else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4488             {
4489                 uint32_t numSurfaces = m_args[orgArgIndex].unitVmeArraySize;
4490                 if (m_args[orgArgIndex].unitCount == 1) // kernel arg
4491                 {
4492                     uint32_t vmeSurfOffset = 0;
4493                     for (uint32_t kk = 0; kk< numSurfaces; kk++)
4494                     {
4495                         uint16_t vmeSize = (uint16_t)getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[orgArgIndex].value + vmeSurfOffset));
4496 
4497                         // reallocate the firstValue for VME surface every time
4498                         // since the number of surfaces may vary
4499                         MosSafeDeleteArray(halKernelParam->argParams[argIndex + kk].firstValue);
4500                         halKernelParam->argParams[argIndex + kk].firstValue = MOS_NewArray(uint8_t, vmeSize);
4501                         CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
4502                         CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4503                             m_args[orgArgIndex].value + vmeSurfOffset, vmeSize);
4504 
4505                         halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;
4506 
4507                         halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4508                         halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4509                         halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4510                         halKernelParam->argParams[argIndex + kk].unitSize = vmeSize;
4511                         vmeSurfOffset += vmeSize;
4512                     }
4513                 }
4514             }
4515             else
4516             {
4517                 CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadArgData(&halKernelParam->argParams[argIndex ], orgArgIndex, cmThreadSpace, m_args));
4518             }
4519         }
4520         argIndex += argIndexStep;
4521     }
4522 
4523     //Update Thread space param
4524     if(m_threadSpace && m_threadSpace->GetDirtyStatus())
4525     {
4526 
4527         CM_CHK_CMSTATUS_GOTOFINISH(SortThreadSpace(m_threadSpace));
4528 
4529         uint32_t threadSpaceWidth = 0, threadSpaceHeight = 0;
4530         PCM_HAL_KERNEL_THREADSPACE_PARAM  cmKernelThreadSpaceParam = &halKernelParam->kernelThreadSpaceParam;
4531         m_threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
4532 
4533         cmKernelThreadSpaceParam->threadSpaceWidth  = (uint16_t)threadSpaceWidth;
4534         cmKernelThreadSpaceParam->threadSpaceHeight = (uint16_t)threadSpaceHeight;
4535         m_threadSpace->GetDependencyPatternType(cmKernelThreadSpaceParam->patternType);
4536         m_threadSpace->GetWalkingPattern(cmKernelThreadSpaceParam->walkingPattern);
4537         m_threadSpace->GetColorCountMinusOne(cmKernelThreadSpaceParam->colorCountMinusOne);
4538 
4539         CM_HAL_DEPENDENCY*     dependency = nullptr;
4540         m_threadSpace->GetDependency( dependency);
4541 
4542         if(dependency != nullptr)
4543         {
4544             CmSafeMemCopy(&cmKernelThreadSpaceParam->dependencyInfo, dependency, sizeof(CM_HAL_DEPENDENCY));
4545         }
4546 
4547         if( m_threadSpace->CheckWalkingParametersSet() )
4548         {
4549             CM_CHK_CMSTATUS_GOTOFINISH(m_threadSpace->GetWalkingParameters(cmKernelThreadSpaceParam->walkingParams));
4550         }
4551 
4552         if( m_threadSpace->CheckDependencyVectorsSet() )
4553         {
4554             CM_CHK_CMSTATUS_GOTOFINISH(m_threadSpace->GetDependencyVectors(cmKernelThreadSpaceParam->dependencyVectors));
4555         }
4556 
4557         if(m_threadSpace->IsThreadAssociated())
4558         {// media object only
4559             uint32_t *boardOrder = nullptr;
4560             m_threadSpace->GetBoardOrder(boardOrder);
4561             CM_CHK_NULL_GOTOFINISH_CMERROR(boardOrder);
4562 
4563             CM_THREAD_SPACE_UNIT *threadSpaceUnit = nullptr;
4564             m_threadSpace->GetThreadSpaceUnit(threadSpaceUnit);
4565             CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpaceUnit);
4566 
4567             cmKernelThreadSpaceParam->reuseBBUpdateMask = 0;
4568             for(uint32_t i=0; i< threadSpaceWidth * threadSpaceHeight ; i++)
4569             {
4570                 cmKernelThreadSpaceParam->threadCoordinates[i].x = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.x;
4571                 cmKernelThreadSpaceParam->threadCoordinates[i].y = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.y;
4572                 cmKernelThreadSpaceParam->threadCoordinates[i].mask = threadSpaceUnit[boardOrder[i]].dependencyMask;
4573                 cmKernelThreadSpaceParam->threadCoordinates[i].resetMask = threadSpaceUnit[boardOrder[i]].reset;
4574                 cmKernelThreadSpaceParam->threadCoordinates[i].color = threadSpaceUnit[boardOrder[i]].scoreboardColor;
4575                 cmKernelThreadSpaceParam->threadCoordinates[i].sliceSelect = threadSpaceUnit[boardOrder[i]].sliceDestinationSelect;
4576                 cmKernelThreadSpaceParam->threadCoordinates[i].subSliceSelect = threadSpaceUnit[boardOrder[i]].subSliceDestinationSelect;
4577                 cmKernelThreadSpaceParam->reuseBBUpdateMask |= threadSpaceUnit[boardOrder[i]].reset;
4578             }
4579 
4580             if( cmKernelThreadSpaceParam->patternType == CM_WAVEFRONT26Z )
4581             {
4582                 CM_HAL_WAVEFRONT26Z_DISPATCH_INFO dispatchInfo;
4583                 m_threadSpace->GetWavefront26ZDispatchInfo(dispatchInfo);
4584 
4585                 if (cmKernelThreadSpaceParam->dispatchInfo.numWaves >= dispatchInfo.numWaves)
4586                 {
4587                     cmKernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
4588                     CmSafeMemCopy(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));
4589                 }
4590                 else
4591                 {
4592                     cmKernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
4593                     MosSafeDeleteArray(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave);
4594                     cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave = MOS_NewArray(uint32_t, dispatchInfo.numWaves);
4595                     CM_CHK_NULL_GOTOFINISH(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, CM_OUT_OF_HOST_MEMORY);
4596                     CmSafeMemCopy(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));
4597                 }
4598             }
4599         }
4600     }
4601 
4602     // Update indirect data
4603     if( m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY)
4604     {
4605         halKernelParam->indirectDataParam.indirectDataSize = m_usKernelPayloadDataSize;
4606         halKernelParam->indirectDataParam.surfaceCount     = m_usKernelPayloadSurfaceCount;
4607 
4608         if(m_usKernelPayloadDataSize != 0)
4609         {
4610             if(m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY)
4611             { // size change, need to reallocate
4612                 MosSafeDeleteArray(halKernelParam->indirectDataParam.indirectData);
4613                 halKernelParam->indirectDataParam.indirectData = MOS_NewArray(uint8_t, m_usKernelPayloadDataSize);
4614                 CM_CHK_NULL_GOTOFINISH(halKernelParam->indirectDataParam.indirectData, CM_OUT_OF_HOST_MEMORY);
4615             }
4616             CmSafeMemCopy(halKernelParam->indirectDataParam.indirectData, (void *)m_kernelPayloadData, m_usKernelPayloadDataSize);
4617         }
4618 
4619         if(m_usKernelPayloadSurfaceCount != 0)
4620         {
4621             if(m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY)
4622             { // size change, need to reallocate
4623                 MosSafeDeleteArray(halKernelParam->indirectDataParam.surfaceInfo);
4624                 halKernelParam->indirectDataParam.surfaceInfo = MOS_NewArray(CM_INDIRECT_SURFACE_INFO, m_usKernelPayloadSurfaceCount);
4625                 CM_CHK_NULL_GOTOFINISH(halKernelParam->indirectDataParam.surfaceInfo, CM_OUT_OF_HOST_MEMORY);
4626 
4627             }
4628             CmSafeMemCopy((void*)halKernelParam->indirectDataParam.surfaceInfo, (void*)m_IndirectSurfaceInfoArray,
4629                              m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4630             //clear m_IndirectSurfaceInfoArray every enqueue
4631             CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4632             m_usKernelPayloadSurfaceCount = 0;
4633         }
4634     }
4635 
4636     if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
4637     {
4638         if ( m_samplerBtiCount != 0 )
4639         {
4640             CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
4641             halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
4642 
4643             CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
4644             m_samplerBtiCount = 0;
4645         }
4646     }
4647     CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));
4648 
4649     CM_CHK_CMSTATUS_GOTOFINISH(CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces));
4650 
4651     CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
4652 
4653 finish:
4654     if( hr != CM_SUCCESS)
4655     {
4656         if( halKernelParam )
4657         {
4658             MosSafeDeleteArray(halKernelParam->indirectDataParam.indirectData);
4659             MosSafeDeleteArray(halKernelParam->indirectDataParam.surfaceInfo);
4660         }
4661     }
4662     return hr;
4663 }
4664 
4665 //*-----------------------------------------------------------------------------
4666 //| Purpose:    Update kernel data's kernel arg, thread arg, thread count
4667 //| Returns:    Result of the operation.
4668 //*-----------------------------------------------------------------------------
UpdateKernelData(CmKernelData * kernelData,const CmThreadGroupSpace * threadGroupSpace)4669 int32_t CmKernelRT::UpdateKernelData(
4670     CmKernelData*   kernelData,  // in
4671     const CmThreadGroupSpace* threadGroupSpace )    // in
4672 {
4673     int32_t               hr                      = CM_SUCCESS;
4674     PCM_HAL_KERNEL_PARAM  halKernelParam         = nullptr;
4675     uint32_t              argIndexStep            = 0;
4676     uint32_t              argIndex                = 0;
4677     uint32_t              surfNum                 = 0;
4678     auto getVersionAsInt = [](int major, int minor) { return major * 100 + minor; };
4679 
4680     CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
4681     CM_ASSERT(kernelData->IsInUse() == false);
4682 
4683     halKernelParam = kernelData->GetHalCmKernelData();
4684     CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
4685 
4686     CM_CHK_NULL_GOTOFINISH_CMERROR(threadGroupSpace);
4687 
4688     //Update arguments
4689     for(uint32_t orgArgIndex =0 ; orgArgIndex< m_argCount; orgArgIndex++)
4690     {
4691         argIndexStep = 1;
4692 
4693         if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4694                         ARG_KIND_SURFACE,
4695                         ARG_KIND_SURFACE_1D,
4696                         ARG_KIND_SURFACE_2D,
4697                         ARG_KIND_SURFACE_2D_UP,
4698                         ARG_KIND_SURFACE_SAMPLER,
4699                         ARG_KIND_SURFACE2DUP_SAMPLER,
4700                         ARG_KIND_SURFACE_3D,
4701                         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4702                         ARG_KIND_SURFACE_SAMPLER8X8_VA,
4703                         ARG_KIND_SURFACE_2D_SCOREBOARD,
4704                         ARG_KIND_STATE_BUFFER ) )
4705         {
4706             argIndexStep = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array exists
4707         }
4708         else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4709         {
4710             argIndexStep = m_args[orgArgIndex].unitVmeArraySize;
4711         }
4712 
4713         if(m_args[ orgArgIndex ].isDirty)
4714         {
4715             if(m_args[ orgArgIndex ].unitCount > 1)
4716             { // thread arg is dirty
4717                 CM_ASSERTMESSAGE("Error: Thread arg is not allowed in GPGPU walker.");
4718                 hr = CM_FAILURE; // Thread arg is not allowed in GPGPU walker
4719                 goto finish;
4720             }
4721 
4722             if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4723                         ARG_KIND_SURFACE,
4724                         ARG_KIND_SURFACE_1D,
4725                         ARG_KIND_SURFACE_2D,
4726                         ARG_KIND_SURFACE_2D_UP,
4727                         ARG_KIND_SURFACE_SAMPLER,
4728                         ARG_KIND_SURFACE2DUP_SAMPLER,
4729                         ARG_KIND_SURFACE_3D,
4730                         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4731                         ARG_KIND_SURFACE_SAMPLER8X8_VA,
4732                         ARG_KIND_SURFACE_2D_SCOREBOARD,
4733                         ARG_KIND_STATE_BUFFER ) )
4734             {  // for surface args
4735                 uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
4736                 if(m_args[ orgArgIndex ].unitCount ==  1) // kernel arg
4737                 {
4738                     if (numSurfaces > 1 )
4739                     {
4740                         for(uint32_t kk=0;  kk< numSurfaces ; kk++)
4741                         {
4742                             CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue
4743                                       != nullptr);
4744                             CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4745                                           m_args[ orgArgIndex ].value + kk*sizeof(uint32_t),
4746                                           sizeof(uint32_t));
4747                             halKernelParam->argParams[argIndex + kk].aliasIndex
4748                                     = m_args[orgArgIndex].aliasIndex;
4749                             halKernelParam->argParams[argIndex + kk].aliasCreated
4750                                     = m_args[orgArgIndex].aliasCreated;
4751                             halKernelParam->argParams[argIndex + kk].isNull
4752                                     = m_args[orgArgIndex].isNull;
4753 
4754                             if (!m_args[orgArgIndex].surfIndex[kk])
4755                             {
4756                                 //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
4757                                 //This is for special usage if there is empty element in surface array.
4758                                 halKernelParam->argParams[argIndex + kk].kind = CM_ARGUMENT_SURFACE2D;
4759                                 continue;
4760                             }
4761                             halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4762                             halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].surfArrayArg[kk].argKindForArray;
4763                             halKernelParam->argParams[argIndex + kk].nCustomValue = m_args[orgArgIndex].surfArrayArg[kk].addressModeForArray;
4764 
4765                         }
4766                     }
4767                     else
4768                     {
4769                         CM_ASSERT(halKernelParam->argParams[argIndex].firstValue != nullptr);
4770                         halKernelParam->argParams[argIndex].kind
4771                                 = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;
4772                         halKernelParam->argParams[argIndex].aliasIndex
4773                                 = m_args[orgArgIndex].aliasIndex;
4774                         halKernelParam->argParams[argIndex].aliasCreated
4775                                 = m_args[orgArgIndex].aliasCreated;
4776                         halKernelParam->argParams[argIndex].isNull
4777                                 = m_args[orgArgIndex].isNull;
4778                         if (halKernelParam->argParams[argIndex].isNull)
4779                         {
4780                             *(halKernelParam->argParams[argIndex].firstValue)
4781                                     = 0;
4782                         }
4783                         else
4784                         {
4785                             CmSafeMemCopy(
4786                                 halKernelParam->argParams[argIndex].firstValue,
4787                                 m_args[orgArgIndex].value, sizeof(uint32_t));
4788                         }
4789                     }
4790                 }
4791             }
4792             else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4793             {
4794                 uint32_t numSurfaces = m_args[orgArgIndex].unitVmeArraySize;
4795                 if (m_args[orgArgIndex].unitCount == 1) // kernel arg
4796                 {
4797                     uint32_t vmeSurfOffset = 0;
4798                     for (uint32_t kk = 0; kk< numSurfaces; kk++)
4799                     {
4800                         uint32_t vmeSize = getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[orgArgIndex].value + vmeSurfOffset));
4801 
4802                         // reallocate the firstValue for VME surface every time
4803                         // since the number of surfaces may vary
4804                         MosSafeDeleteArray(halKernelParam->argParams[argIndex + kk].firstValue);
4805                         halKernelParam->argParams[argIndex + kk].firstValue = MOS_NewArray(uint8_t, vmeSize);
4806                         CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
4807                         CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4808                             m_args[orgArgIndex].value + vmeSurfOffset, vmeSize);
4809 
4810                         halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;
4811 
4812                         halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4813                         halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4814                         halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4815                         halKernelParam->argParams[argIndex + kk].unitSize = m_args[orgArgIndex].unitSize;
4816                         vmeSurfOffset += vmeSize;
4817                     }
4818                 }
4819             }
4820             else
4821             {
4822                 CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadArgData(&halKernelParam->argParams[argIndex ], orgArgIndex, nullptr, m_args));
4823             }
4824         }
4825         argIndex += argIndexStep;
4826     }
4827 
4828     if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
4829     {
4830         if ( m_samplerBtiCount != 0 )
4831         {
4832             CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
4833             halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
4834 
4835             CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
4836             m_samplerBtiCount = 0;
4837         }
4838     }
4839 
4840     CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));
4841 
4842     CM_CHK_CMSTATUS_GOTOFINISH(CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces));
4843 
4844     // GPGPU walker - implicit args
4845     uint32_t thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth;
4846     threadGroupSpace->GetThreadGroupSpaceSize(thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth);
4847 
4848     halKernelParam->gpgpuWalkerParams.groupDepth = grpSpaceDepth;
4849     halKernelParam->gpgpuWalkerParams.groupHeight = grpSpaceHeight;
4850     halKernelParam->gpgpuWalkerParams.groupWidth  = grpSpaceWidth;
4851     halKernelParam->gpgpuWalkerParams.threadDepth = thrdSpaceDepth;
4852     halKernelParam->gpgpuWalkerParams.threadWidth  = thrdSpaceWidth;
4853     halKernelParam->gpgpuWalkerParams.threadHeight = thrdSpaceHeight;
4854 
4855     if (getVersionAsInt(m_program->m_cisaMajorVersion, m_program->m_cisaMinorVersion) < getVersionAsInt(3, 3))
4856     {
4857         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 0].firstValue, thrdSpaceWidth));
4858         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 1].firstValue, thrdSpaceHeight));
4859         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 2].firstValue, grpSpaceWidth));
4860         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 3].firstValue, grpSpaceHeight));
4861         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 4].firstValue, thrdSpaceWidth));
4862         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 5].firstValue, thrdSpaceHeight));
4863     }
4864 
4865     CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
4866 finish:
4867     return hr;
4868 }
4869 
4870 //*-----------------------------------------------------------------------------
4871 //| Purpose:    Create kernel indirect data
4872 //| Returns:    Result of the operation.
4873 //*-----------------------------------------------------------------------------
CreateKernelIndirectData(PCM_HAL_INDIRECT_DATA_PARAM halIndirectData)4874 int32_t CmKernelRT::CreateKernelIndirectData(
4875     PCM_HAL_INDIRECT_DATA_PARAM  halIndirectData )    // in/out
4876 {
4877     int32_t hr = CM_SUCCESS;
4878 
4879     halIndirectData->indirectDataSize = m_usKernelPayloadDataSize;
4880     halIndirectData->surfaceCount     = m_usKernelPayloadSurfaceCount;
4881 
4882     if( halIndirectData->indirectData == nullptr &&  m_usKernelPayloadDataSize != 0)
4883     {
4884         halIndirectData->indirectData = MOS_NewArray(uint8_t, halIndirectData->indirectDataSize);
4885         CM_CHK_NULL_GOTOFINISH(halIndirectData->indirectData, CM_OUT_OF_HOST_MEMORY);
4886     }
4887 
4888     // For future kernel data, pKbyte is starting point
4889     if( halIndirectData->surfaceInfo == nullptr &&  m_usKernelPayloadSurfaceCount != 0)
4890     {
4891         halIndirectData->surfaceInfo = MOS_NewArray(CM_INDIRECT_SURFACE_INFO, halIndirectData->surfaceCount);
4892         CM_CHK_NULL_GOTOFINISH(halIndirectData->surfaceInfo, CM_OUT_OF_HOST_MEMORY);
4893     }
4894 
4895     if(m_usKernelPayloadDataSize != 0)
4896     {
4897         CmSafeMemCopy(halIndirectData->indirectData, (void *)m_kernelPayloadData, m_usKernelPayloadDataSize);
4898     }
4899 
4900     if(m_usKernelPayloadSurfaceCount != 0)
4901     {
4902         CmSafeMemCopy((void*)halIndirectData->surfaceInfo, (void*)m_IndirectSurfaceInfoArray,
4903                     m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4904         //clear m_IndirectSurfaceInfoArray every enqueue
4905         CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4906         m_usKernelPayloadSurfaceCount = 0;
4907     }
4908 finish:
4909     if( hr != CM_SUCCESS)
4910     {
4911         if(halIndirectData->indirectData)                 MosSafeDeleteArray(halIndirectData->indirectData);
4912         if(halIndirectData->surfaceInfo)                  MosSafeDeleteArray(halIndirectData->surfaceInfo);
4913     }
4914     return hr;
4915 }
4916 
4917 //*-----------------------------------------------------------------------------
4918 //| Purpose:    UpdateLastKernelData
4919 //| Returns:    Result of the operation.
4920 //*-----------------------------------------------------------------------------
UpdateLastKernelData(CmKernelData * & kernelData)4921 int32_t CmKernelRT::UpdateLastKernelData(
4922     CmKernelData* & kernelData)    // in
4923 {
4924     int32_t hr = CM_SUCCESS;
4925 
4926     if( kernelData == nullptr || m_lastKernelData == kernelData )
4927     {
4928         CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
4929         return CM_NULL_POINTER;
4930     }
4931 
4932     if(m_lastKernelData)
4933     {
4934         CmKernelData::Destroy(m_lastKernelData); // reduce ref count or delete it
4935     }
4936     CSync* kernelLock = m_device->GetProgramKernelLock();
4937     CLock locker(*kernelLock);
4938     m_lastKernelData = kernelData;
4939     m_lastKernelData->Acquire();
4940     m_lastKernelDataSize = m_lastKernelData->GetKernelDataSize();
4941 
4942     return hr;
4943 }
4944 
4945 //*-----------------------------------------------------------------------------
4946 //| Purpose:    Wrapper of  CmKernelData::Destroy.
4947 //| Returns:    Result of the operation.
4948 //*-----------------------------------------------------------------------------
ReleaseKernelData(CmKernelData * & kernelData)4949 int32_t CmKernelRT::ReleaseKernelData(
4950     CmKernelData* & kernelData)
4951 {
4952     int32_t hr = CM_SUCCESS;
4953 
4954     if( kernelData == nullptr)
4955     {
4956         CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
4957         return CM_NULL_POINTER;
4958     }
4959 
4960     CSync* kernelLock = m_device->GetProgramKernelLock();
4961     CLock locker(*kernelLock);
4962 
4963     if(m_lastKernelData == kernelData)
4964     {
4965         // If the kernel data is the last kernel data
4966         // Need to update m_lastKernelData.
4967         hr = CmKernelData::Destroy(m_lastKernelData);
4968     }
4969     else
4970     {
4971         hr = CmKernelData::Destroy(kernelData);
4972     }
4973 
4974     return hr;
4975 }
4976 
4977 //*-----------------------------------------------------------------------------
4978 //| Purpose:   Acquire Kernel and Program
4979 //*-----------------------------------------------------------------------------
AcquireKernelProgram()4980 int32_t CmKernelRT::AcquireKernelProgram()
4981 {
4982     CSync* kernelLock = m_device->GetProgramKernelLock();
4983     CLock locker(*kernelLock);
4984 
4985     this->Acquire(); // increase kernel's ref count
4986     m_program->Acquire(); // increase program's ref count
4987 
4988     return CM_SUCCESS;
4989 }
4990 
4991 //*-----------------------------------------------------------------------------
//| Purpose:   Acquire KernelData, Kernel and Program
4993 //*-----------------------------------------------------------------------------
AcquireKernelData(CmKernelData * & kernelData)4994 int32_t CmKernelRT::AcquireKernelData(
4995     CmKernelData * &kernelData)
4996 {
4997     int32_t hr = CM_SUCCESS;
4998 
4999     if (kernelData == nullptr)
5000     {
5001         CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
5002         return CM_NULL_POINTER;
5003     }
5004 
5005     CSync* kernelLock = m_device->GetProgramKernelLock();
5006     CLock locker(*kernelLock);
5007     kernelData->Acquire(); // increase kernel data's ref count
5008 
5009     return hr;
5010 }
5011 
SetAsClonedKernel(uint32_t cloneKernelID)5012 void CmKernelRT::SetAsClonedKernel(uint32_t cloneKernelID)
5013 {
5014     m_isClonedKernel = true;
5015     m_cloneKernelID = cloneKernelID;
5016 }
5017 
GetCloneKernelID(uint32_t & cloneKernelID)5018 bool CmKernelRT::GetCloneKernelID(uint32_t& cloneKernelID)
5019 {
5020     if (m_isClonedKernel)
5021     {
5022         cloneKernelID = m_cloneKernelID;
5023         return true;
5024     }
5025 
5026     return false;
5027 }
5028 
SetHasClones()5029 void CmKernelRT::SetHasClones()
5030 {
5031     m_hasClones = true;
5032 }
5033 
5034 //*-----------------------------------------------------------------------------
5035 //| Purpose:   Clone/copy current kernel
5036 //| Returns:   New kernel with content of source kernel
5037 //*-----------------------------------------------------------------------------
CloneKernel(CmKernelRT * & kernelOut,uint32_t id)5038 int32_t CmKernelRT::CloneKernel(CmKernelRT *& kernelOut, uint32_t id)
5039 {
5040     int32_t hr = CM_SUCCESS;
5041 
5042     CSync* kernelLock = m_device->GetProgramKernelLock();
5043     CLock locker(*kernelLock);
5044 
5045     CmDynamicArray * kernelArray = m_device->GetKernelArray();
5046 
5047     uint32_t freeSlotinKernelArray = kernelArray->GetFirstFreeIndex();
5048 
5049     hr = Create(m_device, m_program, (char*)GetName(), freeSlotinKernelArray, id, kernelOut, m_options);
5050 
5051     if (hr == CM_SUCCESS)
5052     {
5053         kernelOut->SetAsClonedKernel(m_id >> 32);
5054         kernelArray->SetElement(freeSlotinKernelArray, kernelOut);
5055         uint32_t *kernelCount = m_device->GetKernelCount();
5056         *kernelCount = *kernelCount + 1;
5057 
5058         SetHasClones();
5059     }
5060 
5061     return hr;
5062 }
5063 
5064 //*-----------------------------------------------------------------------------
5065 //| Purpose:    Set Kernel's index in one task
5066 //| Returns:    Result of the operation.
5067 //*-----------------------------------------------------------------------------
SetIndexInTask(uint32_t index)5068 int32_t CmKernelRT::SetIndexInTask(uint32_t index)
5069 {
5070     m_indexInTask = index;
5071     return CM_SUCCESS;
5072 }
5073 
5074 //*-----------------------------------------------------------------------------
5075 //| Purpose:    Get Kernel's index in one task
//| Returns:    The kernel's index in the task (m_indexInTask).
5077 //*-----------------------------------------------------------------------------
GetIndexInTask(void)5078 uint32_t CmKernelRT::GetIndexInTask(void)
5079 {
5080     return m_indexInTask;
5081 }
5082 
5083 //*-----------------------------------------------------------------------------
5084 //| Purpose:    Set Associated Flag
5085 //| Returns:    Result of the operation.
5086 //*-----------------------------------------------------------------------------
SetAssociatedToTSFlag(bool b)5087 int32_t CmKernelRT::SetAssociatedToTSFlag(bool b)
5088 {
5089     m_threadSpaceAssociated = b;
5090     return CM_SUCCESS;
5091 }
5092 
5093 //*-----------------------------------------------------------------------------
5094 //| Purpose: Set threadspace for kernel
5095 //| Returns: Result of the operation.
5096 //| Note: It's exclusive with AssociateThreadGroupSpace()
5097 //*-----------------------------------------------------------------------------
AssociateThreadSpace(CmThreadSpace * & threadSpace)5098 CM_RT_API int32_t CmKernelRT::AssociateThreadSpace(CmThreadSpace *&threadSpace)
5099 {
5100     if( threadSpace == nullptr )
5101     {
5102         CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5103         return CM_INVALID_ARG_VALUE;
5104     }
5105 
5106     PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
5107     if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
5108     {
5109         CmThreadSpaceRT *threadSpaceRTConst = static_cast<CmThreadSpaceRT *>(threadSpace);
5110         if (threadSpaceRTConst == nullptr)
5111         {
5112             CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5113             return CM_INVALID_ARG_VALUE;
5114         }
5115         CmThreadGroupSpace *threadGroupSpace = threadSpaceRTConst->GetThreadGroupSpace();
5116         return AssociateThreadGroupSpace(threadGroupSpace);
5117     }
5118     else
5119     {
5120         if (m_threadGroupSpace != nullptr)
5121         {
5122             CM_ASSERTMESSAGE("Error: It's exclusive with AssociateThreadGroupSpace().");
5123             return CM_INVALID_KERNEL_THREADSPACE;
5124         }
5125     }
5126 
5127     bool threadSpaceChanged = false;
5128     if( m_threadSpace )
5129     {
5130         if( m_threadSpace != static_cast<CmThreadSpaceRT *>(threadSpace) )
5131         {
5132             threadSpaceChanged = true;
5133         }
5134     }
5135 
5136     m_threadSpace = static_cast<CmThreadSpaceRT *>(threadSpace);
5137 
5138     uint32_t threadSpaceWidth = 0;
5139     uint32_t threadSpaceHeight = 0;
5140     m_threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
5141     uint32_t threadCount = threadSpaceWidth * threadSpaceHeight;
5142     if (m_threadCount)
5143     {
5144         // Setting threadCount twice with different values will cause reset of kernels
5145         if (m_threadCount != threadCount)
5146         {
5147             m_threadCount = threadCount;
5148             m_dirty |= CM_KERNEL_DATA_THREAD_COUNT_DIRTY;
5149         }
5150     }
5151     else // first time
5152     {
5153         m_threadCount = threadCount;
5154     }
5155 
5156     if( threadSpaceChanged )
5157     {
5158         m_threadSpace->SetDirtyStatus( CM_THREAD_SPACE_DATA_DIRTY);
5159     }
5160 
5161     return CM_SUCCESS;
5162 }
5163 
5164 //*-----------------------------------------------------------------------------
5165 //| Purpose: Set thread group space for kernel
5166 //| Returns: Result of the operation.
5167 //| Note: It's exclusive with AssociateThreadSpace()
5168 //*-----------------------------------------------------------------------------
AssociateThreadGroupSpace(CmThreadGroupSpace * & threadGroupSpace)5169 CM_RT_API int32_t CmKernelRT::AssociateThreadGroupSpace(CmThreadGroupSpace *&threadGroupSpace)
5170 {
5171     if( threadGroupSpace == nullptr )
5172     {
5173         CM_ASSERTMESSAGE("Error: Invalid null pointer.");
5174         return CM_INVALID_ARG_VALUE;
5175     }
5176 
5177     if (m_threadSpace != nullptr)
5178     {
5179         CM_ASSERTMESSAGE("Error: It's exclusive with AssociateThreadSpace().");
5180         return CM_INVALID_KERNEL_THREADGROUPSPACE;
5181     }
5182 
5183     m_threadGroupSpace = threadGroupSpace;
5184 
5185     return CM_SUCCESS;
5186 }
5187 
5188 //*-----------------------------------------------------------------------------
5189 //| Purpose: Clear threadspace for kernel
5190 //| Returns: Result of the operation.
5191 //*-----------------------------------------------------------------------------
DeAssociateThreadSpace(CmThreadSpace * & threadSpace)5192 CM_RT_API int32_t CmKernelRT::DeAssociateThreadSpace(CmThreadSpace * &threadSpace)
5193 {
5194     if (threadSpace == nullptr)
5195     {
5196         CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5197         return CM_NULL_POINTER;
5198     }
5199 
5200     PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
5201     if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
5202     {
5203         CmThreadSpaceRT *threadSpaceRTConst = static_cast<CmThreadSpaceRT *>(threadSpace);
5204         if (threadSpaceRTConst == nullptr)
5205         {
5206             CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5207             return CM_INVALID_ARG_VALUE;
5208         }
5209 
5210         CmThreadGroupSpace *threadGroupSpace = threadSpaceRTConst->GetThreadGroupSpace();
5211         if (m_threadGroupSpace != threadGroupSpace)
5212         {
5213             CM_ASSERTMESSAGE("Error: Invalid thread group space handle.");
5214             return CM_INVALID_ARG_VALUE;
5215         }
5216         m_threadGroupSpace = nullptr;
5217     }
5218     else
5219     {
5220         if (m_threadSpace != static_cast<CmThreadSpaceRT *>(threadSpace))
5221         {
5222             CM_ASSERTMESSAGE("Error: Invalid thread space handle.");
5223             return CM_INVALID_ARG_VALUE;
5224         }
5225         m_threadSpace = nullptr;
5226     }
5227 
5228     return CM_SUCCESS;
5229 }
5230 //*--------------------------------------------------------------------------------------------
5231 //| Purpose: query spill memory size, the function can only take effect when jitter is enabled
5232 //| Return: Result of the operation.
5233 //*---------------------------------------------------------------------------------------------
5234 
QuerySpillSize(uint32_t & spillMemorySize)5235 CM_RT_API int32_t CmKernelRT::QuerySpillSize(uint32_t &spillMemorySize)
5236 {
5237     CM_KERNEL_INFO  *kernelInfo = nullptr;
5238 
5239     int32_t hr = m_program->GetKernelInfo(m_kernelIndex, kernelInfo);
5240     if (hr != CM_SUCCESS || kernelInfo == nullptr)
5241         return hr;
5242 
5243     if (m_program->IsJitterEnabled()) {
5244         if (kernelInfo->jitInfo != nullptr) {
5245             spillMemorySize = (kernelInfo->jitInfo)->spillMemUsed;
5246             return hr;
5247         }
5248         else
5249             return CM_FAILURE;
5250     }
5251 
5252     return CM_FAILURE;
5253 }
5254 
5255 //*-----------------------------------------------------------------------------
5256 //| Purpose: Clear threadgroupspace for kernel
5257 //| Returns: Result of the operation.
5258 //*-----------------------------------------------------------------------------
DeAssociateThreadGroupSpace(CmThreadGroupSpace * & threadGroupSpace)5259 int32_t CmKernelRT::DeAssociateThreadGroupSpace(CmThreadGroupSpace * &threadGroupSpace)
5260 {
5261     if (threadGroupSpace == nullptr)
5262     {
5263         CM_ASSERTMESSAGE("Error: Invalid null pointer.");
5264         return CM_NULL_POINTER;
5265     }
5266     if (m_threadGroupSpace != threadGroupSpace)
5267     {
5268         CM_ASSERTMESSAGE("Error: Invalid thread group space handle.");
5269         return CM_INVALID_ARG_VALUE;
5270     }
5271     m_threadGroupSpace = nullptr;
5272     m_dirty            = CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY;
5273 
5274     return CM_SUCCESS;
5275 }
5276 
5277 //*-----------------------------------------------------------------------------
5278 //| Purpose:    Indicate whether thread arg existed.
//| Returns:    True if a per-thread argument exists (m_perThreadArgExists).
5280 //*-----------------------------------------------------------------------------
IsThreadArgExisted()5281 bool CmKernelRT::IsThreadArgExisted()
5282 {
5283     return m_perThreadArgExists;
5284 }
5285 
5286 //*-----------------------------------------------------------------------------
5287 //| Purpose:    Get the size of SharedLocalMemory
//| Returns:    SLM size recorded in the kernel info (kernelSLMSize).
5289 //*-----------------------------------------------------------------------------
GetSLMSize()5290 uint32_t CmKernelRT::GetSLMSize()
5291 {
5292     return (uint32_t)m_kernelInfo->kernelSLMSize;
5293 }
5294 
5295 //*-----------------------------------------------------------------------------
5296 //| Purpose:    Get the spill size of the kernel from JIT
//| Returns:    Spill size reported by JIT; 0 if jitter is disabled or no JIT info exists.
5298 //*-----------------------------------------------------------------------------
GetSpillMemUsed()5299 uint32_t CmKernelRT::GetSpillMemUsed()
5300 {
5301     uint32_t spillSize;
5302 
5303     if (m_program->IsJitterEnabled() && m_kernelInfo->jitInfo != nullptr)
5304     {
5305         spillSize = (m_kernelInfo->jitInfo)->spillMemUsed;
5306     }
5307     else
5308     {
5309         // kernel uses "--nojitter" option, don't allocate scratch space
5310         spillSize = 0;
5311     }
5312 
5313     return spillSize;
5314 }
5315 
SearchAvailableIndirectSurfInfoTableEntry(uint16_t kind,uint32_t surfaceIndex,uint32_t bti)5316 int32_t CmKernelRT::SearchAvailableIndirectSurfInfoTableEntry(uint16_t kind, uint32_t surfaceIndex, uint32_t bti)
5317 {
5318     uint16_t i = 0;
5319     for ( i = 0; i < CM_MAX_STATIC_SURFACE_STATES_PER_BT; i++ )
5320     {
5321         if ( ( ( m_IndirectSurfaceInfoArray[ i ].surfaceIndex == surfaceIndex ) && ( m_IndirectSurfaceInfoArray[ i ].kind == kind ) && ( m_IndirectSurfaceInfoArray[ i ].bindingTableIndex == bti ) ) ||
5322             ( ( m_IndirectSurfaceInfoArray[ i ].surfaceIndex == 0 ) && ( m_IndirectSurfaceInfoArray[ i ].kind == 0 ) ) )
5323         {
5324             return i;
5325         }
5326     }
5327     // should never reach this
5328     CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5329     return CM_FAILURE;
5330 }
5331 
5332 //-----------------------------------------------------------------------------------------------------------------
5333 //! Set surface binding table index count for each indirect surface
5334 //! INPUT:
5335 //!     1) Surface format
5336 //!     2) Surface type.
5337 //! OUTPUT:
5338 //!     binding table index count
5339 //-----------------------------------------------------------------------------------------------------------------
SetSurfBTINumForIndirectData(CM_SURFACE_FORMAT format,CM_ENUM_CLASS_TYPE surfaceType)5340 int32_t CmKernelRT::SetSurfBTINumForIndirectData(CM_SURFACE_FORMAT format, CM_ENUM_CLASS_TYPE surfaceType)
5341 {
5342     if (surfaceType == CM_ENUM_CLASS_TYPE_CMBUFFER_RT)
5343     {
5344         return 1;
5345     }
5346     else
5347     {
5348         if ((format == CM_SURFACE_FORMAT_NV12) ||
5349             (format == CM_SURFACE_FORMAT_P010) ||
5350             (format == CM_SURFACE_FORMAT_P208) ||
5351             (format == CM_SURFACE_FORMAT_P016))
5352         {
5353             return 2;
5354         }
5355         else if (format == CM_SURFACE_FORMAT_422H ||
5356             format == CM_SURFACE_FORMAT_411P ||
5357             format == CM_SURFACE_FORMAT_IMC3 ||
5358             format == CM_SURFACE_FORMAT_422V ||
5359             format == CM_SURFACE_FORMAT_444P)
5360         {   // 3 planes surface
5361             return 3;
5362         }
5363         else
5364         {
5365             return 1;
5366         }
5367     }
5368     // should never reach this
5369     CM_ASSERTMESSAGE("Error: Set surface binding table index count failure.");
5370     return 0;
5371 }
5372 
//-----------------------------------------------------------------------------------------------------------------
//! Set surface binding table index by user.
//! If the application wants to assign a specific binding table index to a surface, it should call this function.
//! The assigned binding table index must be a valid value for a general surface (say >=1 and <=242),
//! otherwise this call will return failure.
//! INPUT:
//!     1) Surface whose binding table index needs to be set.
//!     2) Assigned binding table index.
//! OUTPUT:
//!     CM_SUCCESS
//!     CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX if the surface index is not a valid binding table index (valid: 1~242)
//!     CM_NULL_POINTER if the surface pointer is null or no surface is found for it
//!     CM_FAILURE otherwise
//-----------------------------------------------------------------------------------------------------------------
SetSurfaceBTI(SurfaceIndex * surface,uint32_t btIndex)5386 CM_RT_API int32_t CmKernelRT::SetSurfaceBTI(SurfaceIndex* surface, uint32_t btIndex)
5387 {
5388 
5389     uint32_t                    width, height, bytesPerPixel;
5390     CM_SURFACE_FORMAT           format = CM_SURFACE_FORMAT_INVALID;
5391     //Sanity check
5392     if (surface == nullptr)
5393     {
5394         CM_ASSERTMESSAGE("Error: Pointer to surface is null.");
5395         return CM_NULL_POINTER;
5396     }
5397     if (!m_surfaceMgr->IsValidSurfaceIndex(btIndex))
5398     {
5399         CM_ASSERTMESSAGE("Error: Invalid binding table index.");
5400         return CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX;
5401     }
5402 
5403     //Sanity check: if the BTI has been used once enqueue
5404     uint32_t i = 0;
5405     for (i = 0; i < m_usKernelPayloadSurfaceCount; i++)
5406     {
5407         if (m_IndirectSurfaceInfoArray[i].bindingTableIndex == (uint16_t)btIndex)
5408         {
5409             CM_ASSERTMESSAGE("Error: Binding table index has been used once enqueue.");
5410             return CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX;
5411         }
5412     }
5413 
5414     uint32_t index = surface->get_data();
5415     uint32_t handle = 0;
5416 
5417     CmSurface* surfaceRT = nullptr;
5418     m_surfaceMgr->GetSurface( index, surfaceRT );
5419     if(surfaceRT == nullptr)
5420     {
5421         CM_ASSERTMESSAGE("Error: Invalid surface.");
5422         return CM_NULL_POINTER;
5423     }
5424 
5425     CmSurface2DRT* surf2D = nullptr;
5426     uint32_t indirectSurfInfoEntry = 0;
5427     if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACE2D )
5428     {
5429         surf2D = static_cast< CmSurface2DRT* >( surfaceRT );
5430         surf2D->GetHandle( handle );
5431         indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_2D, handle, btIndex);
5432         if (indirectSurfInfoEntry == CM_FAILURE)
5433         {
5434             CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5435             return CM_FAILURE;
5436         }
5437         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_2D;
5438         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5439         surf2D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5440     }
5441     else
5442     {
5443         CmBuffer_RT* cmBuffer = nullptr;
5444         if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMBUFFER_RT )
5445         {
5446             cmBuffer = static_cast< CmBuffer_RT* >( surfaceRT );
5447             cmBuffer->GetHandle( handle );
5448             indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_1D, handle, btIndex);
5449             if (indirectSurfInfoEntry == CM_FAILURE)
5450             {
5451                 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5452                 return CM_FAILURE;
5453             }
5454             m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_1D;
5455             m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5456         }
5457         else
5458         {
5459             CmSurface2DUPRT* surf2DUP = nullptr;
5460             if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACE2DUP )
5461             {
5462                 surf2DUP = static_cast< CmSurface2DUPRT* >( surfaceRT );
5463                 surf2DUP->GetHandle( handle );
5464                 indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_2D_UP, handle, btIndex);
5465                 if (indirectSurfInfoEntry == CM_FAILURE)
5466                 {
5467                     CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5468                     return CM_FAILURE;
5469                 }
5470                 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_2D_UP;
5471                 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5472                 surf2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);
5473             }
5474             else
5475             {
5476                 CmSurfaceSampler* surfSampler = nullptr;
5477                 if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER )
5478                 {
5479                     surfSampler = static_cast< CmSurfaceSampler* >(surfaceRT);
5480 
5481                     //Get  actually SurfaceIndex ID for 2D
5482                     uint16_t surfIndexForCurrent = 0;
5483                     surfSampler->GetCmIndexCurrent(surfIndexForCurrent);
5484                     CmSurface* surfSampRT= nullptr;
5485                     m_surfaceMgr->GetSurface(surfIndexForCurrent, surfSampRT);
5486                     if(surfSampRT == nullptr)
5487                     {
5488                         CM_ASSERTMESSAGE("Error: Invalid surface.");
5489                         return CM_NULL_POINTER;
5490                     }
5491 
5492                     SAMPLER_SURFACE_TYPE surfaceType;
5493                     surfSampler->GetSurfaceType(surfaceType);
5494                     surfSampler->GetHandle( handle );
5495                     if ( surfaceType == SAMPLER_SURFACE_TYPE_2D )
5496                     {
5497                         CmSurface2DRT* surfSamp2D = nullptr;
5498                         surfSamp2D = static_cast<CmSurface2DRT*>(surfSampRT);
5499                         surfSamp2D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5500 
5501                         indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER, handle, btIndex);
5502                         if (indirectSurfInfoEntry == CM_FAILURE)
5503                         {
5504                             CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5505                             return CM_FAILURE;
5506                         }
5507                         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER;
5508                     }
5509                     else if ( surfaceType == SAMPLER_SURFACE_TYPE_2DUP )
5510                     {
5511                         CmSurface2DUPRT* surfSamp2DUP = nullptr;
5512                         surfSamp2DUP = static_cast<CmSurface2DUPRT*>(surfSampRT);
5513                         surfSamp2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);
5514 
5515                         indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE2DUP_SAMPLER, handle, btIndex);
5516                         if (indirectSurfInfoEntry == CM_FAILURE)
5517                         {
5518                             CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5519                             return CM_FAILURE;
5520                         }
5521                         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE2DUP_SAMPLER;
5522                     }
5523                     else if ( surfaceType == SAMPLER_SURFACE_TYPE_3D )
5524                     {
5525                         indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_3D, handle, btIndex);
5526                         if (indirectSurfInfoEntry == CM_FAILURE)
5527                         {
5528                             CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5529                             return CM_FAILURE;
5530                         }
5531                         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_3D;
5532                     }
5533                     m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5534                 }
5535                 else
5536                 {
5537                     CmSurfaceSampler8x8* surfSampler8x8 = nullptr;
5538                     if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8 )
5539                     {
5540                         surfSampler8x8 = static_cast< CmSurfaceSampler8x8* >( surfaceRT );
5541                         surfSampler8x8->GetIndexCurrent( handle );
5542 
5543                         //Get  actually SurfaceIndex ID for 2D
5544                         uint16_t surfIndexForCurrent = 0;
5545                         surfSampler8x8->GetCmIndex(surfIndexForCurrent);
5546                         CmSurface* surfSamp8x8RT = nullptr;
5547                         m_surfaceMgr->GetSurface(surfIndexForCurrent, surfSamp8x8RT);
5548                         if(surfSamp8x8RT == nullptr)
5549                         {
5550                             CM_ASSERTMESSAGE("Error: Invalid surface.");
5551                             return CM_NULL_POINTER;
5552                         }
5553 
5554                         CmSurface2DRT* surfSamp8x82D = nullptr;
5555                         surfSamp8x82D = static_cast<CmSurface2DRT*>(surfSamp8x8RT);
5556                         surfSamp8x82D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5557 
5558                         if ( surfSampler8x8->GetSampler8x8SurfaceType() == CM_AVS_SURFACE )
5559                         {
5560                             indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER8X8_AVS, handle, btIndex);
5561                             if (indirectSurfInfoEntry == CM_FAILURE)
5562                             {
5563                                 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5564                                 return CM_FAILURE;
5565                             }
5566                             m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
5567                         }
5568                         else if ( surfSampler8x8->GetSampler8x8SurfaceType() == CM_VA_SURFACE )
5569                         {
5570                             indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER8X8_VA, handle, btIndex);
5571                             if (indirectSurfInfoEntry == CM_FAILURE)
5572                             {
5573                                 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5574                                 return CM_FAILURE;
5575                             }
5576                             m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
5577                         }
5578                         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5579                     }
5580                     else
5581                     {
5582                             return CM_FAILURE;
5583                     }
5584                 }
5585             }
5586         }
5587     }
5588 
5589     m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].bindingTableIndex = (uint16_t)btIndex;
5590     if (SetSurfBTINumForIndirectData(format, surfaceRT->Type())== 0)
5591     {
5592         CM_ASSERTMESSAGE("Error: Set surface binding table index count failure.");
5593         return CM_FAILURE;
5594     }
5595     m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].numBTIPerSurf = (uint16_t)SetSurfBTINumForIndirectData(format, surfaceRT->Type());
5596 
5597     //Copy it to surface index array
5598 
5599     m_pKernelPayloadSurfaceArray[indirectSurfInfoEntry] = surface;
5600 
5601 
5602     // count is actally one larger than the actual index
5603     m_usKernelPayloadSurfaceCount = indirectSurfInfoEntry + 1;
5604     m_dirty |= (CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY | CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY);
5605     return CM_SUCCESS;
5606 }
5607 
GetKernelIndex()5608 uint32_t CmKernelRT::GetKernelIndex()
5609 {
5610     return m_kernelIndex;
5611 }
GetKernelGenxBinarySize(void)5612 uint32_t CmKernelRT::GetKernelGenxBinarySize(void)
5613 {
5614     if(m_kernelInfo == nullptr)
5615     {
5616         CM_ASSERTMESSAGE("Error: Invalid kernel genx binary size.");
5617         return 0;
5618     }
5619     else
5620     {
5621         return m_kernelInfo->genxBinarySize;
5622     }
5623 }
5624 
5625 //-----------------------------------------------------------------------------------------------------------------
5626 //! Map Surface type to Kernel arg Kind.
5627 //! INPUT:  Surface type    :CM_ENUM_CLASS_TYPE
5628 //! OUTPUT: Kernel arg Kind :CM_ARG_KIND
5629 //-----------------------------------------------------------------------------------------------------------------
SurfTypeToArgKind(CM_ENUM_CLASS_TYPE surfType)5630 CM_ARG_KIND CmKernelRT::SurfTypeToArgKind(CM_ENUM_CLASS_TYPE surfType)
5631 {
5632     switch(surfType)
5633     {
5634         case CM_ENUM_CLASS_TYPE_CMBUFFER_RT          :return ARG_KIND_SURFACE_1D;
5635         case CM_ENUM_CLASS_TYPE_CMSURFACE2D          :return ARG_KIND_SURFACE_2D;
5636         case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP        :return ARG_KIND_SURFACE_2D_UP;
5637         case CM_ENUM_CLASS_TYPE_CMSURFACE3D          :return ARG_KIND_SURFACE_3D;
5638         case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER     :return ARG_KIND_SURFACE_SAMPLER;
5639         case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8  :return ARG_KIND_SURFACE_SAMPLER8X8_AVS;
5640         case CM_ENUM_CLASS_TYPE_CMSURFACEVME         :return ARG_KIND_SURFACE_VME;
5641         case CM_ENUM_CLASS_TYPE_CMSAMPLER_RT         :return ARG_KIND_SAMPLER;
5642         case CM_ENUM_CLASS_TYPE_CMSAMPLER8X8STATE_RT :return ARG_KIND_SAMPLER;
5643         case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER      :return ARG_KIND_STATE_BUFFER;
5644 
5645         default:
5646             CM_ASSERTMESSAGE("Error: Invalid surface type.");
5647             break;
5648    }
5649    return ARG_KIND_GENERAL;
5650 }
5651 
CalculateKernelSurfacesNum(uint32_t & kernelSurfaceNum,uint32_t & neededBTEntryNum)5652 int32_t CmKernelRT::CalculateKernelSurfacesNum(uint32_t& kernelSurfaceNum, uint32_t& neededBTEntryNum)
5653 {
5654     uint32_t            surfaceArraySize = 0;
5655     CmSurface*          surf = nullptr;
5656     CmSurface2DRT*        surf2D = nullptr;
5657     CmSurface2DUPRT*      surf2DUP = nullptr;
5658     uint32_t              width, height, bytesPerPixel;
5659     CM_SURFACE_FORMAT     format;
5660     uint32_t              maxBTIndex = 0;
5661 
5662     kernelSurfaceNum = 0;
5663     neededBTEntryNum = 0;
5664 
5665     surfaceArraySize = m_surfaceMgr->GetSurfacePoolSize();
5666 
5667     //Calculate surface number and needed binding table entries
5668     for (uint32_t surfIndex = 0; surfIndex <= m_maxSurfaceIndexAllocated; surfIndex ++)
5669     {
5670         if (m_surfaceArray[surfIndex%surfaceArraySize])
5671         {
5672             surf = nullptr;
5673             m_surfaceMgr->GetSurface(surfIndex, surf);
5674             if (surf)
5675             {
5676                 switch(surf->Type())
5677                 {
5678                     case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
5679                     case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
5680                         kernelSurfaceNum ++;
5681                         neededBTEntryNum ++;
5682                         break;
5683 
5684                     case CM_ENUM_CLASS_TYPE_CMSURFACEVME:
5685                     case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
5686                     case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
5687                         //virtual surface, no need increase count
5688                         break;
5689 
5690                     case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
5691                         kernelSurfaceNum++;
5692                         surf2D = static_cast<CmSurface2DRT*>(surf);
5693                         format = CM_SURFACE_FORMAT_INVALID;
5694                         surf2D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5695                         if ((format == CM_SURFACE_FORMAT_NV12) ||
5696                             (format == CM_SURFACE_FORMAT_P010) ||
5697                             (format == CM_SURFACE_FORMAT_P208) ||
5698                             (format == CM_SURFACE_FORMAT_P016))
5699                         {
5700                             neededBTEntryNum += 2;
5701                         }
5702                         else if (format == CM_SURFACE_FORMAT_422H ||
5703                             format == CM_SURFACE_FORMAT_411P ||
5704                             format == CM_SURFACE_FORMAT_IMC3 ||
5705                             format == CM_SURFACE_FORMAT_422V ||
5706                             format == CM_SURFACE_FORMAT_444P)
5707                         {   // 3 planes surface
5708                             neededBTEntryNum += 3;
5709                         }
5710                         else
5711                         {
5712                             neededBTEntryNum += 1;
5713                         }
5714                         break;
5715 
5716                     case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
5717                         kernelSurfaceNum++;
5718                         surf2DUP = static_cast<CmSurface2DUPRT*>(surf);
5719                         format = CM_SURFACE_FORMAT_INVALID;
5720                         surf2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);
5721                         if ((format == CM_SURFACE_FORMAT_NV12) ||
5722                             (format == CM_SURFACE_FORMAT_P010) ||
5723                             (format == CM_SURFACE_FORMAT_P208) ||
5724                             (format == CM_SURFACE_FORMAT_P016))
5725                         {
5726                             neededBTEntryNum += 2;
5727                         }
5728                         else if (format == CM_SURFACE_FORMAT_422H ||
5729                             format == CM_SURFACE_FORMAT_411P ||
5730                             format == CM_SURFACE_FORMAT_IMC3 ||
5731                             format == CM_SURFACE_FORMAT_422V ||
5732                             format == CM_SURFACE_FORMAT_444P)
5733                         {   // 3 planes surface
5734                             neededBTEntryNum += 3;
5735                         }
5736                         else
5737                         {
5738                             neededBTEntryNum += 1;
5739                         }
5740                         break;
5741 
5742                     default:
5743                         break;
5744                 }
5745             }
5746         }
5747     }
5748 
5749     if ((maxBTIndex + 1) > neededBTEntryNum)
5750     {
5751         neededBTEntryNum = maxBTIndex + 1;
5752     }
5753 
5754     //Wordaround: the calculation maybe not accurate if the VME surfaces are existed
5755     neededBTEntryNum += m_vmeSurfaceCount;
5756 
5757     return CM_SUCCESS;
5758 }
5759 
5760 //*-----------------------------------------------------------------------------
5761 //| Purpose:    Get aligned curbe size for different platforms
5762 //| Returns:    Result of operation.
5763 //*-----------------------------------------------------------------------------
GetAlignedCurbeSize(uint32_t value)5764 uint32_t CmKernelRT::GetAlignedCurbeSize(uint32_t value)
5765 {
5766     uint32_t curbeAlignedSize    = 0;
5767 
5768     curbeAlignedSize = MOS_ALIGN_CEIL(value, RENDERHAL_CURBE_BLOCK_ALIGN);
5769     return curbeAlignedSize;
5770 }
5771 
5772 #if CM_LOG_ON
Log()5773 std::string CmKernelRT::Log()
5774 {
5775 
5776     std::ostringstream  oss;
5777 
5778     oss << " Kernel Name:"         << m_kernelInfo->kernelName << std::endl
5779         << " Kernel Binary Size:"  << m_kernelInfo->jitBinarySize
5780         << " Index In Task:"       << m_indexInTask
5781         << " Thread Count:"        << m_threadCount
5782         << " Curbe Size:"          << m_sizeInCurbe
5783         << " Kernel arg Count:"    << m_argCount
5784         << std::endl;
5785 
5786      // Per Kernel Thread Space Log
5787     if(m_threadSpace)
5788     {
5789         oss << m_threadSpace->Log();
5790     }
5791 
5792     // Per Kernel Thread Group Space Log
5793     if(m_threadGroupSpace)
5794     {
5795         oss << m_threadGroupSpace->Log();
5796     }
5797 
5798     // Arguments Log
5799     for (uint32_t argIndex= 0; argIndex< m_argCount; argIndex++ )
5800     {
5801         if (m_args[argIndex].value) // filter out the implicit arguments
5802         {
5803             ArgLog(oss, argIndex, m_args[argIndex]);
5804         }
5805     }
5806 
5807     return oss.str();
5808 }
5809 
ArgLog(std::ostringstream & oss,uint32_t index,CM_ARG arg)5810 void CmKernelRT::ArgLog(std::ostringstream &oss, uint32_t index, CM_ARG arg)
5811 {
5812 
5813     oss << "[" << index << "] th Argument"
5814         << " Type :" << arg.unitKind
5815         << " Count:" << arg.unitCount
5816         << " Size:" << arg.unitSize
5817         << " Surface Kind:" << (int)arg.surfaceKind
5818         << " OffsetInPayload:" << arg.unitOffsetInPayload
5819         << " OffsetInPayloadOrig:" << arg.unitOffsetInPayloadOrig << "";
5820 
5821     CmLogger::LogDataArrayHex( oss, arg.value, arg.unitSize * arg.unitCount);
5822 
5823     if (CHECK_SURFACE_TYPE(arg.unitKind,
5824                            ARG_KIND_SURFACE_1D,
5825                            ARG_KIND_SURFACE_2D,
5826                            ARG_KIND_SURFACE_2D_UP,
5827                            ARG_KIND_SURFACE_VME,
5828                            ARG_KIND_SURFACE_SAMPLER,
5829                            ARG_KIND_SURFACE_3D,
5830                            ARG_KIND_SURFACE_SAMPLER8X8_AVS,
5831                            ARG_KIND_SURFACE_SAMPLER8X8_VA,
5832                            ARG_KIND_SURFACE2DUP_SAMPLER))
5833     {
5834         uint16_t numSurfaces = arg.unitSize / sizeof(uint32_t);
5835         if (arg.unitKind == ARG_KIND_SURFACE_VME)
5836         {
5837             numSurfaces = (arg.unitSize - sizeof(CM_HAL_VME_ARG_VALUE) * arg.unitVmeArraySize) / sizeof(uint32_t) + arg.unitVmeArraySize;
5838         }
5839         for (uint16_t i = 0; i < numSurfaces; i++)
5840         {
5841             uint32_t surfaceIndex = *(uint16_t *)(arg.surfIndex + i);
5842 
5843             if(surfaceIndex == CM_NULL_SURFACE)
5844                 continue;
5845 
5846             CmSurface *surf = nullptr;
5847             m_surfaceMgr->GetSurface(surfaceIndex, surf);
5848             if (surf == nullptr)
5849             {
5850                 continue;
5851             }
5852             surf->Log(oss);
5853         }
5854     }
5855 }
5856 #endif
5857 
SurfaceDump(uint32_t kernelNumber,int32_t taskId)5858 void CmKernelRT::SurfaceDump(uint32_t kernelNumber, int32_t taskId)
5859 {
5860 #if MDF_SURFACE_CONTENT_DUMP
5861     CM_ARG arg;
5862 
5863     for (uint32_t argIndex = 0; argIndex< m_argCount; argIndex++)
5864     {
5865         arg = m_args[argIndex];
5866         if (CHECK_SURFACE_TYPE(arg.unitKind,
5867             ARG_KIND_SURFACE_1D,
5868             ARG_KIND_SURFACE_2D,
5869             ARG_KIND_SURFACE_2D_UP,
5870             ARG_KIND_SURFACE_VME,
5871             ARG_KIND_SURFACE_SAMPLER,
5872             ARG_KIND_SURFACE_3D,
5873             ARG_KIND_SURFACE_SAMPLER8X8_AVS,
5874             ARG_KIND_SURFACE_SAMPLER8X8_VA,
5875             ARG_KIND_SURFACE2DUP_SAMPLER))
5876         {
5877             uint16_t numSurfaces = arg.unitSize / sizeof(uint32_t);
5878             if (arg.unitKind == ARG_KIND_SURFACE_VME)
5879             {
5880                 numSurfaces = (arg.unitSize - sizeof(CM_HAL_VME_ARG_VALUE) * arg.unitVmeArraySize) / sizeof(uint32_t) + arg.unitVmeArraySize;
5881             }
5882 
5883             for (uint16_t i = 0; i < numSurfaces; i++)
5884             {
5885                 uint32_t surfaceIndex = *(uint16_t *)(arg.surfIndex + i);
5886                 CmSurface *surf = nullptr;
5887                 m_surfaceMgr->GetSurface(surfaceIndex, surf);
5888                 if (surf == nullptr)
5889                 {
5890                     return;
5891                 }
5892                 surf->DumpContent(kernelNumber, m_kernelInfo->kernelName, taskId, argIndex, i);
5893             }
5894         }
5895     }
5896 #endif
5897 }
5898 
SetSamplerBTI(SamplerIndex * sampler,uint32_t nIndex)5899 CM_RT_API int32_t CmKernelRT::SetSamplerBTI(SamplerIndex* sampler, uint32_t nIndex)
5900 {
5901     if (!sampler)
5902     {
5903         return CM_NULL_POINTER;
5904     }
5905     if (CM_SAMPLER_MAX_BINDING_INDEX < nIndex)
5906     {
5907         return CM_KERNELPAYLOAD_SAMPLER_INVALID_BTINDEX;
5908     }
5909 
5910     uint32_t        samplerIndex   = sampler->get_data();
5911     PCM_HAL_STATE   cmHalState    = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
5912 
5913     uint32_t i = 0;
5914     for (i = 0; i < m_samplerBtiCount; i++)
5915     {
5916         if ((m_samplerBtiEntry[i].samplerIndex == samplerIndex) && (m_samplerBtiEntry[i].samplerBTI == nIndex))
5917         {
5918             break;
5919         }
5920         if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
5921         {
5922             if ((m_samplerBtiEntry[i].samplerIndex != samplerIndex) && (m_samplerBtiEntry[i].samplerBTI == nIndex))
5923             {
5924                 if (cmHalState->useNewSamplerHeap)
5925                 {
5926                     SamplerParam sampler1 = {};
5927                     SamplerParam sampler2 = {};
5928                     cmHalState->cmHalInterface->GetSamplerParamInfoForSamplerType(&cmHalState->samplerTable[m_samplerBtiEntry[i].samplerIndex], sampler1);
5929                     cmHalState->cmHalInterface->GetSamplerParamInfoForSamplerType(&cmHalState->samplerTable[samplerIndex], sampler2);
5930 
5931                     if (sampler1.elementType== sampler2.elementType)
5932                     {
5933                         // return failure only if the two samplers have the same type, because different type samplers are able to set to the same BTI
5934                         return CM_FAILURE;
5935                     }
5936                 }
5937                 else
5938                 {
5939                     return CM_FAILURE;
5940                 }
5941             }
5942 
5943             CmSampler8x8State_RT *sampler8x8 = nullptr;
5944             CmSampler8x8State_RT *tmpSampler8x8 = nullptr;
5945             m_device->GetSampler8x8(samplerIndex, sampler8x8);
5946             m_device->GetSampler8x8(m_samplerBtiEntry[i].samplerIndex, tmpSampler8x8);
5947 
5948             if (sampler8x8 && tmpSampler8x8 && (sampler8x8->GetStateType() == CM_SAMPLER8X8_AVS)
5949                 && (tmpSampler8x8->GetStateType() == CM_SAMPLER8X8_AVS) &&
5950                 cmHalState->cmHalInterface->IsAdjacentSamplerIndexRequiredbyHw())
5951             {
5952                 if ((m_samplerBtiEntry[i].samplerIndex != samplerIndex) &&
5953                     ((m_samplerBtiEntry[i].samplerBTI == nIndex + 1) || (m_samplerBtiEntry[i].samplerBTI == nIndex - 1)))
5954                     return CM_FAILURE;
5955             }
5956         }
5957     }
5958 
5959     if (i >= CM_MAX_SAMPLER_TABLE_SIZE)
5960     {
5961         CM_ASSERTMESSAGE("Error: Exceed maximum sampler table size.");
5962         return CM_FAILURE;
5963     }
5964 
5965     if (i == m_samplerBtiCount)
5966     {
5967         m_samplerBtiEntry[i].samplerIndex = samplerIndex;
5968         m_samplerBtiEntry[i].samplerBTI = nIndex;
5969 
5970         m_samplerBtiCount = i + 1;
5971 
5972         m_dirty |= cMKERNELDATASAMPLERBTIDIRTY;
5973     }
5974     return CM_SUCCESS;
5975 }
5976 
GetBinary(std::vector<char> & binary)5977 CMRT_UMD_API int32_t CmKernelRT::GetBinary(std::vector<char>& binary)
5978 {
5979     binary.resize(m_binarySize);
5980 
5981     CmSafeMemCopy((void *)&binary[0], (void *)m_binary, m_binarySize);
5982 
5983     return CM_SUCCESS;
5984 }
5985 
ReplaceBinary(std::vector<char> & binary)5986 CMRT_UMD_API int32_t CmKernelRT::ReplaceBinary(std::vector<char>& binary)
5987 {
5988     uint32_t size = binary.size();
5989 
5990     if (size == 0)
5991     {
5992         return CM_INVALID_ARG_VALUE;
5993     }
5994 
5995     if(m_binaryOrig == nullptr)
5996     {
5997         //Store the orignal binary once.
5998         m_binaryOrig = m_binary;
5999         m_binarySizeOrig = m_binarySize;
6000     }
6001 
6002     m_binary = MOS_NewArray(char, size);
6003     CmSafeMemCopy((void *)m_binary, (void *)&binary[0], size);
6004 
6005     m_binarySize = size;
6006 
6007     return CM_SUCCESS;
6008 }
6009 
ResetBinary()6010 CMRT_UMD_API int32_t CmKernelRT::ResetBinary()
6011 {
6012     if (m_binaryOrig == nullptr)
6013     {
6014         //ReplaceBinary is never called
6015         return CM_SUCCESS;
6016     }
6017     if(m_binary!= m_binaryOrig)
6018     {
6019         MosSafeDeleteArray(m_binary);
6020     }
6021     m_binary = m_binaryOrig;
6022     m_binarySize = m_binarySizeOrig;
6023 
6024     return CM_SUCCESS;
6025 }
6026 
UpdateSamplerHeap(CmKernelData * kernelData)6027 int CmKernelRT::UpdateSamplerHeap(CmKernelData *kernelData)
6028 {
6029     // Get sampler bti & offset
6030     PCM_HAL_KERNEL_PARAM cmKernel = nullptr;
6031     PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
6032     PCM_HAL_STATE state = cmData->cmHalState;
6033     std::list<SamplerParam>::iterator iter;
6034     unsigned int heapOffset = 0;
6035 
6036     if (state->useNewSamplerHeap == false)
6037     {
6038         return CM_SUCCESS;
6039     }
6040 
6041     heapOffset = 0;
6042     cmKernel = kernelData->GetHalCmKernelData();
6043     std::list<SamplerParam> *sampler_heap = cmKernel->samplerHeap;
6044 
6045     // First pass, inserts sampler with user-defined BTI to the list. Sorts by element order low to high, then by BTI order low to high.
6046     for (unsigned int samplerElementType = MHW_Sampler1Element; samplerElementType < MHW_SamplerTotalElements; samplerElementType++)
6047     {
6048         for (unsigned int n = 0; n < cmKernel->samplerBTIParam.samplerCount; ++n)
6049         {
6050             SamplerParam sampler = {};
6051             sampler.samplerTableIndex = cmKernel->samplerBTIParam.samplerInfo[n].samplerIndex;
6052 
6053             if (state->samplerTable[sampler.samplerTableIndex].ElementType == samplerElementType)
6054             {
6055                 sampler.bti = cmKernel->samplerBTIParam.samplerInfo[n].samplerBTI;
6056                 sampler.userDefinedBti = true;
6057                 state->cmHalInterface->GetSamplerParamInfoForSamplerType(&state->samplerTable[sampler.samplerTableIndex], sampler);
6058 
6059                 // Guarantees each user-defined BTI has a spacing between each other user-defined BTIs larger than the stepping
6060                 for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
6061                 {
6062                     if (iter->elementType == sampler.elementType)
6063                     {
6064                         unsigned int diff = (iter->bti > sampler.bti) ? (iter->bti - sampler.bti) : (sampler.bti - iter->bti);
6065                         if (diff < sampler.btiStepping)
6066                         {
6067                             CM_ASSERTMESSAGE("Sampler BTI setting error. Confliction with other Sampler BTI.\n");
6068                             return MOS_STATUS_INVALID_PARAMETER;
6069                         }
6070                     }
6071                 }
6072 
6073                 // Inserts by the order
6074                 for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
6075                 {
6076                     if (iter->elementType > sampler.elementType)
6077                     {
6078                         break;
6079                     }
6080                     else if ((iter->elementType == sampler.elementType) && (iter->bti > sampler.bti))
6081                     {
6082                         break;
6083                     }
6084                 }
6085                 sampler.heapOffset = sampler.bti * sampler.btiMultiplier;
6086                 sampler_heap->insert(iter, sampler);
6087             }
6088         }
6089     }
6090 
6091     // Second pass, loops over all kernel/thread args, find regular sampler and insert to sampler heap.
6092     // Follows the existing sorted order.
6093     for (unsigned int samplerElementType = MHW_Sampler1Element; samplerElementType < MHW_SamplerTotalElements; samplerElementType++)
6094     {
6095         for (unsigned int index = 0; index < cmKernel->numArgs; index++)
6096         {
6097             PCM_HAL_KERNEL_ARG_PARAM argParam = &cmKernel->argParams[index];
6098             if (argParam->isNull)
6099             {
6100                 continue;
6101             }
6102 
6103             for (unsigned int threadIndex = 0; threadIndex < argParam->unitCount; threadIndex++)
6104             {
6105                 if (argParam->kind == CM_ARGUMENT_SAMPLER)
6106                 {
6107                     unsigned char *arg = argParam->firstValue + (threadIndex * argParam->unitSize);
6108                     unsigned int samplerTableIndex = *((uint32_t *)arg);
6109 
6110                     SamplerParam sampler = {};
6111                     sampler.samplerTableIndex = samplerTableIndex;
6112                     state->cmHalInterface->GetSamplerParamInfoForSamplerType(&state->samplerTable[sampler.samplerTableIndex], sampler);
6113                     sampler.regularBti = true;
6114 
6115                     if (sampler.elementType != samplerElementType)
6116                     {
6117                         continue;
6118                     }
6119 
6120                     // if the sampler is already in the heap, skip
6121                     bool isDuplicate = false;
6122                     for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
6123                     {
6124                         if (iter->samplerTableIndex == sampler.samplerTableIndex)
6125                         {
6126                             isDuplicate = true;
6127                             iter->regularBti = true;
6128                             break;
6129                         }
6130                     }
6131                     if (isDuplicate == true)
6132                     {
6133                         continue;
6134                     }
6135 
6136                     // insert the new sampler to the heap
6137                     heapOffset = 0;
6138                     for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
6139                     {
6140                         if (iter->elementType == sampler.elementType)
6141                         {
6142                             // Needs to keep the inserted sampler's correctness, so do not insert before same element regular sampler
6143                             // Only insert before user-defined BTI
6144                             if (iter->userDefinedBti == true)
6145                             {
6146                                 unsigned int curOffset = iter->heapOffset;
6147                                 if (heapOffset > curOffset)
6148                                 {
6149                                     // Confliction, which means that sampler heap in smaller
6150                                     // element type has excced the position which is supposed
6151                                     // to put this user-defined BTI sampler.
6152                                     // User needs to set the BTI to a larger value.
6153                                     CM_ASSERTMESSAGE("Sampler BTI setting error. Confliction with other Sampler BTI.\n");
6154                                     return MOS_STATUS_INVALID_PARAMETER;
6155                                 }
6156                                 else
6157                                 {
6158                                     if (curOffset - heapOffset >= sampler.btiStepping * sampler.btiMultiplier)
6159                                     {
6160                                         break;
6161                                     }
6162                                     else
6163                                     {
6164                                         heapOffset = curOffset + iter->btiStepping * iter->btiMultiplier;
6165                                     }
6166                                 }
6167                             }
6168                             else
6169                             {
6170                                 heapOffset += iter->btiStepping * iter->btiMultiplier;
6171                             }
6172                         }
6173                         else if (iter->elementType > sampler.elementType)
6174                         {
6175                             break;
6176                         }
6177                         else
6178                         {
6179                             heapOffset = iter->heapOffset + iter->size;
6180                             std::list<SamplerParam>::iterator iter_next = std::next(iter, 1);
6181                             if ((iter_next != sampler_heap->end()) && (iter_next->elementType > iter->elementType))
6182                             {
6183                                 // Aligns heapOffset to next nearest multiple of sampler size if next sampler is a different element type
6184                                 heapOffset = (heapOffset + iter_next->btiStepping * iter_next->btiMultiplier - 1) / (iter_next->btiStepping * iter_next->btiMultiplier) * (iter_next->btiStepping * iter_next->btiMultiplier);
6185                             }
6186                         }
6187                     }
6188 
6189                     if (iter == sampler_heap->end())
6190                     {
6191                         // Aligns heapOffset to next nearest multiple of sampler size if next sampler is a different element type
6192                         heapOffset = (heapOffset + sampler.btiStepping * sampler.btiMultiplier - 1) / (sampler.btiStepping * sampler.btiMultiplier) * (sampler.btiStepping * sampler.btiMultiplier);
6193                     }
6194                     sampler.heapOffset = heapOffset;
6195 
6196                     if (sampler.btiMultiplier != 0)
6197                     {
6198                         sampler.bti = sampler.heapOffset / sampler.btiMultiplier;
6199                     }
6200                     else
6201                     {
6202                         CM_ASSERTMESSAGE("Sampler BTI setting error. Multiplier cannot be zero!\n");
6203                         return MOS_STATUS_INVALID_PARAMETER;
6204                     }
6205                     sampler_heap->insert(iter, sampler);
6206                 }
6207             }
6208         }
6209     }
6210 
6211     return CM_SUCCESS;
6212 }
6213 }
6214