1 /*
2 * Copyright (c) 2018-2019, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 //!
23 //! \file      cm_kernel_ex.cpp
24 //! \brief     Contains Class CmKernelEx definitions
25 //!
26 
27 #include "cm_kernel_ex.h"
28 #include "cm_surface.h"
29 #include "cm_surface_manager.h"
30 #include "cm_surface_sampler8x8.h"
31 #include "cm_surface_sampler.h"
32 #include "cm_mem.h"
33 #include "cm_surface_2d_rt.h"
34 #include "cm_surface_2d_up_rt.h"
35 #include "cm_surface_3d_rt.h"
36 #include "cm_buffer_rt.h"
37 #include "cm_device_rt.h"
38 #include "cm_hal.h"
39 #include "cm_surface_state.h"
40 #include "cm_surface_state_manager.h"
41 #include "cm_surface_vme.h"
42 #include "cm_ssh.h"
43 #include "cm_thread_space_rt.h"
44 #include "cm_surface_sampler.h"
45 #include "cm_media_state.h"
46 
47 #include "mhw_state_heap.h"
48 
49 using namespace CMRT_UMD;
50 
~CmKernelEx()51 CmKernelEx::~CmKernelEx()
52 {
53     if (m_dummyThreadSpace)
54     {
55         m_device->DestroyThreadSpace(m_dummyThreadSpace);
56     }
57     if (m_dummyThreadGroupSpace)
58     {
59         m_device->DestroyThreadGroupSpace(m_dummyThreadGroupSpace);
60     }
61     MOS_DeleteArray(m_indexMap);
62     MOS_DeleteArray(m_flatArgs);
63     MOS_DeleteArray(m_propertyIndexes);
64     MOS_DeleteArray(m_cmSurfIndexes);
65     MOS_DeleteArray(m_data);
66     MOS_DeleteArray(m_surfaceInArg);
67     MOS_DeleteArray(m_curbe);
68 }
69 
Initialize(const char * kernelName,const char * options)70 int32_t CmKernelEx::Initialize(const char *kernelName, const char *options)
71 {
72     int ret = CmKernelRT::Initialize(kernelName, options);
73     if (ret != CM_SUCCESS)
74     {
75         return ret;
76     }
77 
78     m_indexMap = MOS_NewArray(uint32_t, (m_argCount+1));
79     CM_CHK_NULL_RETURN_CMERROR(m_indexMap);
80     MOS_ZeroMemory(m_indexMap, (m_argCount+1)*sizeof(uint32_t));
81     m_flatArgCount= 0;
82     bool isGpgpuKernel = false;
83     uint32_t minPayload = 0;
84     for (uint32_t i = 0; i < m_argCount; i++)
85     {
86         if (ArgArraySupported(m_args[i].unitKind))
87         {
88             int numSurfaces = m_args[i].unitSize/sizeof(int);
89             m_flatArgCount += numSurfaces;
90         }
91         else
92         {
93             ++m_flatArgCount;
94         }
95 
96         if (!isGpgpuKernel &&
97             ( m_args[i].unitKind == CM_ARGUMENT_IMPLICT_LOCALSIZE
98             ||m_args[i].unitKind == CM_ARGUMENT_IMPLICT_GROUPSIZE
99             ||m_args[i].unitKind == CM_ARGUMENT_IMPLICIT_LOCALID))
100         {
101             isGpgpuKernel = true;
102         }
103         if (i == 0 || (m_args[i].unitKind != CM_ARGUMENT_IMPLICIT_LOCALID && minPayload > m_args[i].unitOffsetInPayload))
104         {
105             minPayload = m_args[i].unitOffsetInPayload;
106         }
107     }
108 
109     if (!isGpgpuKernel)
110     {
111         minPayload = CM_PAYLOAD_OFFSET;
112     }
113 
114     if (m_flatArgCount == 0)
115     {
116         return CM_SUCCESS;
117     }
118 
119     m_flatArgs = MOS_NewArray(_CmArg, m_flatArgCount);
120     CM_CHK_NULL_RETURN_CMERROR(m_flatArgs);
121     MOS_ZeroMemory(m_flatArgs, m_flatArgCount * sizeof(_CmArg));
122     m_propertyIndexes = MOS_NewArray(uint8_t, m_flatArgCount);
123     CM_CHK_NULL_RETURN_CMERROR(m_propertyIndexes);
124     MOS_ZeroMemory(m_propertyIndexes, m_flatArgCount);
125     m_cmSurfIndexes = MOS_NewArray(uint32_t, m_flatArgCount);
126     CM_CHK_NULL_RETURN_CMERROR(m_cmSurfIndexes);
127     MOS_ZeroMemory(m_cmSurfIndexes, m_flatArgCount * sizeof(uint32_t));
128 
129     int j = 0;
130     uint32_t offset = 0; //offset in the local buffer
131     int localIDIndex = -1;
132     for (uint32_t i = 0; i < m_argCount; i++)
133     {
134         if (ArgArraySupported(m_args[i].unitKind))
135         {
136             m_indexMap[i] = j;
137             int numSurfaces = m_args[i].unitSize/sizeof(int);
138             for (int k = 0; k < numSurfaces; k ++)
139             {
140                 m_flatArgs[j].isaKind = m_args[i].unitKind;
141                 m_flatArgs[j].kind = m_args[i].unitKind;
142                 m_flatArgs[j].unitSize = sizeof(void *); // we can either store the pointer to CmSurfaceState or pointer to mos_resource here
143                 m_flatArgs[j].payloadOffset = m_args[i].unitOffsetInPayload + k*4 - minPayload; //each bte index has 4 bytes
144                 m_flatArgs[j].offset = offset;
145                 m_flatArgs[j].sizeInCurbe = 4;
146                 offset += m_flatArgs[j].unitSize;
147 
148                 // update curbe size
149                 if (m_explicitCurbeSize < (uint32_t)(m_flatArgs[j].payloadOffset + m_flatArgs[j].sizeInCurbe))
150                 {
151                     m_explicitCurbeSize = m_flatArgs[j].payloadOffset + m_flatArgs[j].sizeInCurbe;
152                 }
153                 ++ j;
154             }
155         }
156         else
157         {
158             m_indexMap[i] = j;
159             m_flatArgs[j].isaKind = m_args[i].unitKind;
160             m_flatArgs[j].kind = m_args[i].unitKind;
161             m_flatArgs[j].unitSize = m_args[i].unitSize;
162             m_flatArgs[j].payloadOffset = m_args[i].unitOffsetInPayload - minPayload;
163             m_flatArgs[j].offset = offset;
164             m_flatArgs[j].sizeInCurbe = m_flatArgs[j].unitSize;
165             offset += m_flatArgs[j].unitSize;
166 
167             // update curbe size
168             if (m_args[i].unitKind == CM_ARGUMENT_IMPLICIT_LOCALID)
169             {
170                 localIDIndex = j;
171             }
172             else
173             {
174                 if (m_explicitCurbeSize < (uint32_t)(m_flatArgs[j].payloadOffset + m_flatArgs[j].sizeInCurbe))
175                 {
176                     m_explicitCurbeSize = m_flatArgs[j].payloadOffset + m_flatArgs[j].sizeInCurbe;
177                 }
178             }
179             ++ j;
180         }
181         m_indexMap[m_argCount] = j;
182     }
183 
184     // adjust the payload of local id
185     if (localIDIndex >= 0)
186     {
187         m_flatArgs[localIDIndex].payloadOffset = MOS_ALIGN_CEIL(m_explicitCurbeSize, 32);
188     }
189 
190     m_data = MOS_NewArray(uint8_t, offset);
191     CM_CHK_NULL_RETURN_CMERROR(m_data);
192     m_surfaceInArg = MOS_NewArray(uint8_t, offset);
193     CM_CHK_NULL_RETURN_CMERROR(m_surfaceInArg);
194     MOS_ZeroMemory(m_data, sizeof(uint8_t)*offset);
195     MOS_ZeroMemory(m_surfaceInArg, sizeof(uint8_t)*offset);
196 
197     m_hashValue = m_kernelInfo->hashValue;
198 
199     return CM_SUCCESS;
200 }
201 
AllocateCurbe()202 MOS_STATUS CmKernelEx::AllocateCurbe()
203 {
204     MOS_DeleteArray(m_curbe);
205     if (m_explicitCurbeSize > 0)
206     {
207         m_curbeSize = MOS_ALIGN_CEIL(m_explicitCurbeSize, 64);
208         m_curbeSizePerThread = m_curbeSize;
209         m_curbeSizeCrossThread = 0;
210         m_curbe = MOS_NewArray(uint8_t, m_curbeSize);
211         CM_CHK_NULL_RETURN_MOSERROR(m_curbe);
212         MOS_ZeroMemory(m_curbe, m_curbeSize);
213     }
214     return MOS_STATUS_SUCCESS;
215 }
216 
MOS_STATUS CmKernelEx::AllocateCurbeAndFillImplicitArgs(CmThreadGroupSpace *globalGroupSpace)
{
    //! \brief    Allocates the curbe for a thread-group (gpgpu) dispatch and
    //!           fills the implicit arguments: group size, local size and the
    //!           per-thread local IDs.
    //! \param    globalGroupSpace  group space used to size the curbe; when
    //!                             nullptr the kernel's own m_threadGroupSpace
    //!                             is used instead
    //! \return   MOS_STATUS_SUCCESS, or a MOS error on allocation failure.
    CmThreadGroupSpace *tgs = (globalGroupSpace == nullptr)?m_threadGroupSpace:globalGroupSpace;

    uint32_t thrdSpaceWidth = 0;
    uint32_t thrdSpaceHeight = 0;
    uint32_t thrdSpaceDepth = 0;
    uint32_t grpSpaceWidth = 0;
    uint32_t grpSpaceHeight = 0;
    uint32_t grpSpaceDepth = 0;

    // Without a group space all dimensions stay 0 and only the cross-thread
    // section is allocated.
    if (tgs)
    {
        tgs->GetThreadGroupSpaceSize(thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth);
    }

    // Layout: a 32-byte-aligned cross-thread section followed by one
    // per-thread section for every thread in the group, total aligned to 64.
    // NOTE(review): the "%32 == 4" special case bumps the per-thread section
    // from 32 to 64 bytes — presumably an alignment workaround; confirm
    // against the HW walker spec.
    MOS_DeleteArray(m_curbe);
    m_curbeSizePerThread = (m_explicitCurbeSize%32 == 4)? 64:32;
    m_curbeSizeCrossThread = MOS_ALIGN_CEIL(m_explicitCurbeSize, 32);
    m_curbeSize = m_curbeSizeCrossThread + m_curbeSizePerThread * thrdSpaceWidth * thrdSpaceHeight * thrdSpaceDepth;
    m_curbeSize = MOS_ALIGN_CEIL(m_curbeSize, 64);
    m_curbe = MOS_NewArray(uint8_t, m_curbeSize);
    CM_CHK_NULL_RETURN_MOSERROR(m_curbe);
    MOS_ZeroMemory(m_curbe, m_curbeSize);

    // Locate the curbe offsets of the three implicit arguments, if present.
    int localIdPayload = -1;
    int groupSizePayload = -1;
    int localSizePayload = -1;

    for (uint32_t i = 0; i < m_flatArgCount; i++)
    {
        if (m_flatArgs[i].kind == ARG_KIND_IMPLICT_LOCALSIZE)
            localSizePayload = m_flatArgs[i].payloadOffset;
        if (m_flatArgs[i].kind == ARG_KIND_IMPLICT_GROUPSIZE)
            groupSizePayload = m_flatArgs[i].payloadOffset;
        if (m_flatArgs[i].kind == ARG_KIND_IMPLICIT_LOCALID)
            localIdPayload = m_flatArgs[i].payloadOffset;
    }

    // set group size implicit args: three consecutive dwords (width, height, depth)
    if (groupSizePayload >= 0)
    {
        *(uint32_t *)(m_curbe + groupSizePayload) = grpSpaceWidth;
        *(uint32_t *)(m_curbe + groupSizePayload + 4) = grpSpaceHeight;
        *(uint32_t *)(m_curbe + groupSizePayload + 8) = grpSpaceDepth;
    }

    // set local size implicit args: three consecutive dwords (width, height, depth)
    if (localSizePayload >= 0)
    {
        *(uint32_t *)(m_curbe + localSizePayload) = thrdSpaceWidth;
        *(uint32_t *)(m_curbe + localSizePayload + 4) = thrdSpaceHeight;
        *(uint32_t *)(m_curbe + localSizePayload + 8) = thrdSpaceDepth;
    }

    // set local id data per thread: each thread's per-thread section starts
    // with its (x, y, z) coordinates, threads ordered x-fastest
    if (localIdPayload >= 0)
    {
        int offset = localIdPayload;
        for (uint32_t idZ = 0; idZ < thrdSpaceDepth; idZ++)
        {
            for (uint32_t idY = 0; idY < thrdSpaceHeight; idY++)
            {
                for (uint32_t idX = 0; idX < thrdSpaceWidth; idX++)
                {
                    *(uint32_t *)(m_curbe + offset) = idX;
                    *(uint32_t *)(m_curbe + offset + 4) = idY;
                    *(uint32_t *)(m_curbe + offset + 8) = idZ;
                    offset += m_curbeSizePerThread;
                }
            }
        }
    }

    return MOS_STATUS_SUCCESS;
}
293 
IsSurface(uint16_t kind)294 bool CmKernelEx::IsSurface(uint16_t kind)
295 {
296     switch (kind)
297     {
298         case ARG_KIND_SURFACE:
299         case ARG_KIND_SURFACE_1D:
300         case ARG_KIND_SURFACE_2D:
301         case ARG_KIND_SURFACE_2D_UP:
302         case ARG_KIND_SURFACE_SAMPLER:
303         case ARG_KIND_SURFACE2DUP_SAMPLER:
304         case ARG_KIND_SURFACE_3D:
305         case ARG_KIND_SURFACE_SAMPLER8X8_AVS:
306         case ARG_KIND_SURFACE_SAMPLER8X8_VA:
307         case ARG_KIND_SURFACE_2D_SCOREBOARD:
308         case ARG_KIND_STATE_BUFFER:
309         case ARG_KIND_SURFACE_VME:
310             return true;
311         default:
312             return false;
313     }
314     return false;
315 }
316 
SetKernelArg(uint32_t index,size_t size,const void * value)317 int32_t CmKernelEx::SetKernelArg(uint32_t index, size_t size, const void * value)
318 {
319     if (!m_blCreatingGPUCopyKernel) // gpucopy kernels only executed by fastpath, no need to set legacy kernels
320     {
321         CmKernelRT::SetKernelArg(index, size, value);
322     }
323     if( index >= m_argCount )
324     {
325         CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
326         return CM_INVALID_ARG_INDEX;
327 
328     }
329 
330     if( !value)
331     {
332         CM_ASSERTMESSAGE("Error: Invalid kernel arg value.");
333         return CM_INVALID_ARG_VALUE;
334     }
335 
336     if( size == 0)
337     {
338         CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
339         return CM_INVALID_ARG_SIZE;
340     }
341 
342     uint32_t start = m_indexMap[index];
343     uint32_t len = m_indexMap[index + 1] - start;
344 
345     if (IsSurface(m_flatArgs[start].isaKind))
346     {
347         CMRT_UMD::SurfaceIndex *surfIndexes = (CMRT_UMD::SurfaceIndex *)value;
348         if (surfIndexes == (CMRT_UMD::SurfaceIndex *)CM_NULL_SURFACE)
349         {
350             for (uint32_t i = 0; i < len; i++)
351             {
352                 *(void **)(m_data + m_flatArgs[start + i].offset) = nullptr;
353                 *(void **)(m_surfaceInArg + m_flatArgs[start + i].offset) = nullptr;
354                 m_flatArgs[start + i].isSet = true;
355             }
356             return CM_SUCCESS;
357         }
358         // sanity check
359         if (len * sizeof(CMRT_UMD::SurfaceIndex) != size)
360         {
361             CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
362             return CM_INVALID_ARG_SIZE;
363         }
364 
365         for (uint32_t i = 0; i < len; i++)
366         {
367             uint32_t index = surfIndexes[i].get_data();
368 
369             m_flatArgs[start + i].isSet = true;
370             if (index == CM_NULL_SURFACE)
371             {
372                 *(void **)(m_data + m_flatArgs[start+i].offset) = nullptr;
373                 *(void **)(m_surfaceInArg + m_flatArgs[start+i].offset) = nullptr;
374             }
375             else
376             {
377                 CmSurface* surface = nullptr;
378                 m_surfaceMgr->GetSurface(index, surface);
379                 if (nullptr == surface)
380                 {
381                     *(void **)(m_data + m_flatArgs[start+i].offset) = nullptr;
382                     *(void **)(m_surfaceInArg + m_flatArgs[start+i].offset) = nullptr;
383                 }
384                 else
385                 {
386                     m_flatArgs[start + i].kind = ToArgKind(surface);
387 
388                     // get the CmSurfaceState from the surface index, this will be changed if surfmgr optimized
389                     // most likely, this will be moved to CmSurface
390                     CmSurfaceState *temp = GetSurfaceState(surface, index);
391                     *(CmSurfaceState **)(m_data + m_flatArgs[start + i].offset) = temp;
392                     *(CmSurface **)(m_surfaceInArg + m_flatArgs[start+i].offset) = surface;
393                     m_propertyIndexes[start + i] = surface->GetPropertyIndex();
394                     m_cmSurfIndexes[start + i] = index;
395                 }
396             }
397         }
398     }
399     else if (m_flatArgs[start].isaKind == ARG_KIND_SAMPLER) // only support 3D sampler and AVS sampler in fastpath
400     {
401         CMRT_UMD::SamplerIndex *samplerIndexes = (CMRT_UMD::SamplerIndex *)value;
402         // sanity check
403         if (len * sizeof(CMRT_UMD::SurfaceIndex) != size)
404         {
405             CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
406             return CM_INVALID_ARG_SIZE;
407         }
408 
409         for (uint32_t i = 0; i < len; i++)
410         {
411             uint32_t index = samplerIndexes[i].get_data();
412             MHW_SAMPLER_STATE_PARAM *temp = (MHW_SAMPLER_STATE_PARAM *)GetSamplerParam(index);
413             *(MHW_SAMPLER_STATE_PARAM **)(m_data + m_flatArgs[start + i].offset) = temp;
414         }
415     }
416     else
417     {
418         if (size != m_flatArgs[start].unitSize)
419         {
420             CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
421             return CM_INVALID_ARG_SIZE;
422         }
423         CmSafeMemCopy((void *)(m_data + m_flatArgs[start].offset), value, size);
424     }
425     return CM_SUCCESS;
426 }
427 
ToArgKind(CmSurface * surface)428 CM_ARG_KIND CmKernelEx::ToArgKind(CmSurface *surface)
429 {
430     switch(surface->Type())
431     {
432         case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
433             return ARG_KIND_SURFACE_1D;
434         case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
435             return ARG_KIND_SURFACE_2D;
436         case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
437             return ARG_KIND_SURFACE_2D_UP;
438         case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
439             return ARG_KIND_SURFACE_3D;
440         case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
441         {
442             CmSurfaceSampler* surfSampler = static_cast <CmSurfaceSampler *> (surface);
443             SAMPLER_SURFACE_TYPE type;
444             surfSampler->GetSurfaceType(type);
445             if (type == SAMPLER_SURFACE_TYPE_2D)
446             {
447                 return ARG_KIND_SURFACE_SAMPLER;
448             }
449             else if (type == SAMPLER_SURFACE_TYPE_2DUP)
450             {
451                 return ARG_KIND_SURFACE2DUP_SAMPLER;
452             }
453             else
454             {
455                 return ARG_KIND_SURFACE_3D;
456             }
457         }
458         case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
459         {
460             CmSurfaceSampler8x8* surfSampler8x8 = static_cast <CmSurfaceSampler8x8 *> (surface);
461             if (surfSampler8x8->GetSampler8x8SurfaceType() == CM_VA_SURFACE)
462             {
463                 return ARG_KIND_SURFACE_SAMPLER8X8_VA;
464             }
465             else
466             {
467                 return ARG_KIND_SURFACE_SAMPLER8X8_AVS;
468             }
469         }
470         case CM_ENUM_CLASS_TYPE_CMSURFACEVME:
471             return ARG_KIND_SURFACE_VME;
472         case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER:
473             return ARG_KIND_STATE_BUFFER;
474         default:
475             return ARG_KIND_GENERAL;
476     }
477 }
478 
GetSurfaceState(CmSurface * surface,uint32_t index)479 CmSurfaceState* CmKernelEx::GetSurfaceState(CmSurface *surface, uint32_t index)
480 {
481     CM_HAL_STATE *cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
482     uint32_t surfaceArraySize = 0;
483     m_surfaceMgr->GetSurfaceArraySize(surfaceArraySize);
484     CM_CHK_COND_RETURN((surfaceArraySize == 0), nullptr, "Surface Array is empty.");
485     uint32_t aliasIndex = index/surfaceArraySize;
486 
487     switch (surface->Type())
488     {
489         case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
490         {
491             CmSurface2DRT* surf2D = static_cast<CmSurface2DRT*>(surface);
492             uint32_t halIndex = 0;
493             surf2D->GetIndexFor2D(halIndex);
494             PCM_HAL_SURFACE2D_SURFACE_STATE_PARAM surfStateParam = nullptr;
495             if (aliasIndex > 0 || cmHalState->umdSurf2DTable[halIndex].surfStateSet)
496             {
497                 surfStateParam = &(cmHalState->umdSurf2DTable[halIndex].surfaceStateParam[aliasIndex]);
498             }
499             return cmHalState->umdSurf2DTable[halIndex].surfStateMgr->GetSurfaceState(0, 0, surfStateParam);
500         }
501         case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
502         {
503             CmSurface2DUPRT* surf2DUP = static_cast<CmSurface2DUPRT*>(surface);
504             uint32_t halIndex = 0;
505             surf2DUP->GetHandle(halIndex);
506             return cmHalState->surf2DUPTable[halIndex].surfStateMgr->GetSurfaceState();
507         }
508         case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
509         {
510             CmBuffer_RT* surf1D = static_cast<CmBuffer_RT*>(surface);
511             uint32_t halIndex = 0;
512             surf1D->GetHandle(halIndex);
513             CM_HAL_BUFFER_SURFACE_STATE_ENTRY *surfStateParam = nullptr;
514             if (aliasIndex > 0 || cmHalState->bufferTable[halIndex].surfStateSet)
515             {
516                 surfStateParam = &(cmHalState->bufferTable[halIndex].surfaceStateEntry[aliasIndex]);
517             }
518             return cmHalState->bufferTable[halIndex].surfStateMgr->GetSurfaceState(surfStateParam);
519         }
520         case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
521         {
522             CmSurface3DRT *surf3D = static_cast<CmSurface3DRT *>(surface);
523             uint32_t halIndex = 0;
524             surf3D->GetHandle(halIndex);
525             return cmHalState->surf3DTable[halIndex].surfStateMgr->GetSurfaceState(0, 1);
526         }
527         case CM_ENUM_CLASS_TYPE_CMSURFACEVME:
528         {
529             CmSurfaceVme *surfVme = static_cast<CmSurfaceVme*>(surface);
530             CmSurfaceStateVME *surfState = surfVme->GetSurfaceState();
531             if (surfState == nullptr)
532             {
533                 int argSize = surfVme->GetVmeCmArgSize();
534                 int surfCount = surfVme->GetTotalSurfacesCount();
535 
536                 uint8_t *vmeValue = MOS_NewArray(uint8_t, argSize);
537                 if (vmeValue == nullptr)
538                 {
539                     return nullptr;
540                 }
541                 uint16_t surfIndexes[17];
542                 SetArgsSingleVme(surfVme, vmeValue, surfIndexes);
543                 surfState = MOS_New(CmSurfaceStateVME, cmHalState);
544                 if (surfState == nullptr)
545                 {
546                     return nullptr;
547                 }
548                 surfState->Initialize((CM_HAL_VME_ARG_VALUE *)vmeValue);
549 
550                 surfVme->SetSurfState(cmHalState->advExecutor, vmeValue, surfState); // set for destroy later
551             }
552             return surfState;
553         }
554         case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
555         {
556             uint32_t halIndex = 0;
557             uint16_t cmIndex = 0;
558             CmSurfaceSampler* surfSampler = static_cast <CmSurfaceSampler *> (surface);
559             surfSampler->GetHandle(halIndex);
560             surfSampler->GetCmIndexCurrent(cmIndex);
561             SAMPLER_SURFACE_TYPE type;
562             surfSampler->GetSurfaceType(type);
563             switch (type)
564             {
565                 case SAMPLER_SURFACE_TYPE_2D:
566                 {
567                     // re-calculate the aliasIndex
568                     aliasIndex = cmIndex/surfaceArraySize;
569 
570                     PCM_HAL_SURFACE2D_SURFACE_STATE_PARAM surfStateParam = nullptr;
571                     if (aliasIndex > 0 || cmHalState->umdSurf2DTable[halIndex].surfStateSet)
572                     {
573                         surfStateParam = &(cmHalState->umdSurf2DTable[halIndex].surfaceStateParam[aliasIndex]);
574                     }
575                     return cmHalState->umdSurf2DTable[halIndex].surfStateMgr->GetSurfaceState(0, 1, surfStateParam);
576                 }
577                 case SAMPLER_SURFACE_TYPE_2DUP:
578                 {
579                     return cmHalState->surf2DUPTable[halIndex].surfStateMgr->GetSurfaceState(0, 1);
580                 }
581                 case SAMPLER_SURFACE_TYPE_3D:
582                 {
583                     return cmHalState->surf3DTable[halIndex].surfStateMgr->GetSurfaceState(0, 1);
584                 }
585                 default:
586                 {
587                 }
588             }
589         }
590         case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
591         {
592             CmSurfaceSampler8x8* surfSampler8x8 = static_cast <CmSurfaceSampler8x8 *> (surface);
593             uint32_t halIndex = 0;
594             uint16_t cmIndex = 0;
595 
596             surfSampler8x8->GetIndexCurrent(halIndex);
597             surfSampler8x8->GetCmIndex(cmIndex);
598             // re-calculate the aliasIndex
599             aliasIndex = cmIndex/surfaceArraySize;
600 
601             PCM_HAL_SURFACE2D_SURFACE_STATE_PARAM surfStateParam = nullptr;
602             if (aliasIndex > 0 || cmHalState->umdSurf2DTable[halIndex].surfStateSet)
603             {
604                 surfStateParam = &(cmHalState->umdSurf2DTable[halIndex].surfaceStateParam[aliasIndex]);
605             }
606             return cmHalState->umdSurf2DTable[halIndex].surfStateMgr->GetSurfaceState(1, 1, surfStateParam);
607         }
608         default: //not implemented yet
609             return nullptr;
610 
611     }
612     return nullptr;
613 }
614 
GetMaxBteNum()615 uint32_t CmKernelEx::GetMaxBteNum()
616 {
617     uint32_t bteCount = 0;
618     for (uint32_t i = 0; i < m_flatArgCount; i++)
619     {
620         if (IsSurface(m_flatArgs[i].kind))
621         {
622             CmSurfaceState *surfState = *(CmSurfaceState **)(m_data + m_flatArgs[i].offset);
623             if (surfState == nullptr) //CM_NULL_SURFACE
624             {
625                 continue;
626             }
627             bteCount += surfState->GetNumBte();
628         }
629     }
630     return bteCount;
631 }
632 
MOS_STATUS CmKernelEx::UpdateCurbe(CmSSH *ssh, CmMediaState *mediaState, uint32_t kernelIdx)
{
    //! \brief    Patches the curbe just before submission: surface and sampler
    //!           arguments get their final binding-table indexes written in,
    //!           plain data arguments are copied from the local buffer.
    //! \param    ssh         surface state heap receiving the surface states
    //! \param    mediaState  media state receiving the sampler states
    //! \param    kernelIdx   index of this kernel within the media state
    //! \return   MOS_STATUS_SUCCESS.
    for (uint32_t i = 0; i < m_flatArgCount; i++)
    {
        if (IsSurface(m_flatArgs[i].kind))
        {
            CmSurface *surface = *(CmSurface **)(m_surfaceInArg + m_flatArgs[i].offset);
            if (surface != nullptr && m_propertyIndexes[i] != surface->GetPropertyIndex())
            {
                // need to update the surface state: the surface changed since
                // SetKernelArg (the property index presumably acts as a
                // generation/dirty counter — confirm in CmSurface)
                CmSurfaceState *temp = GetSurfaceState(surface, m_cmSurfIndexes[i]);
                m_propertyIndexes[i] = surface->GetPropertyIndex();
                *(CmSurfaceState **)(m_data + m_flatArgs[i].offset) = temp;
            }
            CmSurfaceState *surfState = *(CmSurfaceState **)(m_data + m_flatArgs[i].offset);
            if (surfState == nullptr)
            {
                // CM_NULL_SURFACE: leave this curbe slot untouched.
                continue;
            }
            // Write the binding-table index assigned by the SSH into the curbe.
            uint32_t bteIdx = ssh->AddSurfaceState(surfState);
            *(uint32_t *)(m_curbe + m_flatArgs[i].payloadOffset) = bteIdx;
        }
        else if (m_flatArgs[i].kind == ARG_KIND_SAMPLER)
        {
            // Samplers go to the media state; its index is written to the curbe.
            MHW_SAMPLER_STATE_PARAM *param = *(MHW_SAMPLER_STATE_PARAM **)(m_data + m_flatArgs[i].offset);
            uint32_t bteIdx = mediaState->AddSampler(param, kernelIdx);
            *(uint32_t *)(m_curbe + m_flatArgs[i].payloadOffset) = bteIdx;
        }
        else if (m_flatArgs[i].kind != ARG_KIND_IMPLICT_LOCALSIZE
                 && m_flatArgs[i].kind != ARG_KIND_IMPLICT_GROUPSIZE
                 && m_flatArgs[i].kind != ARG_KIND_IMPLICIT_LOCALID)
        {
            // Plain data argument: copy the stored value into the curbe.
            // Implicit args are skipped — AllocateCurbeAndFillImplicitArgs
            // already wrote them.
            MOS_SecureMemcpy(m_curbe + m_flatArgs[i].payloadOffset, m_flatArgs[i].sizeInCurbe,
                m_data + m_flatArgs[i].offset, m_flatArgs[i].unitSize);
        }
    }

    // dump
    /*
    for (int i = 0; i < m_curbeSize/4; i++)
    {
        printf("0x%x, ", *((uint32_t *)m_curbe + i));
    }
    printf("\n");
    */
    return MOS_STATUS_SUCCESS;
}
680 
UpdateFastTracker(uint32_t trackerIndex,uint32_t tracker)681 MOS_STATUS CmKernelEx::UpdateFastTracker(uint32_t trackerIndex, uint32_t tracker)
682 {
683     for (uint32_t i = 0; i < m_flatArgCount; i++)
684     {
685         if (IsSurface(m_flatArgs[i].kind))
686         {
687             CmSurface *surface = *(CmSurface **)(m_surfaceInArg + m_flatArgs[i].offset);
688             if (surface == nullptr)
689             {
690                 continue;
691             }
692             surface->SetFastTracker(trackerIndex, tracker);
693         }
694     }
695     return MOS_STATUS_SUCCESS;
696 }
697 
698 
UpdateSWSBArgs(CmThreadSpaceRT * threadSpace)699 MOS_STATUS CmKernelEx::UpdateSWSBArgs(CmThreadSpaceRT *threadSpace)
700 {
701     CmThreadSpaceRT *ts = (threadSpace == nullptr)?m_threadSpace:threadSpace;
702     if (ts == nullptr)
703     {
704         return MOS_STATUS_SUCCESS;
705     }
706     int ret = ts->SetDependencyArgToKernel(this);
707     return (ret == 0)? MOS_STATUS_SUCCESS : MOS_STATUS_UNKNOWN;
708 }
709 
SetStaticBuffer(uint32_t index,const void * value)710 int32_t CmKernelEx::SetStaticBuffer(uint32_t index, const void *value)
711 {
712     CM_CHK_CMSTATUS_RETURN(CmKernelRT::SetStaticBuffer(index, value));
713 
714     if(index >= CM_GLOBAL_SURFACE_NUMBER)
715     {
716         CM_ASSERTMESSAGE("Error: Surface Index exceeds max global surface number.");
717         return CM_INVALID_GLOBAL_BUFFER_INDEX;
718     }
719 
720     if(!value)
721     {
722         CM_ASSERTMESSAGE("Error: Invalid StaticBuffer arg value.");
723         return CM_INVALID_BUFFER_HANDLER;
724     }
725 
726     SurfaceIndex* surfIndex = (SurfaceIndex* )value;
727     uint32_t indexData = surfIndex->get_data();
728 
729     CmSurface* surface = nullptr;
730     m_surfaceMgr->GetSurface(indexData, surface);
731     if (surface != nullptr)
732     {
733         // for gen9+ platforms, index + 1 is the BTI
734         m_reservedSurfaceBteIndexes[index + CM_GLOBAL_SURFACE_INDEX_START_GEN9_PLUS]
735                                                 = GetSurfaceState(surface, indexData);
736     }
737     return CM_SUCCESS;
738 }
739 
SetSurfaceBTI(SurfaceIndex * surfIndex,uint32_t bti)740 int32_t CmKernelEx::SetSurfaceBTI(SurfaceIndex *surfIndex, uint32_t bti)
741 {
742     CM_CHK_CMSTATUS_RETURN(CmKernelRT::SetSurfaceBTI(surfIndex, bti));
743 
744     CM_CHK_NULL_RETURN_CMERROR(surfIndex);
745     uint32_t index = surfIndex->get_data();
746 
747     CmSurface* surface = nullptr;
748     m_surfaceMgr->GetSurface(index, surface);
749     if (surface != nullptr)
750     {
751         m_reservedSurfaceBteIndexes[bti] = GetSurfaceState(surface, index);
752     }
753     return CM_SUCCESS;
754 }
755 
SetSamplerBTI(SamplerIndex * sampler,uint32_t nIndex)756 int32_t CmKernelEx::SetSamplerBTI(SamplerIndex* sampler, uint32_t nIndex)
757 {
758     CM_CHK_CMSTATUS_RETURN(CmKernelRT::SetSamplerBTI(sampler, nIndex));
759 
760     uint32_t index = sampler->get_data();
761     m_reservedSamplerBteIndexes[nIndex] = (MHW_SAMPLER_STATE_PARAM *)GetSamplerParam(index);
762     return MOS_STATUS_SUCCESS;
763 }
764 
LoadReservedSurfaces(CmSSH * ssh)765 MOS_STATUS CmKernelEx::LoadReservedSurfaces(CmSSH *ssh)
766 {
767     for (auto it = m_reservedSurfaceBteIndexes.begin(); it != m_reservedSurfaceBteIndexes.end(); ++ it)
768     {
769         ssh->AddSurfaceState(it->second, it->first);
770     }
771 
772     // reset the table in legacy kernel for bti reuse
773     if (m_usKernelPayloadSurfaceCount)
774     {
775         CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
776         m_usKernelPayloadSurfaceCount = 0;
777     }
778     return MOS_STATUS_SUCCESS;
779 }
780 
LoadReservedSamplers(CmMediaState * mediaState,uint32_t kernelIdx)781 MOS_STATUS CmKernelEx::LoadReservedSamplers(CmMediaState *mediaState, uint32_t kernelIdx)
782 {
783     for (auto it = m_reservedSamplerBteIndexes.begin(); it != m_reservedSamplerBteIndexes.end(); ++ it)
784     {
785         mediaState->AddSampler((MHW_SAMPLER_STATE_PARAM *)it->second, kernelIdx, it->first);
786     }
787     return MOS_STATUS_SUCCESS;
788 }
789 
GetSamplerParam(uint32_t index)790 void* CmKernelEx::GetSamplerParam(uint32_t index)
791 {
792     CM_HAL_STATE *cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
793     return (void *)&cmHalState->samplerTable[index];
794 }
795 
GetSamplerCount(uint32_t * count3D,uint32_t * countAVS)796 MOS_STATUS CmKernelEx::GetSamplerCount(uint32_t *count3D, uint32_t *countAVS)
797 {
798     *count3D = 0;
799     *countAVS = 0;
800     for (uint32_t i = 0; i < m_flatArgCount; i++)
801     {
802         if (m_flatArgs[i].kind == ARG_KIND_SAMPLER)
803         {
804             MHW_SAMPLER_STATE_PARAM *temp = *(MHW_SAMPLER_STATE_PARAM **)(m_data + m_flatArgs[i].offset);
805             if (temp->SamplerType == MHW_SAMPLER_TYPE_3D)
806             {
807                 ++ (*count3D);
808             }
809             else if (temp->SamplerType == MHW_SAMPLER_TYPE_AVS)
810             {
811                 ++ (*countAVS);
812             }
813             else
814             {
815                 // only support 3D and AVS samplers by now in fast path
816                 return MOS_STATUS_INVALID_PARAMETER;
817             }
818         }
819     }
820     return MOS_STATUS_SUCCESS;
821 }
822 
GetThreadSpaceEx()823 CmThreadSpaceRT* CmKernelEx::GetThreadSpaceEx()
824 {
825     if (m_threadSpace)
826     {
827         return m_threadSpace;
828     }
829     if (m_dummyThreadSpace)
830     {
831         m_device->DestroyThreadSpace(m_dummyThreadSpace);
832     }
833     if (m_threadCount)
834     {
835         m_device->CreateThreadSpace(m_threadCount, 1, m_dummyThreadSpace);
836     }
837     return static_cast<CmThreadSpaceRT *>(m_dummyThreadSpace);
838 }
839 
GetThreadGroupSpaceEx()840 CmThreadGroupSpace* CmKernelEx::GetThreadGroupSpaceEx()
841 {
842     if (m_threadGroupSpace)
843     {
844         return m_threadGroupSpace;
845     }
846     if (m_dummyThreadGroupSpace)
847     {
848         m_device->DestroyThreadGroupSpace(m_dummyThreadGroupSpace);
849     }
850 
851     if (m_threadCount)
852     {
853         m_device->CreateThreadGroupSpace(1, 1, m_threadCount, 1, m_dummyThreadGroupSpace);
854     }
855     return m_dummyThreadGroupSpace;
856 }
857 
SurfaceDumpEx(uint32_t kernelNumber,int32_t taskId)858 void CmKernelEx::SurfaceDumpEx(uint32_t kernelNumber, int32_t taskId)
859 {
860     for(uint32_t argIdx = 0; argIdx < m_argCount; argIdx++)
861     {
862         uint32_t start = m_indexMap[argIdx];
863         uint32_t len = m_indexMap[argIdx + 1] - start;
864 
865         for (uint32_t v = 0; v < len; v ++)
866         {
867             uint32_t i = start + v;
868             if (IsSurface(m_flatArgs[i].kind))
869             {
870                 CmSurface *surface = *(CmSurface **)(m_surfaceInArg + m_flatArgs[i].offset);
871                 if (surface == nullptr)
872                 {
873                     continue;
874                 }
875                 surface->DumpContent(kernelNumber, m_kernelInfo->kernelName, taskId, argIdx, v);
876             }
877         }
878     }
879 }
880 
IsFastPathSupported()881 bool CmKernelEx::IsFastPathSupported()
882 {
883     // current fast path doesn't support media object
884     bool specialDependency = false;
885     if (m_threadSpace)
886     {
887         CM_DEPENDENCY_PATTERN dependencyPatternType = CM_NONE_DEPENDENCY;
888         m_threadSpace->GetDependencyPatternType(dependencyPatternType);
889         specialDependency = (dependencyPatternType == CM_WAVEFRONT26Z || dependencyPatternType == CM_WAVEFRONT26ZI);
890     }
891 
892     return !(m_perThreadArgExists || specialDependency);
893 }
894 
895