/*
 * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#define NVOC_KERNEL_MIG_MANAGER_H_PRIVATE_ACCESS_ALLOWED

#include "kernel/gpu/mig_mgr/kernel_mig_manager.h"
#include "kernel/gpu/gr/kernel_graphics.h"
#include "kernel/gpu/rc/kernel_rc.h"
#include "kernel/gpu/device/device.h"
#include "kernel/gpu/subdevice/subdevice.h"
#include "kernel/gpu/mig_mgr/compute_instance_subscription.h"
#include "kernel/gpu/mig_mgr/gpu_instance_subscription.h"
#include "kernel/gpu/mem_mgr/mem_mgr.h"
#include "kernel/gpu/mem_sys/kern_mem_sys.h"
#include "kernel/gpu/ce/kernel_ce.h"
#include "kernel/gpu/mmu/kern_gmmu.h"
#include "kernel/gpu/bus/kern_bus.h"
#include "kernel/gpu/mem_mgr/heap.h"
#include "kernel/gpu/nvlink/kernel_nvlink.h"
#include "kernel/gpu/gpu_engine_type.h"
#include "kernel/gpu/gpu_fabric_probe.h"
#include "rmapi/client.h"
#include "rmapi/rs_utils.h"
#include "rmapi/rmapi_utils.h"
#include "gpu/mem_mgr/mem_scrub.h"
#include "vgpu/rpc.h"
#include "virtualization/kernel_vgpu_mgr.h"
#include "virtualization/hypervisor/hypervisor.h"
#include "kernel/gpu/gr/kernel_graphics_manager.h"
#include "kernel/gpu/intr/intr.h"
#include "kernel/core/locks.h"
#include "class/cl503b.h"
#include "nv_ref.h"
#include "platform/sli/sli.h"
#include "nvRmReg.h"

#include "kernel/gpu/ccu/kernel_ccu.h"

struct KERNEL_MIG_MANAGER_PRIVATE_DATA
{
    NvBool bInitialized;
    KERNEL_MIG_MANAGER_STATIC_INFO staticInfo;
};

typedef struct
{
    struct
    {
        NvBool bValid;
        NvU32  flags;
        NV_RANGE placement;
    } GIs[NV2080_CTRL_GPU_MAX_PARTITIONS];
    struct
    {
        NvBool bValid;
        NvU32  flags;
        NvU32  ceCount;
        NvU32  nvEncCount;
        NvU32  nvDecCount;
        NvU32  nvJpgCount;
        NvU32  ofaCount;
        NvU32  spanStart;
        NvU32  GIIdx;
    } CIs[NVC637_CTRL_MAX_EXEC_PARTITIONS];
} MIG_BOOT_CONFIG;
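
//
// Illustrative sketch (not part of the driver flow): how a parsed boot
// configuration might populate MIG_BOOT_CONFIG. The flag value and placement
// below are hypothetical examples, not a validated configuration.
//
//   MIG_BOOT_CONFIG bootConfig;
//   portMemSet(&bootConfig, 0, sizeof(bootConfig));
//
//   // One GPU instance in the first half of the placement span
//   bootConfig.GIs[0].bValid    = NV_TRUE;
//   bootConfig.GIs[0].flags     = DRF_DEF(2080_CTRL_GPU, _PARTITION_FLAG, _MEMORY_SIZE, _HALF);
//   bootConfig.GIs[0].placement = rangeMake(0, 3);
//
//   // One compute instance with a single CE, hosted by GIs[0]
//   bootConfig.CIs[0].bValid    = NV_TRUE;
//   bootConfig.CIs[0].ceCount   = 1;
//   bootConfig.CIs[0].spanStart = 0;
//   bootConfig.CIs[0].GIIdx     = 0;
//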

/*!
 * @brief Function to increment GI/CI refcount
 */
NV_STATUS
kmigmgrIncRefCount_IMPL
(
    RsShared *pShared
)
{
    NvS32 refCount;

    NV_ASSERT_OR_RETURN(pShared != NULL, NV_ERR_INVALID_ARGUMENT);

    serverRefShare(&g_resServ, pShared);
    refCount = serverGetShareRefCount(&g_resServ, pShared);

    // Make sure refCount didn't overflow
    NV_ASSERT_OR_RETURN(refCount > 0, NV_ERR_INVALID_STATE);
    return NV_OK;
}

/*!
 * @brief Function to decrement GI/CI refcount
 */
NV_STATUS
kmigmgrDecRefCount_IMPL
(
    RsShared *pShared
)
{
    NvS32 refCount;

    NV_ASSERT_OR_RETURN(pShared != NULL, NV_ERR_INVALID_ARGUMENT);

    refCount = serverGetShareRefCount(&g_resServ, pShared);
    serverFreeShare(&g_resServ, pShared);
    --refCount;

    // Make sure refCount didn't underflow
    NV_ASSERT_OR_RETURN(refCount > 0, NV_ERR_INVALID_STATE);
    return NV_OK;
}
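
//
// These helpers are intended to be used in matched pairs around the lifetime
// of a GI/CI subscription. A usage sketch (pGpuInstance is hypothetical):
//
//   NV_ASSERT_OK_OR_RETURN(kmigmgrIncRefCount(pGpuInstance->pShare));
//   // ... use the instance ...
//   NV_ASSERT_OK(kmigmgrDecRefCount(pGpuInstance->pShare));
//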

/*! @brief create a reference to a single GPU instance, no compute instance */
MIG_INSTANCE_REF
kmigmgrMakeGIReference_IMPL
(
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
)
{
    MIG_INSTANCE_REF ref = { pKernelMIGGpuInstance, NULL };
    return ref;
}

/*! @brief create a reference to a compute instance */
MIG_INSTANCE_REF
kmigmgrMakeCIReference_IMPL
(
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
    MIG_COMPUTE_INSTANCE *pMIGComputeInstance
)
{
    MIG_INSTANCE_REF ref = { pKernelMIGGpuInstance, pMIGComputeInstance };
    return ref;
}

/*! @brief create a Ref referencing no GI/CI */
MIG_INSTANCE_REF
kmigmgrMakeNoMIGReference_IMPL(void)
{
    MIG_INSTANCE_REF ref = { NULL, NULL };
    return ref;
}

/*! @brief check if MIG attribution id is valid for max instances */
NvBool
kmigmgrIsInstanceAttributionIdValid_IMPL
(
    NvU16 id
)
{
    return (((id / KMIGMGR_MAX_GPU_SWIZZID) <= KMIGMGR_MAX_GPU_INSTANCES) &&
            ((id % KMIGMGR_MAX_GPU_SWIZZID) <= KMIGMGR_MAX_COMPUTE_INSTANCES));
}

/*! @brief check if existing valid instance ref is passed in */
NvBool
kmigmgrIsMIGReferenceValid_IMPL
(
    MIG_INSTANCE_REF *pRef
)
{
    // Invalid argument
    NV_CHECK_OR_RETURN(LEVEL_SILENT, pRef != NULL, NV_FALSE);
    // Invalid argument
    NV_CHECK_OR_RETURN(LEVEL_SILENT, !((pRef->pKernelMIGGpuInstance == NULL) &&
                                       (pRef->pMIGComputeInstance != NULL)), NV_FALSE);

    NV_CHECK_OR_RETURN(LEVEL_SILENT, pRef->pKernelMIGGpuInstance != NULL, NV_FALSE);
    NV_ASSERT_OR_RETURN(pRef->pKernelMIGGpuInstance->bValid, NV_FALSE);

    // If we reached this point, the GPU instance is valid
    NV_CHECK_OR_RETURN(LEVEL_SILENT, pRef->pMIGComputeInstance != NULL, NV_TRUE);
    NV_ASSERT_OR_RETURN(pRef->pMIGComputeInstance->bValid, NV_FALSE);

    return NV_TRUE;
}

/*! @brief check if the same instance(s) are passed in; only compare GI if lhs has no CI */
NvBool
kmigmgrAreMIGReferencesSame_IMPL
(
    MIG_INSTANCE_REF *pRefA,
    MIG_INSTANCE_REF *pRefB
)
{
    NV_CHECK_OR_RETURN(LEVEL_SILENT, kmigmgrIsMIGReferenceValid(pRefA) &&
                                     kmigmgrIsMIGReferenceValid(pRefB), NV_FALSE);

    if ((pRefA->pKernelMIGGpuInstance != pRefB->pKernelMIGGpuInstance) ||
        ((pRefA->pMIGComputeInstance != NULL) &&
         (pRefA->pMIGComputeInstance != pRefB->pMIGComputeInstance)))
    {
        return NV_FALSE;
    }

    return NV_TRUE;
}

/*!
 * @brief Count set bits within range indicated by given base type in bitvector
 *
 * @param[in] pEngines      Bitvector to count
 * @param[in] rmEngineType  0th index RM_ENGINE_TYPE, only partitionable engines supported
 */
NvU32
kmigmgrCountEnginesOfType_IMPL
(
    const ENGTYPE_BIT_VECTOR *pEngines,
    RM_ENGINE_TYPE rmEngineType
)
{
    NV_RANGE range = rangeMake(rmEngineType, rmEngineType);
    ENGTYPE_BIT_VECTOR mask;

    if (pEngines == NULL)
        return 0;

    if (!RM_ENGINE_TYPE_IS_VALID(rmEngineType))
        return 0;

    if (RM_ENGINE_TYPE_IS_GR(rmEngineType))
        range = RM_ENGINE_RANGE_GR();
    else if (RM_ENGINE_TYPE_IS_COPY(rmEngineType))
        range = RM_ENGINE_RANGE_COPY();
    else if (RM_ENGINE_TYPE_IS_NVDEC(rmEngineType))
        range = RM_ENGINE_RANGE_NVDEC();
    else if (RM_ENGINE_TYPE_IS_NVENC(rmEngineType))
        range = RM_ENGINE_RANGE_NVENC();
    else if (RM_ENGINE_TYPE_IS_NVJPEG(rmEngineType))
        range = RM_ENGINE_RANGE_NVJPEG();
    else if (RM_ENGINE_TYPE_IS_OFA(rmEngineType))
        range = RM_ENGINE_RANGE_OFA();

    bitVectorClrAll(&mask);
    bitVectorSetRange(&mask, range);
    bitVectorAnd(&mask, &mask, pEngines);
    return bitVectorCountSetBits(&mask);
}
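
//
// Example (illustrative only): counting the copy engines in a vector. Any CE
// index selects the whole RM_ENGINE_RANGE_COPY() range, so the query below
// counts every CE that is set, not just COPY(1).
//
//   ENGTYPE_BIT_VECTOR engines;
//   bitVectorClrAll(&engines);
//   bitVectorSet(&engines, RM_ENGINE_TYPE_COPY(0));
//   bitVectorSet(&engines, RM_ENGINE_TYPE_COPY(3));
//   // Returns 2: both CEs fall within RM_ENGINE_RANGE_COPY()
//   NvU32 ceCount = kmigmgrCountEnginesOfType(&engines, RM_ENGINE_TYPE_COPY(1));
//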

/*!
 * @brief Calculate the attribution ID for the given MIG instance reference.
 *
 * @note the attribution ID is an encoding of gpu/compute instance IDs dependent
 *       upon the maximum values of these IDs which must be queried by the
 *       recipient in order to decode. Attribution values for NULL or lone
 *       GPU instances will produce non-zero attribution IDs which will decode to
 *       out-of-range values for both IDs.
 *
 * @param[in] ref   Reference to a GI/CI
 *
 * @return the encoded attribution ID
 */
NvU16
kmigmgrGetAttributionIdFromMIGReference_IMPL
(
    MIG_INSTANCE_REF ref
)
{
    NvU16 giID = KMIGMGR_MAX_GPU_SWIZZID;
    NvU16 ciID = KMIGMGR_MAX_COMPUTE_INSTANCES;

    //
    // Inverting this encoding depends upon the compute instance IDs having a
    // shorter range than the gpu instance IDs, otherwise high compute instance
    // IDs will cause aliasing
    //
    ct_assert(KMIGMGR_MAX_COMPUTE_INSTANCES < KMIGMGR_MAX_GPU_SWIZZID);

    // We are also depending on this encoding fitting in 16 bits...
    ct_assert((KMIGMGR_MAX_GPU_SWIZZID * KMIGMGR_MAX_COMPUTE_INSTANCES) <= NV_U16_MAX);

    if (kmigmgrIsMIGReferenceValid(&ref) &&
        (ref.pKernelMIGGpuInstance->swizzId < KMIGMGR_MAX_GPU_SWIZZID))
    {
        giID = (NvU16)ref.pKernelMIGGpuInstance->swizzId;
        if ((ref.pMIGComputeInstance != NULL) &&
            (ref.pMIGComputeInstance->id < KMIGMGR_MAX_COMPUTE_INSTANCES))
        {
            ciID = (NvU16)ref.pMIGComputeInstance->id;
        }
    }

    return (giID * KMIGMGR_MAX_GPU_SWIZZID) + ciID;
}
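
//
// Sketch of how a recipient that has queried KMIGMGR_MAX_GPU_SWIZZID and
// KMIGMGR_MAX_COMPUTE_INSTANCES could invert the encoding above. Because
// ciID < KMIGMGR_MAX_GPU_SWIZZID always holds, division recovers giID and the
// remainder recovers ciID; the sentinel values used for NULL refs decode to
// out-of-range IDs, as noted in the function comment.
//
//   NvU16 giID = attributionId / KMIGMGR_MAX_GPU_SWIZZID;
//   NvU16 ciID = attributionId % KMIGMGR_MAX_GPU_SWIZZID;
//   NvBool bHasGI = (giID < KMIGMGR_MAX_GPU_SWIZZID);
//   NvBool bHasCI = (ciID < KMIGMGR_MAX_COMPUTE_INSTANCES);
//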

/*!
 * @brief Function to convert an engine type from one bitvector to a
 *        corresponding engine type in another bitvector. The two bitvectors
 *        are expected to have the same set bit count.
 */
NV_STATUS
kmigmgrEngineTypeXlate_IMPL
(
    ENGTYPE_BIT_VECTOR *pSrc,
    RM_ENGINE_TYPE srcEngineType,
    ENGTYPE_BIT_VECTOR *pDst,
    RM_ENGINE_TYPE *pDstEngineType
)
{
    RM_ENGINE_TYPE tempSrcEngineType;
    RM_ENGINE_TYPE tempDstEngineType;
    NvBool bFound;

    NV_ASSERT_OR_RETURN(pSrc != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_ASSERT_OR_RETURN(pDst != NULL, NV_ERR_INVALID_ARGUMENT);
    NV_ASSERT_OR_RETURN(pDstEngineType != NULL, NV_ERR_INVALID_ARGUMENT);

    if (!bitVectorTest(pSrc, srcEngineType))
        return NV_ERR_OBJECT_NOT_FOUND;

    // Iterate over both masks at the same time
    bFound = NV_FALSE;
    FOR_EACH_IN_BITVECTOR_PAIR(pSrc, tempSrcEngineType, pDst, tempDstEngineType)
    {
        bFound = (srcEngineType == tempSrcEngineType);
        if (bFound)
            break;
    }
    FOR_EACH_IN_BITVECTOR_PAIR_END();

    // We already checked that the engine is present above, this should never fire
    NV_ASSERT(bFound);

    *pDstEngineType = tempDstEngineType;

    return NV_OK;
}
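
//
// Example (illustrative only): with a local vector {GR0, GR1} paired against
// a physical vector {GR2, GR5}, translating local GR1 yields physical GR5,
// since set bits are paired in order. localEngines/physicalEngines are
// hypothetical stand-ins for an instance's local and physical masks.
//
//   RM_ENGINE_TYPE physEngineType;
//   NV_ASSERT_OK(
//       kmigmgrEngineTypeXlate(&localEngines, RM_ENGINE_TYPE_GR(1),
//                              &physicalEngines, &physEngineType));
//   // physEngineType == RM_ENGINE_TYPE_GR(5)
//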

//
// below algorithm depends on contiguity of all partitionable engine values
// in RM_ENGINE_TYPE, so add asserts here.
// Note - this only checks the first and last ID, a proper check would account
// for all entries, but that's not possible at this time.
//
ct_assert((RM_ENGINE_TYPE_GR(RM_ENGINE_TYPE_GR_SIZE - 1) -
           RM_ENGINE_TYPE_GR(0)) == (RM_ENGINE_TYPE_GR_SIZE - 1));
ct_assert((RM_ENGINE_TYPE_COPY(RM_ENGINE_TYPE_COPY_SIZE - 1) -
           RM_ENGINE_TYPE_COPY(0)) == (RM_ENGINE_TYPE_COPY_SIZE - 1));
ct_assert((RM_ENGINE_TYPE_NVDEC(RM_ENGINE_TYPE_NVDEC_SIZE - 1) -
           RM_ENGINE_TYPE_NVDEC(0)) == (RM_ENGINE_TYPE_NVDEC_SIZE - 1));
ct_assert((RM_ENGINE_TYPE_NVENC(RM_ENGINE_TYPE_NVENC_SIZE - 1) -
           RM_ENGINE_TYPE_NVENC(0)) == (RM_ENGINE_TYPE_NVENC_SIZE - 1));

/*!
 * @brief Chooses the engines of the given type to allocate. Supports
 *        shared/exclusive ownership arbitration.
 *
 * @param[IN]   pSourceEngines       Mask of engines in an instance
 * @param[IN]   bShared              NV_TRUE if engines should be shared
 * @param[IN]   engTypeRange         NV_RANGE of bit indices for this eng type
 * @param[IN]   reqEngCount          Requested number of engines in this CI
 * @param[I/O]  pOutEngines          Mask of engines already/newly allocated
 * @param[I/O]  pExclusiveEngines    Mask of already exclusively-allocated engines
 * @param[I/O]  pSharedEngines       Mask of engines shared by other instances
 * @param[IN]   pAllocatableEngines  Mask of engines that are allocatable
 */
NV_STATUS
kmigmgrAllocateInstanceEngines_IMPL
(
    ENGTYPE_BIT_VECTOR *pSourceEngines,
    NvBool bShared,
    NV_RANGE engTypeRange,
    NvU32 reqEngCount,
    ENGTYPE_BIT_VECTOR *pOutEngines,
    ENGTYPE_BIT_VECTOR *pExclusiveEngines,
    ENGTYPE_BIT_VECTOR *pSharedEngines,
    ENGTYPE_BIT_VECTOR *pAllocatableEngines
)
{
    NvU32 allocated = 0;
    ENGTYPE_BIT_VECTOR engines;
    RM_ENGINE_TYPE rmEngineType;
    NvU32 localIdx;

    // Ensure allocatableEngines is a subset of sourceEngines
    bitVectorClrAll(&engines);
    bitVectorAnd(&engines, pAllocatableEngines, pSourceEngines);
    NV_ASSERT_OR_RETURN(bitVectorTestEqual(&engines, pAllocatableEngines), NV_ERR_INVALID_STATE);

    // If using shared engines, allocate as many from existing shared engines as possible
    if (bShared)
    {
        bitVectorClrAll(&engines);
        bitVectorSetRange(&engines, engTypeRange);
        bitVectorAnd(&engines, &engines, pSourceEngines);
        localIdx = 0;
        FOR_EACH_IN_BITVECTOR(&engines, rmEngineType)
        {
            if (allocated == reqEngCount)
                break;

            // Skip engines that aren't allocatable or aren't in the shared pool already
            if (!bitVectorTest(pAllocatableEngines, rmEngineType) ||
                !bitVectorTest(pSharedEngines, rmEngineType))
            {
                localIdx++;
                continue;
            }

            // Assign the engine
            bitVectorSet(pOutEngines, engTypeRange.lo + localIdx);

            localIdx++;
            allocated++;
        }
        FOR_EACH_IN_BITVECTOR_END();
    }

    // Allocate the rest from the free pool
    bitVectorClrAll(&engines);
    bitVectorSetRange(&engines, engTypeRange);
    bitVectorAnd(&engines, &engines, pSourceEngines);
    localIdx = 0;
    FOR_EACH_IN_BITVECTOR(&engines, rmEngineType)
    {
        if (allocated == reqEngCount)
            break;

        // Skip non-allocatable or in-use engines
        if (!bitVectorTest(pAllocatableEngines, rmEngineType) ||
            bitVectorTest(pSharedEngines, rmEngineType) ||
            bitVectorTest(pExclusiveEngines, rmEngineType))
        {
            localIdx++;
            continue;
        }

        // Add the engine to the appropriate in-use pool
        bitVectorSet((bShared ? pSharedEngines : pExclusiveEngines), rmEngineType);

        // Assign the engine
        bitVectorSet(pOutEngines, engTypeRange.lo + localIdx);

        localIdx++;
        allocated++;
    }
    FOR_EACH_IN_BITVECTOR_END();

    NV_CHECK_OR_RETURN(LEVEL_SILENT, allocated == reqEngCount, NV_ERR_INSUFFICIENT_RESOURCES);
    return NV_OK;
}
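
//
// Usage sketch (hypothetical vectors): requesting two exclusively-owned CEs
// for a new compute instance. On success, pOutEngines holds local indices
// relative to engTypeRange.lo, while the exclusive tracker gains the chosen
// physical engine IDs.
//
//   NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
//       kmigmgrAllocateInstanceEngines(&sourceEngines,     // GI's engines
//                                      NV_FALSE,           // exclusive
//                                      RM_ENGINE_RANGE_COPY(),
//                                      2,                  // two CEs
//                                      &localEngines,
//                                      &exclusiveEngines,
//                                      &sharedEngines,
//                                      &allocatableEngines));
//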

/*!
 * @brief Convert global/physical engine mask to logical/local (no-hole) mask
 *
 * @param[in] pPhysicalEngineMask   Bitvector storing physical mask
 * @param[in] pLocalEngineMask      Bitvector storing local mask
 */
void
kmigmgrGetLocalEngineMask_IMPL
(
    ENGTYPE_BIT_VECTOR *pPhysicalEngineMask,
    ENGTYPE_BIT_VECTOR *pLocalEngineMask
)
{
    NV_RANGE range;
    NvU32 count;
    bitVectorClrAll(pLocalEngineMask);

    count = kmigmgrCountEnginesOfType(pPhysicalEngineMask, RM_ENGINE_TYPE_GR(0));
    if (count > 0)
    {
        range = rangeMake(RM_ENGINE_TYPE_GR(0), RM_ENGINE_TYPE_GR(count - 1));
        bitVectorSetRange(pLocalEngineMask, range);
    }

    count = kmigmgrCountEnginesOfType(pPhysicalEngineMask, RM_ENGINE_TYPE_COPY(0));
    if (count > 0)
    {
        range = rangeMake(RM_ENGINE_TYPE_COPY(0), RM_ENGINE_TYPE_COPY(count - 1));
        bitVectorSetRange(pLocalEngineMask, range);
    }

    count = kmigmgrCountEnginesOfType(pPhysicalEngineMask, RM_ENGINE_TYPE_NVDEC(0));
    if (count > 0)
    {
        range = rangeMake(RM_ENGINE_TYPE_NVDEC(0), RM_ENGINE_TYPE_NVDEC(count - 1));
        bitVectorSetRange(pLocalEngineMask, range);
    }

    count = kmigmgrCountEnginesOfType(pPhysicalEngineMask, RM_ENGINE_TYPE_NVENC(0));
    if (count > 0)
    {
        range = rangeMake(RM_ENGINE_TYPE_NVENC(0), RM_ENGINE_TYPE_NVENC(count - 1));
        bitVectorSetRange(pLocalEngineMask, range);
    }

    count = kmigmgrCountEnginesOfType(pPhysicalEngineMask, RM_ENGINE_TYPE_NVJPEG(0));
    if (count > 0)
    {
        range = rangeMake(RM_ENGINE_TYPE_NVJPEG(0), RM_ENGINE_TYPE_NVJPEG(count - 1));
        bitVectorSetRange(pLocalEngineMask, range);
    }

    count = kmigmgrCountEnginesOfType(pPhysicalEngineMask, RM_ENGINE_TYPE_OFA(0));
    if (count > 0)
    {
        range = rangeMake(RM_ENGINE_TYPE_OFA(0), RM_ENGINE_TYPE_OFA(count - 1));
        bitVectorSetRange(pLocalEngineMask, range);
    }
}
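
//
// Worked example: a physical mask of {GR2, GR5, COPY3, COPY4} compacts to the
// local mask {GR0, GR1, COPY0, COPY1}; each engine type becomes a zero-based,
// hole-free range whose length equals the physical count of that type.
//
//   ENGTYPE_BIT_VECTOR physical, local;
//   bitVectorClrAll(&physical);
//   bitVectorSet(&physical, RM_ENGINE_TYPE_GR(2));
//   bitVectorSet(&physical, RM_ENGINE_TYPE_GR(5));
//   bitVectorSet(&physical, RM_ENGINE_TYPE_COPY(3));
//   bitVectorSet(&physical, RM_ENGINE_TYPE_COPY(4));
//   kmigmgrGetLocalEngineMask(&physical, &local);
//   // local now contains GR(0), GR(1), COPY(0), COPY(1)
//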

/*!
 * @brief Create client and subdevice handles to make calls into this gpu instance
 */
NV_STATUS
kmigmgrAllocGPUInstanceHandles_IMPL
(
    OBJGPU *pGpu,
    NvU32 swizzId,
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
)
{
    RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    NvHandle hSubscription = NV01_NULL_OBJECT;
    NvHandle hClient;
    NvHandle hDevice;
    NvHandle hSubdevice;
    NVC637_ALLOCATION_PARAMETERS params;

    NV_ASSERT_OK_OR_RETURN(
        rmapiutilAllocClientAndDeviceHandles(pRmApi, pGpu, &hClient, &hDevice, &hSubdevice));

    portMemSet(&params, 0, sizeof(params));
    params.swizzId = swizzId;
    NV_ASSERT_OK_OR_RETURN(
        pRmApi->Alloc(pRmApi, hClient, hSubdevice, &hSubscription, AMPERE_SMC_PARTITION_REF, &params, sizeof(params)));

    pKernelMIGGpuInstance->instanceHandles.hClient = hClient;
    pKernelMIGGpuInstance->instanceHandles.hDevice = hDevice;
    pKernelMIGGpuInstance->instanceHandles.hSubdevice = hSubdevice;
    pKernelMIGGpuInstance->instanceHandles.hSubscription = hSubscription;

    return NV_OK;
}

/*!
 * @brief Delete created gpu instance handles if they exist
 */
void
kmigmgrFreeGPUInstanceHandles_IMPL
(
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
)
{
    if (pKernelMIGGpuInstance->instanceHandles.hClient != NV01_NULL_OBJECT)
    {
        RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);

        pRmApi->Free(pRmApi, pKernelMIGGpuInstance->instanceHandles.hClient, pKernelMIGGpuInstance->instanceHandles.hClient);
        pKernelMIGGpuInstance->instanceHandles.hClient = NV01_NULL_OBJECT;
        pKernelMIGGpuInstance->instanceHandles.hDevice = NV01_NULL_OBJECT;
        pKernelMIGGpuInstance->instanceHandles.hSubdevice = NV01_NULL_OBJECT;
        pKernelMIGGpuInstance->instanceHandles.hSubscription = NV01_NULL_OBJECT;
    }
}

/*!
 * @brief Checks if all references to gpu instance are internal
 */
NvBool
kmigmgrIsGPUInstanceReadyToBeDestroyed_IMPL
(
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
)
{
    NvS32 targetRefCount;
    NvS32 actualRefCount;

    NV_CHECK_OR_RETURN(LEVEL_SILENT, pKernelMIGGpuInstance->pShare != NULL, NV_TRUE);

    //
    // Initial refCount is increased to "1" when gpu instance is created and then
    // every subscription by a client should increase the refcount
    //
    targetRefCount = 1;

    // A client handle is allocated to support internal GR Routing
    if (pKernelMIGGpuInstance->instanceHandles.hClient != NV01_NULL_OBJECT)
        targetRefCount++;

    //
    // GPU instance scrubber is initialized during gpu instance creation and deleted
    // when gpu instance is invalidated, and subscribes to the gpu instance, so must
    // be accounted for in the target ref count
    //
    if (pKernelMIGGpuInstance->bMemoryPartitionScrubberInitialized)
        targetRefCount++;

    actualRefCount = serverGetShareRefCount(&g_resServ, pKernelMIGGpuInstance->pShare);
    if (actualRefCount > targetRefCount)
        return NV_FALSE;

    // Mismatch here indicates programming error
    NV_ASSERT(actualRefCount == targetRefCount);
    return NV_TRUE;
}
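
//
// Example of the arithmetic above: a GPU instance with an internal client
// handle and an initialized scrubber has a target refcount of 3 (creation +
// handles + scrubber). An actual refcount of 4 therefore means one external
// subscription is still outstanding, so destruction is refused.
//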

NV_STATUS
kmigmgrConstructEngine_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    ENGDESCRIPTOR engDesc
)
{
    NvU32 GIIdx;
    KERNEL_MIG_MANAGER_PRIVATE_DATA *pPrivate;

    pKernelMIGManager->bMIGEnabled = NV_FALSE;
    pKernelMIGManager->swizzIdInUseMask = 0x0;

    pPrivate = portMemAllocNonPaged(sizeof(*pPrivate));
    NV_CHECK_OR_RETURN(LEVEL_ERROR, pPrivate != NULL, NV_ERR_NO_MEMORY);
    portMemSet(pPrivate, 0, sizeof(*pPrivate));
    pKernelMIGManager->pPrivate = pPrivate;

    for (GIIdx = 0; GIIdx < NV_ARRAY_ELEMENTS(pKernelMIGManager->kernelMIGGpuInstance); ++GIIdx)
    {
        kmigmgrInitGPUInstanceInfo(pGpu, pKernelMIGManager,
                                   &pKernelMIGManager->kernelMIGGpuInstance[GIIdx]);
    }

    kmigmgrInitRegistryOverrides(pGpu, pKernelMIGManager);

    return NV_OK;
}

void
kmigmgrDestruct_IMPL
(
    KernelMIGManager *pKernelMIGManager
)
{
    NvU32 GIIdx;
    NvU32 CIIdx;

    portMemFree(pKernelMIGManager->pPrivate->staticInfo.pProfiles);
    pKernelMIGManager->pPrivate->staticInfo.pProfiles = NULL;
    portMemFree(pKernelMIGManager->pPrivate->staticInfo.pSwizzIdFbMemPageRanges);
    pKernelMIGManager->pPrivate->staticInfo.pSwizzIdFbMemPageRanges = NULL;
    portMemFree(pKernelMIGManager->pPrivate->staticInfo.pCIProfiles);
    pKernelMIGManager->pPrivate->staticInfo.pCIProfiles = NULL;
    portMemFree(pKernelMIGManager->pPrivate->staticInfo.pSkylineInfo);
    pKernelMIGManager->pPrivate->staticInfo.pSkylineInfo = NULL;

    portMemFree(pKernelMIGManager->pPrivate);
    pKernelMIGManager->pPrivate = NULL;

    for (GIIdx = 0; GIIdx < NV_ARRAY_ELEMENTS(pKernelMIGManager->kernelMIGGpuInstance); ++GIIdx)
    {
        KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance = &pKernelMIGManager->kernelMIGGpuInstance[GIIdx];

        // Shouldn't have any valid GPU instance
        if (pKernelMIGGpuInstance->bValid)
        {
            NV_PRINTF(LEVEL_ERROR,
                      "Deleting valid GPU instance with swizzId - %d. Should have been deleted before shutdown!\n",
                      pKernelMIGGpuInstance->swizzId);
        }

        for (CIIdx = 0;
             CIIdx < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance);
             ++CIIdx)
        {
            MIG_COMPUTE_INSTANCE *pMIGComputeInstance = &pKernelMIGGpuInstance->MIGComputeInstance[CIIdx];

            // Shouldn't have any valid compute instance
            if (pMIGComputeInstance->bValid)
            {
                NV_PRINTF(LEVEL_ERROR,
                          "Deleting valid compute instance - %d. Should have been deleted before shutdown!\n",
                          CIIdx);
            }
        }
    }
}

/*!
 * @brief Handle KMIGMGR init which must occur after GPU post load.
 *
 * @param[in] pGpu
 * @param[in] pUnusedData  Unused callback data
 */
static NV_STATUS
_kmigmgrHandlePostSchedulingEnableCallback
(
    OBJGPU *pGpu,
    void *pUnusedData
)
{
    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
    KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);

    if (!IS_VIRTUAL(pGpu))
    {
        NvBool bTopLevelScrubberEnabled = NV_FALSE;
        NvBool bTopLevelScrubberConstructed = NV_FALSE;

        memmgrGetTopLevelScrubberStatus(pGpu, pMemoryManager,
            &bTopLevelScrubberEnabled, &bTopLevelScrubberConstructed);

        //
        // This callback is handled as part of the same routine that triggers
        // scrubber initialization. Unfortunately this callback depends on the
        // scrubber being initialized first, and we cannot enforce that the scrubber
        // callback always goes first. However, the trigger routine does support a
        // retry mechanism that will allow us to get called back after all of the
        // other callbacks in the list are completed. We signal for retry by
        // returning NV_WARN_MORE_PROCESSING_REQUIRED if the scrubber is enabled but
        // hasn't been initialized yet. The warning will be quashed on the first
        // attempt, but will then be reported and trigger initialization failure if
        // it happens again on the retry.
        //
        // Bug 2997744: the check is skipped here because top-level scrubber creation
        // is delayed until GPU instances are created in a MIG-enabled guest.
        //
        NV_CHECK_OR_RETURN(LEVEL_SILENT,
                           !bTopLevelScrubberEnabled || bTopLevelScrubberConstructed,
                           NV_WARN_MORE_PROCESSING_REQUIRED);
    }

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        memmgrSetPartitionableMem_HAL(pGpu, pMemoryManager));

    if (IS_GSP_CLIENT(pGpu) && !IS_VIRTUAL(pGpu))
    {
        RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
        NV2080_CTRL_INTERNAL_GPU_GET_SMC_MODE_PARAMS params;

        portMemSet(&params, 0x0, sizeof(params));
        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
            pRmApi->Control(pRmApi,
                            pGpu->hInternalClient,
                            pGpu->hInternalSubdevice,
                            NV2080_CTRL_CMD_INTERNAL_GPU_GET_SMC_MODE,
                            &params,
                            sizeof(params)));

        if (params.smcMode == NV2080_CTRL_GPU_INFO_GPU_SMC_MODE_UNSUPPORTED)
        {
            pGpu->setProperty(pGpu, PDB_PROP_GPU_MIG_SUPPORTED, NV_FALSE);
        }
    }

    if ((pKernelMIGManager == NULL) || !kmigmgrIsMIGSupported(pGpu, pKernelMIGManager))
    {
        NV_PRINTF(LEVEL_INFO, "MIG not supported on this GPU.\n");
        return NV_OK;
    }

    if (!IS_MIG_ENABLED(pGpu) && !IS_VIRTUAL(pGpu) &&
        pGpu->getProperty(pGpu, PDB_PROP_GPU_RESETLESS_MIG_SUPPORTED) &&
        (gpumgrIsSystemMIGEnabled(gpuGetDBDF(pGpu)) || pKernelMIGManager->bMIGAutoOnlineEnabled || pKernelMIGManager->bBootConfigSupported))
    {
        RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
        NV2080_CTRL_GPU_SET_PARTITIONING_MODE_PARAMS params;

        portMemSet(&params, 0x0, sizeof(params));
        params.partitioningMode = NV2080_CTRL_GPU_SET_PARTITIONING_MODE_REPARTITIONING_FAST_RECONFIG;
        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
            pRmApi->Control(pRmApi,
                            pGpu->hInternalClient,
                            pGpu->hInternalSubdevice,
                            NV2080_CTRL_CMD_INTERNAL_MIGMGR_SET_PARTITIONING_MODE,
                            &params,
                            sizeof(params)));

        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
            kmigmgrSetPartitioningMode(pGpu, pKernelMIGManager));
    }

    gpumgrCacheSetMIGEnabled(pGpu, pKernelMIGManager->bMIGEnabled);

    // Populate static info collection even if MIG is not enabled.
    if ((!pGpu->getProperty(pGpu, PDB_PROP_GPU_BROKEN_FB) && IS_SILICON(pGpu)) ||
        (IS_VIRTUAL(pGpu) && IS_MIG_ENABLED(pGpu)))
    {
        // Initialize static info derived from physical RM
        NV_ASSERT_OK_OR_RETURN(kmigmgrLoadStaticInfo_HAL(pGpu, pKernelMIGManager));

        //
        // Populate static GPU instance memory config which will be used to manage
        // GPU instance memory
        //
        KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
        NV_ASSERT_OK_OR_RETURN(kmemsysPopulateMIGGPUInstanceMemConfig_HAL(pGpu, pKernelMemorySystem));

        // KERNEL_ONLY variants require static info to detect reduced configs
        kmigmgrDetectReducedConfig_HAL(pGpu, pKernelMIGManager);
    }

    NV_ASSERT_OK_OR_RETURN(kmigmgrRestoreFromPersistence_HAL(pGpu, pKernelMIGManager));

    return NV_OK;
}

static NV_STATUS
_kmigmgrHandlePreSchedulingDisableCallback
(
    OBJGPU *pGpu,
    void *pUnusedData
)
{
    NvU32 GIIdx;
    NvU32 CIIdx;
    NV_STATUS rmStatus = NV_OK;
    NvBool bDisable = NV_FALSE;
    KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);

    for (GIIdx = 0; GIIdx < NV_ARRAY_ELEMENTS(pKernelMIGManager->kernelMIGGpuInstance); ++GIIdx)
    {
        if (pKernelMIGManager->kernelMIGGpuInstance[GIIdx].bValid)
        {
            kmigmgrDestroyGPUInstanceScrubber(pGpu, pKernelMIGManager, &pKernelMIGManager->kernelMIGGpuInstance[GIIdx]);
        }
    }

    if (IS_VIRTUAL(pGpu) && kmigmgrUseLegacyVgpuPolicy(pGpu, pKernelMIGManager))
        return NV_OK;

    //
    // Update persistent instance topology so that we can recreate it on next
    // GPU attach.
    //
    NV_ASSERT_OK(kmigmgrSaveToPersistence(pGpu, pKernelMIGManager));

    if (!IS_VIRTUAL(pGpu) && !IS_GSP_CLIENT(pGpu))
        return NV_OK;

    for (GIIdx = 0; GIIdx < NV_ARRAY_ELEMENTS(pKernelMIGManager->kernelMIGGpuInstance); ++GIIdx)
    {
        KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance = &pKernelMIGManager->kernelMIGGpuInstance[GIIdx];
        NvU32 swizzId;

        // Skip invalid gpu instances
        if (!pKernelMIGGpuInstance->bValid)
            continue;

        swizzId = pKernelMIGGpuInstance->swizzId;

        // Shouldn't be any valid gpu instances
        NV_PRINTF(LEVEL_ERROR,
                  "Invalidating valid gpu instance with swizzId = %d\n",
                  swizzId);

        for (CIIdx = 0;
             CIIdx < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance);
             ++CIIdx)
        {
            MIG_COMPUTE_INSTANCE *pMIGComputeInstance =
                &pKernelMIGGpuInstance->MIGComputeInstance[CIIdx];

            // Skip invalid compute instances
            if (!pMIGComputeInstance->bValid)
                continue;

            // Shouldn't be any valid compute instances
            NV_PRINTF(LEVEL_ERROR,
                      "Invalidating valid compute instance with id = %d\n",
                      CIIdx);

            NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
                kmigmgrDeleteComputeInstance(pGpu, pKernelMIGManager, pKernelMIGGpuInstance, CIIdx, NV_TRUE));

            if (IS_GSP_CLIENT(pGpu))
            {
                RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
                NVC637_CTRL_EXEC_PARTITIONS_DELETE_PARAMS params;

                portMemSet(&params, 0, sizeof(params));
                params.execPartCount = 1;
                params.execPartId[0] = CIIdx;

                NV_ASSERT_OK(
                    pRmApi->Control(pRmApi,
                                    pKernelMIGGpuInstance->instanceHandles.hClient,
                                    pKernelMIGGpuInstance->instanceHandles.hSubscription,
                                    NVC637_CTRL_CMD_EXEC_PARTITIONS_DELETE,
                                    &params,
                                    sizeof(params)));
            }
        }

        NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
            kmigmgrInvalidateGPUInstance(pGpu, pKernelMIGManager, swizzId, NV_TRUE));

        if (IS_GSP_CLIENT(pGpu))
        {
            RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
            NV2080_CTRL_GPU_SET_PARTITIONS_PARAMS params;

            portMemSet(&params, 0, sizeof(params));
            params.partitionCount = 1;
            params.partitionInfo[0].bValid = NV_FALSE;
            params.partitionInfo[0].swizzId = swizzId;

            NV_ASSERT_OK(
                pRmApi->Control(pRmApi,
                                pGpu->hInternalClient,
                                pGpu->hInternalSubdevice,
                                NV2080_CTRL_CMD_INTERNAL_MIGMGR_SET_GPU_INSTANCES,
                                &params,
                                sizeof(params)));
        }

        // There was an active gpu instance, we need to disable MIG later
        bDisable = NV_TRUE;
    }

    // Disable MIG
    if (pKernelMIGManager->swizzIdInUseMask != 0x0)
    {
        NV_ASSERT(0);
        NV_PRINTF(LEVEL_ERROR, "leaked swizzid mask 0x%llx !!\n", pKernelMIGManager->swizzIdInUseMask);
    }

    if (bDisable)
    {
        NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
            kmigmgrSetMIGState(pGpu, pKernelMIGManager, NV_TRUE, NV_FALSE, NV_TRUE));
    }

    return NV_OK;
}

NV_STATUS
kmigmgrStateInitLocked_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    //
    // Configure MIG Mode based on devinit's determination of MIG enable
    // preconditions being met or not. Devinit will set SW_SCRATCH bit if MIG
    // mode was requested and was able to be supported / enabled.
    //
    if (kmigmgrIsDevinitMIGBitSet_HAL(pGpu, pKernelMIGManager))
        pKernelMIGManager->bMIGEnabled = NV_TRUE;

    NV_CHECK_OR_RETURN(LEVEL_SILENT, kmigmgrIsMIGSupported(pGpu, pKernelMIGManager), NV_OK);

    if (pGpu->getProperty(pGpu, PDB_PROP_GPU_RESETLESS_MIG_SUPPORTED))
    {
        NvU32 data32;
        if (NV_OK == osReadRegistryDword(pGpu, NV_REG_STR_RM_SET_MIG_AUTO_ONLINE_MODE, &data32))
        {
            if (NV_REG_STR_RM_SET_MIG_AUTO_ONLINE_MODE_ENABLED == data32)
            {
                pKernelMIGManager->bMIGAutoOnlineEnabled = NV_TRUE;
            }
        }
    }

    // Setup a callback to initialize state at the very end of GPU post load
    NV_ASSERT_OK(
        kfifoAddSchedulingHandler(pGpu, GPU_GET_KERNEL_FIFO(pGpu),
                                  _kmigmgrHandlePostSchedulingEnableCallback, NULL,
                                  _kmigmgrHandlePreSchedulingDisableCallback, NULL));

    return NV_OK;
}

/*! State unload */
NV_STATUS
kmigmgrStateUnload_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    NvU32 flags
)
{
    kmigmgrClearStaticInfo_HAL(pGpu, pKernelMIGManager);

    // Nothing to do if MIG is not supported
    NV_CHECK_OR_RETURN(LEVEL_SILENT, kmigmgrIsMIGSupported(pGpu, pKernelMIGManager), NV_OK);

    kfifoRemoveSchedulingHandler(pGpu, GPU_GET_KERNEL_FIFO(pGpu),
                                 _kmigmgrHandlePostSchedulingEnableCallback, NULL,
                                 _kmigmgrHandlePreSchedulingDisableCallback, NULL);

    return NV_OK;
}

/*! Init registry overrides */
void
kmigmgrInitRegistryOverrides_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    NvU32 data32;

    //
    // Try reading the boot config feature flags regkey from NvGlobal regkeys first.
    // If the NvGlobal regkey is not found, try reading from per-GPU regkeys.
    //
    if (osGetNvGlobalRegistryDword(pGpu, NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_FEATURE_FLAGS, &data32) == NV_OK)
    {
        pKernelMIGManager->bGlobalBootConfigUsed = NV_TRUE;
    }
    else if (osReadRegistryDword(pGpu, NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_FEATURE_FLAGS, &data32) == NV_OK)
    {
        // Do nothing
    }
    else
    {
        data32 = DRF_DEF(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_FEATURE_FLAGS, _SUPPORTED, _DEFAULT) |
                 DRF_DEF(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_FEATURE_FLAGS, _AUTO_UPDATE, _DEFAULT);
    }

    pKernelMIGManager->bBootConfigSupported = FLD_TEST_DRF(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_FEATURE_FLAGS, _SUPPORTED, _TRUE, data32);
    pKernelMIGManager->bAutoUpdateBootConfig = FLD_TEST_DRF(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_FEATURE_FLAGS, _AUTO_UPDATE, _TRUE, data32);
}
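
//
// Sketch of the regkey layout consumed above (assuming _TRUE values are
// defined for both fields alongside _DEFAULT): a value with both fields set
// to _TRUE enables boot-config support and automatic updates.
//
//   data32 = DRF_DEF(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_FEATURE_FLAGS, _SUPPORTED,   _TRUE) |
//            DRF_DEF(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_FEATURE_FLAGS, _AUTO_UPDATE, _TRUE);
//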

/**
 * @brief Retrieve data block for GPU instance at given slot
 */
KERNEL_MIG_GPU_INSTANCE *
kmigmgrGetMIGGpuInstanceSlot_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    NvU32 i
)
{
    NV_ASSERT_OR_RETURN(i < NV_ARRAY_ELEMENTS(pKernelMIGManager->kernelMIGGpuInstance), NULL);
    return &pKernelMIGManager->kernelMIGGpuInstance[i];
}

/**
 * @brief Returns true if MIG is supported.
 *        Note that MIG is not supported on platforms that support ATS over NVLink.
 */
NvBool
kmigmgrIsMIGSupported_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    return pGpu->getProperty(pGpu, PDB_PROP_GPU_MIG_SUPPORTED);
}

/*!
 * @brief Determines whether MIG is enabled on a supported system
 */
NvBool
kmigmgrIsMIGEnabled_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    return kmigmgrIsMIGSupported(pGpu, pKernelMIGManager) && pKernelMIGManager->bMIGEnabled;
}

/*!
 * @brief Determines if MIG GPU instancing is enabled
 */
NvBool
kmigmgrIsMIGGpuInstancingEnabled_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    return (IS_MIG_ENABLED(pGpu) &&
            (pKernelMIGManager->swizzIdInUseMask != 0));
}

/*!
 * @brief Determines if MIG memory partitioning is enabled
 */
NvBool
kmigmgrIsMIGMemPartitioningEnabled_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    NvU32 swizzId;

    if (!IS_MIG_IN_USE(pGpu))
    {
        return NV_FALSE;
    }

    FOR_EACH_INDEX_IN_MASK(64, swizzId, pKernelMIGManager->swizzIdInUseMask)
    {
        if (kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, swizzId))
        {
            return NV_TRUE;
        }
    }
    FOR_EACH_INDEX_IN_MASK_END;

    return NV_FALSE;
}

/*!
 * @brief Determines if NvLink and P2P are compatible with MIG
 */
NvBool
kmigmgrIsMIGNvlinkP2PSupported_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    //
    // No need to make decision based on any override if MIG is not supported/enabled
    // on a specific chip
    //
    if (!IS_MIG_ENABLED(pGpu))
    {
        return NV_TRUE;
    }

    // MIG+NVLINK not supported by default
    return NV_FALSE;
}

/*! Retrieve immutable static data */
const KERNEL_MIG_MANAGER_STATIC_INFO *
kmigmgrGetStaticInfo_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    KERNEL_MIG_MANAGER_PRIVATE_DATA *pPrivate = (KERNEL_MIG_MANAGER_PRIVATE_DATA *)pKernelMIGManager->pPrivate;
    return ((pPrivate != NULL) && pPrivate->bInitialized) ? &pPrivate->staticInfo : NULL;
}

/*! Initialize static information sourced from VGPU static info */
NV_STATUS
kmigmgrLoadStaticInfo_VF
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    NV_STATUS status = NV_OK;
    KERNEL_MIG_MANAGER_PRIVATE_DATA *pPrivate = (KERNEL_MIG_MANAGER_PRIVATE_DATA *)pKernelMIGManager->pPrivate;
    VGPU_STATIC_INFO *pVSI = GPU_GET_STATIC_INFO(pGpu);

    NV_ASSERT_OR_RETURN(pPrivate != NULL, NV_ERR_INVALID_STATE);
    NV_ASSERT_OR_RETURN(pVSI != NULL, NV_ERR_INVALID_STATE);

    if (pPrivate->bInitialized)
        return NV_OK;

    pPrivate->staticInfo.pProfiles = portMemAllocNonPaged(sizeof(*pPrivate->staticInfo.pProfiles));
    NV_CHECK_OR_ELSE(LEVEL_ERROR,
        pPrivate->staticInfo.pProfiles != NULL,
        status = NV_ERR_NO_MEMORY;
        goto failed;);
    portMemSet(pPrivate->staticInfo.pProfiles, 0x0, sizeof(*pPrivate->staticInfo.pProfiles));

    // In VGPU only one profile is visible describing all resources available
    {
        NV2080_CTRL_INTERNAL_MIGMGR_PROFILE_INFO *pPartitionDesc = &pPrivate->staticInfo.pProfiles->table[0];

        pPartitionDesc->partitionFlag   = (DRF_DEF(2080_CTRL_GPU, _PARTITION_FLAG, _MEMORY_SIZE, _FULL) |
                                           DRF_DEF(2080_CTRL_GPU, _PARTITION_FLAG, _COMPUTE_SIZE, _FULL));
        pPartitionDesc->grCount         = pVSI->gpuPartitionInfo.grEngCount;
        pPartitionDesc->gpcCount        = pVSI->gpuPartitionInfo.gpcCount;
        pPartitionDesc->virtualGpcCount = pVSI->gpuPartitionInfo.virtualGpcCount;
        pPartitionDesc->gfxGpcCount     = pVSI->gpuPartitionInfo.gfxGpcCount;
        pPartitionDesc->veidCount       = pVSI->gpuPartitionInfo.veidCount;
        pPartitionDesc->smCount         = pVSI->gpuPartitionInfo.smCount;
        pPartitionDesc->ceCount         = pVSI->gpuPartitionInfo.ceCount;
        pPartitionDesc->nvEncCount      = pVSI->gpuPartitionInfo.nvEncCount;
        pPartitionDesc->nvDecCount      = pVSI->gpuPartitionInfo.nvDecCount;
        pPartitionDesc->nvJpgCount      = pVSI->gpuPartitionInfo.nvJpgCount;
        pPartitionDesc->nvOfaCount      = pVSI->gpuPartitionInfo.nvOfaCount;
        pPartitionDesc->validCTSIdMask  = pVSI->gpuPartitionInfo.validCTSIdMask;
        pPrivate->staticInfo.pProfiles->count = 1;
    }

    bitVectorClrAll(&pPrivate->staticInfo.partitionableEngines);

    // Use the engine info list to populate the partitionable engines in guest
    {
        KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
        const NvU32 numEngines = kfifoGetNumEschedDrivenEngines(pKernelFifo);
        NvU32 engine;

        NV_ASSERT_OK(gpuUpdateEngineTable(pGpu));
        for (engine = 0; engine < numEngines; ++engine)
        {
            RM_ENGINE_TYPE rmEngineType;

            NV_ASSERT_OK_OR_RETURN(
                kfifoEngineInfoXlate_HAL(pGpu, pKernelFifo,
                                         ENGINE_INFO_TYPE_INVALID, engine,
                                         ENGINE_INFO_TYPE_RM_ENGINE_TYPE, (NvU32 *)&rmEngineType));

            // Skip invalid engine type values
            if (!RM_ENGINE_TYPE_IS_VALID(rmEngineType))
            {
                NV_ASSERT(0);
                continue;
            }

            // Skip engines which are not partitionable
            if (!kmigmgrIsEnginePartitionable(pGpu, pKernelMIGManager, rmEngineType))
                continue;

            bitVectorSet(&pPrivate->staticInfo.partitionableEngines, rmEngineType);
        }
    }

    pPrivate->staticInfo.pCIProfiles = portMemAllocNonPaged(sizeof(*pPrivate->staticInfo.pCIProfiles));
    NV_CHECK_OR_ELSE(LEVEL_ERROR,
        pPrivate->staticInfo.pCIProfiles != NULL,
        status = NV_ERR_NO_MEMORY;
        goto failed;);
    portMemSet(pPrivate->staticInfo.pCIProfiles, 0x0, sizeof(*pPrivate->staticInfo.pCIProfiles));

    {
        NvU32 entryCount = 0;
        NvU32 i;

        NV_ASSERT(pVSI->ciProfiles.profileCount <= NV_ARRAY_ELEMENTS(pPrivate->staticInfo.pCIProfiles->profiles));

        for (i = 0; i < pVSI->ciProfiles.profileCount; i++)
        {
            NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE *pCIProfile;

            // Filter any profiles which would not have fit on the VGPU's instance anyway
            if (pVSI->ciProfiles.profiles[i].smCount > pVSI->gpuPartitionInfo.smCount)
                continue;

            pCIProfile = &pPrivate->staticInfo.pCIProfiles->profiles[entryCount];
            pCIProfile->gfxGpcCount   = pVSI->ciProfiles.profiles[i].gfxGpcCount;
            pCIProfile->computeSize   = pVSI->ciProfiles.profiles[i].computeSize;
            pCIProfile->gpcCount      = pVSI->ciProfiles.profiles[i].gpcCount;
            pCIProfile->physicalSlots = pVSI->ciProfiles.profiles[i].gpcCount;
            pCIProfile->veidCount     = pVSI->ciProfiles.profiles[i].veidCount;
            pCIProfile->smCount       = pVSI->ciProfiles.profiles[i].smCount;

            entryCount++;
        }
        pPrivate->staticInfo.pCIProfiles->profileCount = entryCount;
    }

    // Not used by VGPU
    pPrivate->staticInfo.pSwizzIdFbMemPageRanges = NULL;

    // Support to be added for skylines on vgpu as part of bug 3424046
    pPrivate->staticInfo.pSkylineInfo = NULL;

    // Publish static data
    pPrivate->bInitialized = NV_TRUE;

    // Load fake static info for VGPU
    kmigmgrSetStaticInfo_HAL(pGpu, pKernelMIGManager);

    return NV_OK;

failed:
    portMemFree(pPrivate->staticInfo.pProfiles);
    pPrivate->staticInfo.pProfiles = NULL;
    portMemFree(pPrivate->staticInfo.pSwizzIdFbMemPageRanges);
    pPrivate->staticInfo.pSwizzIdFbMemPageRanges = NULL;
    portMemFree(pPrivate->staticInfo.pCIProfiles);
    pPrivate->staticInfo.pCIProfiles = NULL;

    return status;
}

/*! Initialize static information queried from Physical RM */
NV_STATUS
kmigmgrLoadStaticInfo_KERNEL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    KERNEL_MIG_MANAGER_PRIVATE_DATA *pPrivate = (KERNEL_MIG_MANAGER_PRIVATE_DATA *)pKernelMIGManager->pPrivate;
    RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
    NV_STATUS status;
    NV2080_CTRL_INTERNAL_STATIC_MIGMGR_GET_PARTITIONABLE_ENGINES_PARAMS params = {0};
    ENGTYPE_BIT_VECTOR partitionableNv2080Engines;
    NvU32 nv2080EngineType;

    NV_ASSERT_OR_RETURN(pPrivate != NULL, NV_ERR_INVALID_STATE);

    if (pPrivate->bInitialized)
        return NV_OK;

    //
    // HACK
    // Some of the static data implementations depend on other static data. We
    // must publish early to make the data accessible as it becomes available.
    //
    pPrivate->bInitialized = NV_TRUE;

    bitVectorClrAll(&pPrivate->staticInfo.partitionableEngines);

    if (IS_GSP_CLIENT(pGpu))
    {
        NV_CHECK(LEVEL_ERROR, kmigmgrEnableAllLCEs(pGpu, pKernelMIGManager, NV_TRUE) == NV_OK);
    }

    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
        pRmApi->Control(pRmApi,
                        pGpu->hInternalClient,
                        pGpu->hInternalSubdevice,
                        NV2080_CTRL_CMD_INTERNAL_STATIC_KMIGMGR_GET_PARTITIONABLE_ENGINES,
                        &params,
                        sizeof(params)),
        failed);

    //
    // Copy over the engineMask and save it in the staticInfo for later use.
    // In staticInfo, we use RMEngineTypes, so convert the nv2080 types before saving.
    //
    bitVectorFromRaw(&partitionableNv2080Engines,
                     params.engineMask,
                     sizeof(params.engineMask));
    FOR_EACH_IN_BITVECTOR(&partitionableNv2080Engines, nv2080EngineType)
    {
        bitVectorSet(&pPrivate->staticInfo.partitionableEngines,
                     gpuGetRmEngineType(nv2080EngineType));
    }
    FOR_EACH_IN_BITVECTOR_END();

    pPrivate->staticInfo.pSkylineInfo = portMemAllocNonPaged(sizeof(*pPrivate->staticInfo.pSkylineInfo));
    NV_CHECK_OR_ELSE(LEVEL_ERROR,
        pPrivate->staticInfo.pSkylineInfo != NULL,
        status = NV_ERR_NO_MEMORY;
        goto failed;);
    portMemSet(pPrivate->staticInfo.pSkylineInfo, 0x0, sizeof(*pPrivate->staticInfo.pSkylineInfo));

    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
        pRmApi->Control(pRmApi,
                        pGpu->hInternalClient,
                        pGpu->hInternalSubdevice,
                        NV2080_CTRL_CMD_INTERNAL_STATIC_GRMGR_GET_SKYLINE_INFO,
                        pPrivate->staticInfo.pSkylineInfo,
                        sizeof(*pPrivate->staticInfo.pSkylineInfo)),
        failed);

    pPrivate->staticInfo.pCIProfiles = portMemAllocNonPaged(sizeof(*pPrivate->staticInfo.pCIProfiles));
    NV_CHECK_OR_ELSE(LEVEL_ERROR,
        pPrivate->staticInfo.pCIProfiles != NULL,
        status = NV_ERR_NO_MEMORY;
        goto failed;);
    portMemSet(pPrivate->staticInfo.pCIProfiles, 0x0, sizeof(*pPrivate->staticInfo.pCIProfiles));

    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
        pRmApi->Control(pRmApi,
                        pGpu->hInternalClient,
                        pGpu->hInternalSubdevice,
                        NV2080_CTRL_CMD_INTERNAL_STATIC_KMIGMGR_GET_COMPUTE_PROFILES,
                        pPrivate->staticInfo.pCIProfiles,
                        sizeof(*pPrivate->staticInfo.pCIProfiles)),
        failed);

    pPrivate->staticInfo.pProfiles = portMemAllocNonPaged(sizeof(*pPrivate->staticInfo.pProfiles));
    NV_CHECK_OR_ELSE(LEVEL_ERROR,
        pPrivate->staticInfo.pProfiles != NULL,
        status = NV_ERR_NO_MEMORY;
        goto failed;);
    portMemSet(pPrivate->staticInfo.pProfiles, 0x0, sizeof(*pPrivate->staticInfo.pProfiles));

    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
        pRmApi->Control(pRmApi,
                        pGpu->hInternalClient,
                        pGpu->hInternalSubdevice,
                        NV2080_CTRL_CMD_INTERNAL_STATIC_KMIGMGR_GET_PROFILES,
                        pPrivate->staticInfo.pProfiles,
                        sizeof(*pPrivate->staticInfo.pProfiles)),
        failed);

    if (IS_GSP_CLIENT(pGpu))
    {
        NV_CHECK(LEVEL_ERROR, kmigmgrEnableAllLCEs(pGpu, pKernelMIGManager, NV_FALSE) == NV_OK);
    }

    pPrivate->staticInfo.pSwizzIdFbMemPageRanges = portMemAllocNonPaged(sizeof(*pPrivate->staticInfo.pSwizzIdFbMemPageRanges));
    NV_CHECK_OR_ELSE(LEVEL_ERROR,
        pPrivate->staticInfo.pSwizzIdFbMemPageRanges != NULL,
        status = NV_ERR_NO_MEMORY;
        goto failed;);
    portMemSet(pPrivate->staticInfo.pSwizzIdFbMemPageRanges, 0x0, sizeof(*pPrivate->staticInfo.pSwizzIdFbMemPageRanges));

    status = pRmApi->Control(pRmApi,
                             pGpu->hInternalClient,
                             pGpu->hInternalSubdevice,
                             NV2080_CTRL_CMD_INTERNAL_STATIC_KMIGMGR_GET_SWIZZ_ID_FB_MEM_PAGE_RANGES,
                             pPrivate->staticInfo.pSwizzIdFbMemPageRanges,
                             sizeof(*pPrivate->staticInfo.pSwizzIdFbMemPageRanges));

    if (status == NV_ERR_NOT_SUPPORTED)
    {
        // Only supported on specific GPUs
        status = NV_OK;
        portMemFree(pPrivate->staticInfo.pSwizzIdFbMemPageRanges);
        pPrivate->staticInfo.pSwizzIdFbMemPageRanges = NULL;
    }
    else if (status != NV_OK)
    {
        NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, status, failed);
    }

    return status;

failed:
    portMemFree(pPrivate->staticInfo.pProfiles);
    pPrivate->staticInfo.pProfiles = NULL;
    portMemFree(pPrivate->staticInfo.pSwizzIdFbMemPageRanges);
    pPrivate->staticInfo.pSwizzIdFbMemPageRanges = NULL;
    portMemFree(pPrivate->staticInfo.pCIProfiles);
    pPrivate->staticInfo.pCIProfiles = NULL;
    portMemFree(pPrivate->staticInfo.pSkylineInfo);
    pPrivate->staticInfo.pSkylineInfo = NULL;

    pPrivate->bInitialized = NV_FALSE;

    return status;
}

/*!
 * @brief Clears static information set for vGPU
 */
void
kmigmgrClearStaticInfo_VF
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    NvU32 i;

    // Nothing to do
    if (!kmigmgrUseLegacyVgpuPolicy(pGpu, pKernelMIGManager))
        return;

    for (i = 0; i < KMIGMGR_MAX_GPU_INSTANCES; ++i)
    {
        if (pKernelMIGManager->kernelMIGGpuInstance[i].pShare != NULL)
        {
            serverFreeShare(&g_resServ, pKernelMIGManager->kernelMIGGpuInstance[i].pShare);
            pKernelMIGManager->kernelMIGGpuInstance[i].pShare = NULL;
        }

        kmigmgrInitGPUInstanceInfo(pGpu, pKernelMIGManager, &pKernelMIGManager->kernelMIGGpuInstance[i]);
    }
}
1474
1475 /*!
1476 * @brief Save MIG topology from VGPU static info to persistence, if available.
1477 */
1478 NV_STATUS
kmigmgrSaveToPersistenceFromVgpuStaticInfo_VF(OBJGPU * pGpu,KernelMIGManager * pKernelMIGManager)1479 kmigmgrSaveToPersistenceFromVgpuStaticInfo_VF
1480 (
1481 OBJGPU *pGpu,
1482 KernelMIGManager *pKernelMIGManager
1483 )
1484 {
1485 VGPU_STATIC_INFO *pVSI = GPU_GET_STATIC_INFO(pGpu);
1486 GPUMGR_SAVE_MIG_INSTANCE_TOPOLOGY *pTopologySave;
1487 GPUMGR_SAVE_GPU_INSTANCE *pGPUInstanceSave;
1488 ENGTYPE_BIT_VECTOR engines;
1489 NvBool bTopologyValid;
1490 NvU32 GIIdx;
1491 NvU32 CIIdx;
1492 NvU32 savedCIIdx;
1493 NvU32 assignableGrMask;
1494 NvU32 i;
1495
1496 NV_ASSERT_OR_RETURN(pVSI != NULL, NV_ERR_INVALID_ARGUMENT);
1497
1498 NV_CHECK_OR_RETURN(LEVEL_SILENT,
1499 gpumgrGetSystemMIGInstanceTopo(gpuGetDBDF(pGpu), &pTopologySave),
1500 NV_OK);
1501
1502 // check VSI to see whether we've restored from it before.
1503 NV_CHECK_OR_RETURN(LEVEL_SILENT, !pTopologySave->bVgpuRestoredFromStaticInfo, NV_OK);
1504
1505 // Check to see whether there is anything already saved
1506 for (GIIdx = 0; GIIdx < NV_ARRAY_ELEMENTS(pTopologySave->saveGI); ++GIIdx)
1507 {
1508 pGPUInstanceSave = &pTopologySave->saveGI[GIIdx];
1509 if (pGPUInstanceSave->bValid)
1510 break;
1511 }
1512
1513 bTopologyValid = (GIIdx < NV_ARRAY_ELEMENTS(pTopologySave->saveGI));
1514 NV_CHECK_OR_RETURN(LEVEL_SILENT, !bTopologyValid, NV_OK);
1515
1516 // There can only be one saved GPU instance in VGPU
1517 pGPUInstanceSave = &pTopologySave->saveGI[0];
1518
1519 pGPUInstanceSave->bValid = NV_TRUE;
1520 pGPUInstanceSave->swizzId = 0;
1521 pGPUInstanceSave->giInfo.partitionFlags = (DRF_DEF(2080_CTRL_GPU, _PARTITION_FLAG, _MEMORY_SIZE, _FULL) |
1522 DRF_DEF(2080_CTRL_GPU, _PARTITION_FLAG, _COMPUTE_SIZE, _FULL) );
1523
1524 gpumgrCacheCreateGpuInstance(pGpu, pGPUInstanceSave->swizzId);
1525
1526 bitVectorClrAll(&engines);
1527
1528 if (pVSI->gpuPartitionInfo.grEngCount > 0)
1529 bitVectorSetRange(&engines,
1530 rangeMake(RM_ENGINE_TYPE_GR(0),
1531 RM_ENGINE_TYPE_GR(pVSI->gpuPartitionInfo.grEngCount - 1)));
1532
1533 if (pVSI->gpuPartitionInfo.ceCount > 0)
1534 bitVectorSetRange(&engines,
1535 rangeMake(RM_ENGINE_TYPE_COPY(0),
1536 RM_ENGINE_TYPE_COPY(pVSI->gpuPartitionInfo.ceCount - 1)));
1537
1538 if (pVSI->gpuPartitionInfo.nvDecCount > 0)
1539 bitVectorSetRange(&engines,
1540 rangeMake(RM_ENGINE_TYPE_NVDEC(0),
1541 RM_ENGINE_TYPE_NVDEC(pVSI->gpuPartitionInfo.nvDecCount - 1)));
1542
1543 if (pVSI->gpuPartitionInfo.nvEncCount > 0)
1544 bitVectorSetRange(&engines,
1545 rangeMake(RM_ENGINE_TYPE_NVENC(0),
1546 RM_ENGINE_TYPE_NVENC(pVSI->gpuPartitionInfo.nvEncCount - 1)));
1547
1548 if (pVSI->gpuPartitionInfo.nvJpgCount > 0)
1549 bitVectorSetRange(&engines,
1550 rangeMake(RM_ENGINE_TYPE_NVJPEG(0),
1551 RM_ENGINE_TYPE_NVJPEG(pVSI->gpuPartitionInfo.nvJpgCount - 1)));
1552
1553 if (pVSI->gpuPartitionInfo.nvOfaCount > 0)
1554 bitVectorSetRange(&engines,
1555 rangeMake(RM_ENGINE_TYPE_OFA(0),
1556 RM_ENGINE_TYPE_OFA(pVSI->gpuPartitionInfo.nvOfaCount - 1)));
1557
1558 bitVectorToRaw(&engines,
1559 pGPUInstanceSave->giInfo.enginesMask,
1560 sizeof(pGPUInstanceSave->giInfo.enginesMask));
1561
1562 // Create a mask of GR IDs to later use in restoring
1563 assignableGrMask = 0x0;
1564 for (i = 0; i < pVSI->gpuPartitionInfo.grEngCount; i++)
1565 if (pVSI->gpuPartitionInfo.gpcsPerGr[i] != 0)
1566 assignableGrMask |= NVBIT32(i);
1567
1568 NV_ASSERT_OR_RETURN(nvPopCount32(assignableGrMask) <= pVSI->execPartitionInfo.execPartCount, NV_ERR_INSUFFICIENT_RESOURCES);
1569
1570 pGPUInstanceSave->giInfo.veidOffset = 0;
1571 pGPUInstanceSave->giInfo.veidCount = pVSI->gpuPartitionInfo.veidCount;
1572 pGPUInstanceSave->giInfo.gpcMask = DRF_MASK(pVSI->gpuPartitionInfo.gpcCount - 1 : 0);
1573 pGPUInstanceSave->giInfo.virtualGpcCount = pVSI->gpuPartitionInfo.virtualGpcCount;
1574
1575 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
1576 osRmCapRegisterSmcPartition(pGpu->pOsRmCaps, &pGPUInstanceSave->pOsRmCaps, pGPUInstanceSave->swizzId));
1577
1578 savedCIIdx = 0;
1579 for (CIIdx = 0; CIIdx < pVSI->execPartitionInfo.execPartCount; ++CIIdx)
1580 {
1581 GPUMGR_SAVE_COMPUTE_INSTANCE *pComputeInstanceSave = &pGPUInstanceSave->saveCI[savedCIIdx];
1582 NVC637_CTRL_EXEC_PARTITIONS_INFO *pExecPartInfo = &pVSI->execPartitionInfo.execPartInfo[CIIdx];
1583 NvU32 grIdx = portUtilCountTrailingZeros32(assignableGrMask);
1584 ENGTYPE_BIT_VECTOR engines;
1585
1586 NV_CHECK_OR_RETURN(LEVEL_ERROR, grIdx < RM_ENGINE_TYPE_GR_SIZE, NV_ERR_INVALID_STATE);
1587
1588 pComputeInstanceSave->bValid = NV_TRUE;
1589 pComputeInstanceSave->ciInfo.sharedEngFlags = pExecPartInfo->sharedEngFlag;
1590 pComputeInstanceSave->id = pVSI->execPartitionInfo.execPartId[CIIdx];
1591
1592 //
1593 // This association is not strictly enforced when allocating compute instances
1594 // however, the ordering RM expects here is that the local GR index per GI
1595 // matches with the execPartId that it was created to. Simply perform a non-fatal
1596 // check for logging purposes.
1597 //
1598 NV_CHECK(LEVEL_WARNING, pComputeInstanceSave->id == grIdx);

        bitVectorClrAll(&engines);
        bitVectorSetRange(&engines,
                          rangeMake(RM_ENGINE_TYPE_GR(grIdx),
                                    RM_ENGINE_TYPE_GR(grIdx)));
        assignableGrMask &= ~(NVBIT32(grIdx));

        if (pExecPartInfo->ceCount > 0)
            bitVectorSetRange(&engines,
                              rangeMake(RM_ENGINE_TYPE_COPY(0),
                                        RM_ENGINE_TYPE_COPY(pExecPartInfo->ceCount - 1)));

        if (pExecPartInfo->nvDecCount > 0)
            bitVectorSetRange(&engines,
                              rangeMake(RM_ENGINE_TYPE_NVDEC(0),
                                        RM_ENGINE_TYPE_NVDEC(pExecPartInfo->nvDecCount - 1)));

        if (pExecPartInfo->nvEncCount > 0)
            bitVectorSetRange(&engines,
                              rangeMake(RM_ENGINE_TYPE_NVENC(0),
                                        RM_ENGINE_TYPE_NVENC(pExecPartInfo->nvEncCount - 1)));

        if (pExecPartInfo->nvJpgCount > 0)
            bitVectorSetRange(&engines,
                              rangeMake(RM_ENGINE_TYPE_NVJPEG(0),
                                        RM_ENGINE_TYPE_NVJPEG(pExecPartInfo->nvJpgCount - 1)));

        if (pExecPartInfo->ofaCount > 0)
            bitVectorSetRange(&engines,
                              rangeMake(RM_ENGINE_TYPE_OFA(0),
                                        RM_ENGINE_TYPE_OFA(pExecPartInfo->ofaCount - 1)));

        bitVectorToRaw(&engines,
                       pComputeInstanceSave->ciInfo.enginesMask,
                       sizeof(pComputeInstanceSave->ciInfo.enginesMask));

        pComputeInstanceSave->ciInfo.gpcMask = DRF_MASK(pExecPartInfo->gpcCount - 1 : 0);
        pComputeInstanceSave->ciInfo.gfxGpcCount = pExecPartInfo->gfxGpcCount;
        pComputeInstanceSave->ciInfo.spanStart = pExecPartInfo->spanStart;
        pComputeInstanceSave->ciInfo.smCount = pExecPartInfo->smCount;
        pComputeInstanceSave->ciInfo.computeSize = pExecPartInfo->computeSize;
        pComputeInstanceSave->ciInfo.veidCount = pExecPartInfo->veidCount;
        pComputeInstanceSave->ciInfo.veidOffset = pExecPartInfo->veidStartOffset;

        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
            osRmCapRegisterSmcExecutionPartition(pGPUInstanceSave->pOsRmCaps,
                                                 &(pComputeInstanceSave->pOsRmCaps),
                                                 pComputeInstanceSave->id));

        ++savedCIIdx;
    }

    //
    // Make sure that we never try to restore from static info again, so that
    // the guest is free to change the topology itself.
    //
    pTopologySave->bVgpuRestoredFromStaticInfo = NV_TRUE;

    return NV_OK;
}

/*!
 * @brief Sets up static KernelMIGManager, KernelGraphicsManager, and partition
 *        info inside a vGPU
 */
NV_STATUS
kmigmgrSetStaticInfo_VF
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    const KERNEL_MIG_MANAGER_STATIC_INFO *pStaticInfo;
    VGPU_STATIC_INFO *pVSI;
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance;
    MIG_RESOURCE_ALLOCATION *pResourceAllocation;
    NvU32 veidOffset = 0;
    NvU32 grIdx;
    NV_RANGE memoryRange;
    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
    KernelGraphicsManager *pKernelGraphicsManager = GPU_GET_KERNEL_GRAPHICS_MANAGER(pGpu);

    pVSI = GPU_GET_STATIC_INFO(pGpu);
    NV_ASSERT_OR_RETURN(pVSI != NULL, NV_ERR_OBJECT_NOT_FOUND);

    // If MIG isn't enabled for this VM, nothing to do
    NV_CHECK_OR_RETURN(LEVEL_SILENT, IS_MIG_ENABLED(pGpu), NV_OK);

    pStaticInfo = kmigmgrGetStaticInfo(pGpu, pKernelMIGManager);
    NV_ASSERT_OR_RETURN(pStaticInfo != NULL, NV_ERR_INVALID_STATE);

    //
    // Fill required GPU instance info and create state needed in
    // KernelMIGManager/KernelGraphicsManager. For the legacy MIG vGPU policy,
    // there is only one GPU instance with no compute instances, so we
    // statically set up its resources from the plugin and assign swizzId 0 to
    // it inside the vGPU. For the production vGPU policy, the guest is
    // responsible for requesting GPU instance and compute instance creation,
    // so we initialize RM with the correct resource counts and let the rest
    // of the instancing APIs work as in host RM.
    //

    if (kmigmgrUseLegacyVgpuPolicy(pGpu, pKernelMIGManager))
    {
        //
        // In legacy flow, mark swizzId 0 as in use, as the GPU instance is
        // pre-populated
        //
        NV_ASSERT_OK_OR_RETURN(kmigmgrSetSwizzIdInUse(pGpu, pKernelMIGManager, 0));
    }
    else
    {
        //
        // In the production flow, copy any GPU instance info retrieved from
        // vGPU static info into persistent storage on first driver boot.
        //
        NV_ASSERT_OK(kmigmgrSaveToPersistenceFromVgpuStaticInfo_HAL(pGpu, pKernelMIGManager));
    }

    if (kmigmgrUseLegacyVgpuPolicy(pGpu, pKernelMIGManager))
    {
        // In legacy flow, engines are pre-configured. VEIDs are marked as in-use
        kgrmgrSetVeidInUseMask(pGpu, pKernelGraphicsManager,
                               DRF_MASK64(pVSI->gpuPartitionInfo.veidCount - 1 : 0));
        NV_PRINTF(LEVEL_INFO, "VF VEID in use mask: 0x%llX\n",
                  kgrmgrGetVeidInUseMask(pGpu, pKernelGraphicsManager));
    }

    NV_ASSERT_OK_OR_RETURN(memmgrDiscoverMIGPartitionableMemoryRange_HAL(pGpu, pMemoryManager, &memoryRange));

    memmgrSetMIGPartitionableMemoryRange(pGpu, pMemoryManager, memoryRange);

    if (kmigmgrUseLegacyVgpuPolicy(pGpu, pKernelMIGManager))
    {
        GPUMGR_SAVE_GPU_INSTANCE save = { 0 };
        KMIGMGR_CREATE_GPU_INSTANCE_PARAMS params =
        {
            .type = KMIGMGR_CREATE_GPU_INSTANCE_PARAMS_TYPE_RESTORE,
            .inst.restore.pGPUInstanceSave = &save
        };
        NvU32 grCount;
        NvUuid uuid;

        save.bValid = NV_TRUE;
        save.swizzId = 0;
        save.giInfo.partitionFlags = (DRF_DEF(2080_CTRL_GPU, _PARTITION_FLAG, _MEMORY_SIZE, _FULL) |
                                      DRF_DEF(2080_CTRL_GPU, _PARTITION_FLAG, _COMPUTE_SIZE, _FULL));

        bitVectorToRaw(&pStaticInfo->partitionableEngines,
                       save.giInfo.enginesMask,
                       sizeof(save.giInfo.enginesMask));

        save.giInfo.veidOffset = 0;
        save.giInfo.veidCount = pVSI->gpuPartitionInfo.veidCount;
        save.giInfo.gpcMask = DRF_MASK(pVSI->gpuPartitionInfo.gpcCount - 1 : 0);

        NV_ASSERT_OK_OR_RETURN(kmigmgrGenerateGPUInstanceUuid_HAL(pGpu, pKernelMIGManager, 0, &uuid));

        // Create static GPU instance for legacy vGPU flow
        NV_ASSERT_OK_OR_RETURN(kmigmgrSetGPUInstanceInfo(pGpu, pKernelMIGManager, 0/*SwizzID-0*/, uuid.uuid, params));
        NV_ASSERT_OK_OR_RETURN(kmigmgrGetGPUInstanceInfo(pGpu, pKernelMIGManager, 0, &pKernelMIGGpuInstance));

        gpumgrCacheCreateGpuInstance(pGpu, 0);

        pResourceAllocation = &pKernelMIGGpuInstance->resourceAllocation;
        pResourceAllocation->virtualGpcCount = pVSI->gpuPartitionInfo.virtualGpcCount;
        pResourceAllocation->gfxGpcCount = pVSI->gpuPartitionInfo.gfxGpcCount;

        grCount = kmigmgrCountEnginesOfType(&pResourceAllocation->engines, RM_ENGINE_TYPE_GR(0));
        for (grIdx = 0; grIdx < grCount; ++grIdx)
        {
            // Set VEID mask for grIdx
            kgrmgrSetGrIdxVeidMask(pGpu, pKernelGraphicsManager, grIdx,
                                   DRF_MASK64(pVSI->gpuPartitionInfo.veidsPerGr[grIdx] - 1 : 0) << veidOffset);
            veidOffset += pVSI->gpuPartitionInfo.veidsPerGr[grIdx];

            if (pVSI->gpuPartitionInfo.gpcsPerGr[grIdx] != 0)
            {
                KernelGraphics *pKernelGraphics = GPU_GET_KERNEL_GRAPHICS(pGpu, grIdx);

                kgraphicsInvalidateStaticInfo(pGpu, pKernelGraphics);
                NV_ASSERT_OK(kgraphicsLoadStaticInfo_HAL(pGpu, pKernelGraphics, 0));
            }
        }
    }

    return NV_OK;
}

/*!
 * @brief Disable RC Watchdog
 */
NV_STATUS
kmigmgrDisableWatchdog_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMigManager
)
{
    KernelRc *pKernelRc = GPU_GET_KERNEL_RC(pGpu);
    NvU32 wdFlags = pKernelRc->watchdog.flags;
    NvS32 enableRequestsRefcount;
    NvS32 disableRequestsRefcount;
    NvS32 softDisableRequestsRefcount;

    krcWatchdogGetReservationCounts(pKernelRc,
                                    &enableRequestsRefcount,
                                    &disableRequestsRefcount,
                                    &softDisableRequestsRefcount);

    //
    // If clients have made requests to the watchdog, we can't enable MIG
    // until those clients have gone away: they are disallowed from modifying
    // watchdog state while MIG is active, yet they must release their
    // refcount on exit.
    //
    if ((enableRequestsRefcount != 0) || (disableRequestsRefcount != 0) ||
        (softDisableRequestsRefcount != 0))
    {
        NV_PRINTF(LEVEL_ERROR,
                  "Failed to disable watchdog with outstanding reservations - enable: %d disable: %d softDisable: %d.\n",
                  enableRequestsRefcount,
                  disableRequestsRefcount,
                  softDisableRequestsRefcount);

        return NV_ERR_STATE_IN_USE;
    }

    NV_CHECK_OR_RETURN(LEVEL_SILENT, (wdFlags & WATCHDOG_FLAGS_INITIALIZED) != 0x0, NV_OK);

    pKernelMigManager->bRestoreWatchdog = NV_TRUE;
    pKernelMigManager->bReenableWatchdog = (wdFlags & WATCHDOG_FLAGS_DISABLED) == 0x0;

    return krcWatchdogShutdown(pGpu, pKernelRc);
}

/*!
 * @brief Enable RC Watchdog if it was enabled before the kmigmgrDisableWatchdog invocation
 */
NV_STATUS
kmigmgrRestoreWatchdog_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMigManager
)
{
    KernelRc *pKernelRc = GPU_GET_KERNEL_RC(pGpu);

    NV_CHECK_OR_RETURN(LEVEL_SILENT, pKernelMigManager->bRestoreWatchdog, NV_OK);

    if (pKernelMigManager->bReenableWatchdog)
    {
        krcWatchdogEnable(pKernelRc, NV_FALSE /* bOverRide */);
    }

    pKernelMigManager->bRestoreWatchdog = NV_FALSE;
    pKernelMigManager->bReenableWatchdog = NV_FALSE;

    return krcWatchdogInit_HAL(pGpu, pKernelRc);
}
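
//
// The two functions above are intended to bracket MIG enablement: the
// watchdog is shut down before MIG is enabled and re-armed once MIG is torn
// down. A minimal sketch of the expected call pattern (illustrative only;
// the surrounding MIG state-change code is elided):
//
//     // Fails with NV_ERR_STATE_IN_USE if clients still hold reservations
//     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
//         kmigmgrDisableWatchdog(pGpu, pKernelMIGManager));
//
//     // ... enable MIG, create GPU instances, etc. ...
//
//     // Re-enables the watchdog only if it was enabled before the disable
//     NV_ASSERT_OK(kmigmgrRestoreWatchdog(pGpu, pKernelMIGManager));
//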

/*!
 * @brief Function to set swizzId in use
 */
NV_STATUS
kmigmgrSetSwizzIdInUse_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    NvU32 swizzId
)
{
    // Validate that the same ID is not already set and then set the ID
    NvU64 mask = NVBIT64(swizzId);

    if (swizzId >= KMIGMGR_MAX_GPU_SWIZZID)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    if (mask & pKernelMIGManager->swizzIdInUseMask)
    {
        NV_PRINTF(LEVEL_ERROR, "SwizzID - %d already in use\n", swizzId);
        DBG_BREAKPOINT();
        return NV_ERR_STATE_IN_USE;
    }

    pKernelMIGManager->swizzIdInUseMask |= mask;

    return NV_OK;
}

/*!
 * @brief Function to mark swizzId free
 */
NV_STATUS
kmigmgrClearSwizzIdInUse_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    NvU32 swizzId
)
{
    // Validate that the ID is currently set and then clear it
    NvU64 mask = NVBIT64(swizzId);

    if (swizzId >= KMIGMGR_MAX_GPU_SWIZZID)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    if (!(mask & pKernelMIGManager->swizzIdInUseMask))
    {
        NV_PRINTF(LEVEL_ERROR, "SwizzID - %d not in use\n", swizzId);
        DBG_BREAKPOINT();
        return NV_ERR_INVALID_STATE;
    }

    pKernelMIGManager->swizzIdInUseMask &= ~mask;

    return NV_OK;
}

/*!
 * @brief Function to check whether a swizzId is in use
 */
NvBool
kmigmgrIsSwizzIdInUse_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    NvU32 swizzId
)
{
    NvU64 mask = NVBIT64(swizzId);

    if (mask & pKernelMIGManager->swizzIdInUseMask)
        return NV_TRUE;

    return NV_FALSE;
}

/*
 * @brief Return global swizzId mask
 */
NvU64
kmigmgrGetSwizzIdInUseMask_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    return pKernelMIGManager->swizzIdInUseMask;
}
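
//
// Swizz IDs are tracked as single bits in a 64-bit in-use mask, so
// allocate/free/query are all O(1) bit operations. A short sketch of how the
// helpers above compose (illustrative only; a hypothetical swizzId value is
// assumed):
//
//     NvU32 swizzId = 3;  // example ID, must be < KMIGMGR_MAX_GPU_SWIZZID
//
//     if (!kmigmgrIsSwizzIdInUse(pGpu, pKernelMIGManager, swizzId))
//     {
//         NV_ASSERT_OK(kmigmgrSetSwizzIdInUse(pGpu, pKernelMIGManager, swizzId));
//     }
//
//     // kmigmgrGetSwizzIdInUseMask() now has NVBIT64(3) set
//
//     NV_ASSERT_OK(kmigmgrClearSwizzIdInUse(pGpu, pKernelMIGManager, swizzId));
//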

/*!
 * @brief Marks the given engines as in use by some GPU instance
 */
NV_STATUS
kmigmgrSetEnginesInUse_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    ENGTYPE_BIT_VECTOR *pEngines
)
{
    ENGTYPE_BIT_VECTOR tempEngines;

    NV_ASSERT_OR_RETURN(pEngines != NULL, NV_ERR_INVALID_ARGUMENT);

    bitVectorAnd(&tempEngines, pEngines, &pKernelMIGManager->partitionableEnginesInUse);
    // Ensure no engine in the given mask is marked as in-use
    NV_ASSERT_OR_RETURN(bitVectorTestAllCleared(&tempEngines), NV_ERR_STATE_IN_USE);

    // partitionableEnginesInUse |= pEngines
    bitVectorOr(&pKernelMIGManager->partitionableEnginesInUse,
                &pKernelMIGManager->partitionableEnginesInUse,
                pEngines);
    return NV_OK;
}

/*!
 * @brief Marks the given engines as no longer in use by any GPU instance
 */
NV_STATUS
kmigmgrClearEnginesInUse_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    ENGTYPE_BIT_VECTOR *pEngines
)
{
    ENGTYPE_BIT_VECTOR tempEngines;

    NV_ASSERT_OR_RETURN(pEngines != NULL, NV_ERR_INVALID_ARGUMENT);

    bitVectorAnd(&tempEngines, pEngines, &pKernelMIGManager->partitionableEnginesInUse);
    // Ensure every engine in the given mask is marked as in-use
    NV_ASSERT_OR_RETURN(bitVectorTestEqual(&tempEngines, pEngines), NV_ERR_STATE_IN_USE);

    // partitionableEnginesInUse &= ~(pEngines)
    bitVectorComplement(&tempEngines, pEngines);
    bitVectorAnd(&pKernelMIGManager->partitionableEnginesInUse,
                 &pKernelMIGManager->partitionableEnginesInUse,
                 &tempEngines);
    return NV_OK;
}

/*!
 * @brief Checks whether the given engine is in use by any GPU instance
 */
NvBool
kmigmgrIsEngineInUse_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    RM_ENGINE_TYPE rmEngineType
)
{
    return bitVectorTest(&pKernelMIGManager->partitionableEnginesInUse, rmEngineType);
}

/*
 * @brief Determines whether RM_ENGINE_TYPE can be partitioned
 */
NvBool
kmigmgrIsEnginePartitionable_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    RM_ENGINE_TYPE rmEngineType
)
{
    return kmigmgrIsMIGSupported(pGpu, pKernelMIGManager) &&
           (RM_ENGINE_TYPE_IS_COPY(rmEngineType) ||
            RM_ENGINE_TYPE_IS_GR(rmEngineType) ||
            RM_ENGINE_TYPE_IS_NVDEC(rmEngineType) ||
            RM_ENGINE_TYPE_IS_NVENC(rmEngineType) ||
            RM_ENGINE_TYPE_IS_NVJPEG(rmEngineType) ||
            RM_ENGINE_TYPE_IS_OFA(rmEngineType));
}
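
//
// In-use tracking operates on whole engine bit vectors rather than single
// bits, which lets a GPU instance claim its entire engine set atomically. A
// minimal sketch of claiming one copy engine (illustrative only; real callers
// pass the bit vector built by kmigmgrSwizzIdToResourceAllocation):
//
//     ENGTYPE_BIT_VECTOR engines;
//
//     bitVectorClrAll(&engines);
//     bitVectorSet(&engines, RM_ENGINE_TYPE_COPY(0));
//
//     NV_ASSERT(kmigmgrIsEnginePartitionable(pGpu, pKernelMIGManager,
//                                            RM_ENGINE_TYPE_COPY(0)));
//
//     // Fails with NV_ERR_STATE_IN_USE if CE0 already belongs to an instance
//     NV_ASSERT_OK(kmigmgrSetEnginesInUse(pGpu, pKernelMIGManager, &engines));
//     NV_ASSERT(kmigmgrIsEngineInUse(pGpu, pKernelMIGManager, RM_ENGINE_TYPE_COPY(0)));
//
//     // Release on instance teardown
//     NV_ASSERT_OK(kmigmgrClearEnginesInUse(pGpu, pKernelMIGManager, &engines));
//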

/*!
 * @brief Function to determine whether a global RM_ENGINE_TYPE belongs to the
 *        given GPU/compute instance.
 *
 * @return NV_TRUE if this engine falls within the given instance. NV_FALSE
 *         otherwise. Non-partitioned engines fall within all instances.
 */
NvBool
kmigmgrIsEngineInInstance_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    RM_ENGINE_TYPE globalRmEngType,
    MIG_INSTANCE_REF ref
)
{
    return kmigmgrGetGlobalToLocalEngineType(pGpu, pKernelMIGManager, ref,
                                             globalRmEngType, NULL) == NV_OK;
}

/*!
 * @brief Function to determine whether a local RM_ENGINE_TYPE belongs to the
 *        given GPU/compute instance.
 *
 * @return NV_TRUE if this engine falls within the given instance. NV_FALSE
 *         otherwise. Non-partitioned engines fall within all instances.
 */
NvBool
kmigmgrIsLocalEngineInInstance_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    RM_ENGINE_TYPE localRmEngType,
    MIG_INSTANCE_REF ref
)
{
    ENGTYPE_BIT_VECTOR *pLocalEngines;

    if (!kmigmgrIsEnginePartitionable(pGpu, pKernelMIGManager, localRmEngType))
    {
        return NV_TRUE;
    }

    pLocalEngines = (ref.pMIGComputeInstance != NULL) ?
                    &ref.pMIGComputeInstance->resourceAllocation.localEngines :
                    &ref.pKernelMIGGpuInstance->resourceAllocation.localEngines;

    return bitVectorTest(pLocalEngines, localRmEngType);
}

/*!
 * @brief Trim runlist buffer pools
 */
void
kmigmgrTrimInstanceRunlistBufPools_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
)
{
    RM_ENGINE_TYPE rmEngineType;
    KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);

    if (!kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, pKernelMIGGpuInstance->swizzId))
        return;

    if (!ctxBufPoolIsSupported(pGpu))
        return;

    for (rmEngineType = 0; rmEngineType < RM_ENGINE_TYPE_LAST; rmEngineType++)
    {
        if (!RM_ENGINE_TYPE_IS_VALID(rmEngineType) ||
            !kmigmgrIsEnginePartitionable(pGpu, pKernelMIGManager, rmEngineType) ||
            !kmigmgrIsEngineInInstance(pGpu, pKernelMIGManager, rmEngineType, kmigmgrMakeGIReference(pKernelMIGGpuInstance)))
        {
            continue;
        }

        if (kfifoGetRunlistBufPool(pGpu, pKernelFifo, rmEngineType) != NULL)
        {
            ctxBufPoolTrim(kfifoGetRunlistBufPool(pGpu, pKernelFifo, rmEngineType));
        }
    }
}

//
// Creates runlist buffers for engines belonging to this GPU instance from non-partitionable memory and
// recreates these runlist buffers in the GPU instance's memory.
//
NV_STATUS
kmigmgrCreateGPUInstanceRunlists_FWCLIENT
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
)
{
    KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
    NvU32 index;
    NvU32 runlistId;
    RM_ENGINE_TYPE rmEngineType;
    NvU32 engDesc;
    NV_STATUS status = NV_OK;
    NvU32 numEngines = kfifoGetNumEschedDrivenEngines(pKernelFifo);
    NvU32 maxRunlists = kfifoGetMaxNumRunlists_HAL(pGpu, pKernelFifo);
    NvU64 runlistAlign;
    NvU64 allocFlags;
    NvU32 attr;
    NV_ADDRESS_SPACE aperture;
    RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
    NV2080_CTRL_INTERNAL_FIFO_PROMOTE_RUNLIST_BUFFERS_PARAMS *pParams;

    // TODO: Mem partitioning check should suffice here
    if (!kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, pKernelMIGGpuInstance->swizzId) ||
        !ctxBufPoolIsSupported(pGpu))
    {
        return NV_OK;
    }

    kfifoRunlistGetBufAllocParams(pGpu, &aperture, &attr, &allocFlags);
    allocFlags |= MEMDESC_FLAGS_OWNED_BY_CTX_BUF_POOL;

    for (index = 0; index < numEngines; index++)
    {
        NV_ASSERT_OK_OR_GOTO(status,
            kfifoEngineInfoXlate_HAL(pGpu, pKernelFifo,
                                     ENGINE_INFO_TYPE_INVALID, index,
                                     ENGINE_INFO_TYPE_RUNLIST, &runlistId),
            failed);

        if ((runlistId >= maxRunlists) || (runlistId >= NV_NBITS_IN_TYPE(pKernelMIGGpuInstance->runlistIdMask)))
        {
            status = NV_ERR_INVALID_STATE;
            goto failed;
        }

        // Some engines share runlists, so skip if we have already dealt with this runlist
        if ((pKernelMIGGpuInstance->runlistIdMask & NVBIT64(runlistId)) != 0x0)
        {
            continue;
        }

        NV_ASSERT_OK_OR_GOTO(status,
            kfifoEngineInfoXlate_HAL(pGpu, pKernelFifo,
                                     ENGINE_INFO_TYPE_RUNLIST, runlistId,
                                     ENGINE_INFO_TYPE_RM_ENGINE_TYPE, (NvU32 *)&rmEngineType),
            failed);

        NV_ASSERT_OK_OR_GOTO(status,
            kfifoEngineInfoXlate_HAL(pGpu, pKernelFifo,
                                     ENGINE_INFO_TYPE_RUNLIST, runlistId,
                                     ENGINE_INFO_TYPE_ENG_DESC, &engDesc),
            failed);

        // Check if this is a partitionable engine. Non-partitionable engine runlists can stay in RM reserved memory
        if (!kmigmgrIsEnginePartitionable(pGpu, pKernelMIGManager, rmEngineType))
        {
            continue;
        }

        // If a partitionable engine doesn't belong to this GPU instance, there is nothing to do
        if (!kmigmgrIsEngineInInstance(pGpu, pKernelMIGManager, rmEngineType, kmigmgrMakeGIReference(pKernelMIGGpuInstance)))
        {
            continue;
        }

        //
        // Sched is only managed by Physical RM.
        // If running on a GSP client, we will instead allocate the runlist buffers from the ctxbuf pool
        // and promote them to GSP later. GSP will skip the runlist buffer allocation during schedInit
        // and wait for the RPC to memdescDescribe the allocation from client RM.
        //
        // OBJSCHEDMGR is not valid in kernel RM. Allocate and store runlist buffers in OBJFIFO,
        // which will be sent to GSP to store in its schedmgr
        //
        NV_ASSERT_OK_OR_GOTO(status,
            kfifoRunlistAllocBuffers(pGpu, pKernelFifo,
                                     NV_TRUE,
                                     aperture,
                                     runlistId,
                                     attr,
                                     allocFlags,
                                     0,
                                     NV_TRUE,
                                     pKernelFifo->pppRunlistBufMemDesc[runlistId]),
            failed);

        // Add runlist to GPU instance
        pKernelMIGGpuInstance->runlistIdMask |= NVBIT64(runlistId);
    }

    runlistAlign = NVBIT64(kfifoRunlistGetBaseShift_HAL(pKernelFifo));

    pParams = portMemAllocNonPaged(sizeof(*pParams));
    NV_ASSERT_OR_ELSE(pParams != NULL,
                      status = NV_ERR_NO_MEMORY; goto failed; );

    ct_assert(sizeof(pParams->runlistIdMask) == sizeof(pKernelMIGGpuInstance->runlistIdMask));
    pParams->runlistIdMask = pKernelMIGGpuInstance->runlistIdMask;
    pParams->swizzId = pKernelMIGGpuInstance->swizzId;

    for (runlistId = 0; runlistId < maxRunlists; runlistId++)
    {
        if (pParams->runlistIdMask & NVBIT64(runlistId))
        {
            for (index = 0; index < NUM_BUFFERS_PER_RUNLIST; index++)
            {
                MEMORY_DESCRIPTOR *pSourceMemDesc = pKernelFifo->pppRunlistBufMemDesc[runlistId][index];

                pParams->rlBuffers[runlistId][index].base = (NvU64)memdescGetPhysAddr(pSourceMemDesc, AT_GPU, 0);
                pParams->rlBuffers[runlistId][index].size = pSourceMemDesc->ActualSize;
                pParams->rlBuffers[runlistId][index].alignment = runlistAlign;
                pParams->rlBuffers[runlistId][index].addressSpace = memdescGetAddressSpace(pSourceMemDesc);
                pParams->rlBuffers[runlistId][index].cpuCacheAttrib = attr;
            }
        }
    }

    status = pRmApi->Control(pRmApi,
                             pGpu->hInternalClient,
                             pGpu->hInternalSubdevice,
                             NV2080_CTRL_CMD_INTERNAL_FIFO_PROMOTE_RUNLIST_BUFFERS,
                             pParams,
                             sizeof(*pParams));

    portMemFree(pParams);

    NV_ASSERT_OK_OR_GOTO(status, status, failed);

    //
    // Trim out any additional memory after runlist buffers are allocated
    // from ctx buf pools
    //
    kmigmgrTrimInstanceRunlistBufPools(pGpu, pKernelMIGManager, pKernelMIGGpuInstance);

    return NV_OK;

failed:
    NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(status,
        kmigmgrDeleteGPUInstanceRunlists_HAL(pGpu, pKernelMIGManager, pKernelMIGGpuInstance));

    return status;
}

//
// Deletes runlist buffers for all partitionable engines from the GPU instance's memory and
// reallocates these runlist buffers in non-partitionable memory.
//
NV_STATUS
kmigmgrDeleteGPUInstanceRunlists_FWCLIENT
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
)
{
    KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
    NvU32 runlistId;
    NV_STATUS status = NV_OK;
    NvU32 bufIdx;
    MEMORY_DESCRIPTOR **ppRlBuffer;

    if (!kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, pKernelMIGGpuInstance->swizzId) ||
        !ctxBufPoolIsSupported(pGpu))
    {
        NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance->runlistIdMask == 0, NV_ERR_INVALID_STATE);
        return NV_OK;
    }

    FOR_EACH_INDEX_IN_MASK(64, runlistId, pKernelMIGGpuInstance->runlistIdMask)
    {
        for (bufIdx = 0; bufIdx < NUM_BUFFERS_PER_RUNLIST; bufIdx++)
        {
            ppRlBuffer = &(pKernelFifo->pppRunlistBufMemDesc[runlistId][bufIdx]);

            if (*ppRlBuffer != NULL)
            {
                memdescFree(*ppRlBuffer);
                memdescDestroy(*ppRlBuffer);
                *ppRlBuffer = NULL;
            }
        }

        // Remove runlist from GPU instance
        pKernelMIGGpuInstance->runlistIdMask &= ~(NVBIT64(runlistId));
    }
    FOR_EACH_INDEX_IN_MASK_END;

    return status;
}
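
//
// The create/delete pair above is expected to bracket the lifetime of a
// memory-partitioned GPU instance on a GSP client: runlist buffers move into
// instance-local memory on creation and back out on teardown. A rough sketch
// of the pairing through the HAL wrappers (illustrative only; instance
// create/invalidate code elided, and the create-side wrapper name is assumed
// to mirror the delete-side one used in the failure path above):
//
//     NV_ASSERT_OK_OR_GOTO(status,
//         kmigmgrCreateGPUInstanceRunlists_HAL(pGpu, pKernelMIGManager,
//                                              pKernelMIGGpuInstance),
//         cleanup);
//
//     // ... GPU instance is live; engines schedule from instance memory ...
//
//     NV_ASSERT_OK(kmigmgrDeleteGPUInstanceRunlists_HAL(pGpu, pKernelMIGManager,
//                                                       pKernelMIGGpuInstance));
//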

/*!
 * @brief Load MIG instance topology from persistence, if available.
 *        If MIG is disabled, this operation will be skipped with a warning.
 */
NV_STATUS
kmigmgrRestoreFromPersistence_PF
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    NV_STATUS status = NV_OK;
    RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
    GPUMGR_SAVE_MIG_INSTANCE_TOPOLOGY *pTopologySave = NULL;
    NV2080_CTRL_INTERNAL_KMIGMGR_IMPORT_EXPORT_GPU_INSTANCE_PARAMS *pPartImportParams = NULL;
    NVC637_CTRL_EXEC_PARTITIONS_IMPORT_EXPORT_PARAMS *pExecPartImportParams = NULL;
    NvU32 GIIdx;
    NvU32 CIIdx;
    NvBool bTopologyValid;
    NvHandle hClient = NV01_NULL_OBJECT;
    NvHandle hDevice = NV01_NULL_OBJECT;
    NvHandle hSubdevice = NV01_NULL_OBJECT;

    NV_CHECK_OR_RETURN(LEVEL_SILENT,
                       gpumgrGetSystemMIGInstanceTopo(gpuGetDBDF(pGpu), &pTopologySave),
                       NV_OK);

    // Check to see whether there was actually anything saved
    for (GIIdx = 0; GIIdx < NV_ARRAY_ELEMENTS(pTopologySave->saveGI); ++GIIdx)
    {
        GPUMGR_SAVE_GPU_INSTANCE *pGPUInstanceSave = &pTopologySave->saveGI[GIIdx];
        if (pGPUInstanceSave->bValid)
            break;
    }

    bTopologyValid = (GIIdx < NV_ARRAY_ELEMENTS(pTopologySave->saveGI));
    if (!bTopologyValid)
    {
        // The boot config is honored only if no topology was saved previously (e.g. on reboot)
        NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kmigmgrRestoreFromBootConfig_HAL(pGpu, pKernelMIGManager));
        return NV_OK;
    }

    if (!IS_MIG_ENABLED(pGpu))
    {
        NV_PRINTF(LEVEL_WARNING, "Skipping reinitialization of persistent MIG instances due to MIG disablement!\n");
        //
        // If we ended up here, we have inconsistent state: there are instances
        // to be restored, but MIG is disabled. This also means the /proc
        // filesystem is populated with nodes for the instances that we are
        // expected to restore but won't. Clean them up.
        //
        gpumgrUnregisterRmCapsForMIGGI(gpuGetDBDF(pGpu));
        return NV_OK;
    }

    NV_ASSERT_OK_OR_RETURN(
        rmapiutilAllocClientAndDeviceHandles(pRmApi, pGpu, &hClient, &hDevice, &hSubdevice));

    pPartImportParams = portMemAllocNonPaged(sizeof(*pPartImportParams));
    NV_CHECK_OR_ELSE(LEVEL_ERROR, pPartImportParams != NULL,
                     status = NV_ERR_NO_MEMORY;
                     goto cleanup; );
    pExecPartImportParams = portMemAllocNonPaged(sizeof(*pExecPartImportParams));
    NV_CHECK_OR_ELSE(LEVEL_ERROR, pExecPartImportParams != NULL,
                     status = NV_ERR_NO_MEMORY;
                     goto cleanup; );

    for (GIIdx = 0; GIIdx < NV_ARRAY_ELEMENTS(pTopologySave->saveGI); ++GIIdx)
    {
        GPUMGR_SAVE_GPU_INSTANCE *pGPUInstanceSave = &pTopologySave->saveGI[GIIdx];
        KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance;

        if (!pGPUInstanceSave->bValid)
            continue;

        portMemSet(pPartImportParams, 0, sizeof(*pPartImportParams));
        pPartImportParams->swizzId = pGPUInstanceSave->swizzId;
        portMemCopy(&pPartImportParams->info, sizeof(pPartImportParams->info),
                    &pGPUInstanceSave->giInfo, sizeof(pGPUInstanceSave->giInfo));

        NV_ASSERT_OK_OR_GOTO(status,
            pRmApi->Control(pRmApi,
                            hClient,
                            hSubdevice,
                            NV2080_CTRL_CMD_INTERNAL_KMIGMGR_IMPORT_GPU_INSTANCE,
                            pPartImportParams,
                            sizeof(*pPartImportParams)),
            cleanup);

        NV_ASSERT_OK_OR_GOTO(status,
            kmigmgrGetGPUInstanceInfo(pGpu, pKernelMIGManager, pGPUInstanceSave->swizzId, &pKernelMIGGpuInstance),
            cleanup);

        // Restore capability caps
        pKernelMIGGpuInstance->pOsRmCaps = pGPUInstanceSave->pOsRmCaps;

        for (CIIdx = 0; CIIdx < NV_ARRAY_ELEMENTS(pGPUInstanceSave->saveCI); ++CIIdx)
        {
            GPUMGR_SAVE_COMPUTE_INSTANCE *pComputeInstanceSave = &pGPUInstanceSave->saveCI[CIIdx];
            NvHandle hSubscription;
            NVC637_ALLOCATION_PARAMETERS alloc;

            if (!pComputeInstanceSave->bValid)
                continue;

            portMemSet(&alloc, 0, sizeof(alloc));
            alloc.swizzId = pGPUInstanceSave->swizzId;
            NV_ASSERT_OK_OR_GOTO(status,
                pRmApi->AllocWithSecInfo(pRmApi,
                                         hClient,
                                         hSubdevice,
                                         &hSubscription,
                                         AMPERE_SMC_PARTITION_REF,
                                         &alloc,
                                         sizeof(alloc),
                                         RMAPI_ALLOC_FLAGS_NONE,
                                         NULL,
                                         &pRmApi->defaultSecInfo),
                cleanup);

            portMemSet(pExecPartImportParams, 0, sizeof(*pExecPartImportParams));
            pExecPartImportParams->id = pComputeInstanceSave->id;
            pExecPartImportParams->bCreateCap = NV_FALSE;
            portMemCopy(&pExecPartImportParams->info, sizeof(pExecPartImportParams->info),
                        &pComputeInstanceSave->ciInfo, sizeof(pComputeInstanceSave->ciInfo));

            NV_ASSERT_OK_OR_GOTO(status,
                pRmApi->Control(pRmApi,
                                hClient,
                                hSubscription,
                                NVC637_CTRL_CMD_EXEC_PARTITIONS_IMPORT,
                                pExecPartImportParams,
                                sizeof(*pExecPartImportParams)),
                cleanup);

            // Restore capability caps
            pKernelMIGGpuInstance->MIGComputeInstance[pExecPartImportParams->id].pOsRmCaps = pComputeInstanceSave->pOsRmCaps;

            pRmApi->Free(pRmApi, hClient, hSubscription);
        }
    }

cleanup:
    rmapiutilFreeClientAndDeviceHandles(pRmApi, &hClient, &hDevice, &hSubdevice);
    portMemFree(pPartImportParams);
    portMemFree(pExecPartImportParams);

    //
    // Let stateUnload handle an error teardown case, since it has to be
    // coordinated between CPU/GSP
    //
    return status;
}

/*!
 * @brief Load MIG instance topology from persistence, if available.
 *        If MIG is disabled, this operation will be skipped with a warning.
 */
NV_STATUS
kmigmgrRestoreFromPersistence_VF
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    NV_STATUS status = NV_OK;
    GPUMGR_SAVE_MIG_INSTANCE_TOPOLOGY *pTopologySave = NULL;
    NvU32 GIIdx;
    NvU32 CIIdx;
    NvBool bTopologyValid;
    NvBool bMemoryPartitioningNeeded;
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGPUInstance;

    NV_CHECK_OR_RETURN(LEVEL_SILENT,
                       gpumgrGetSystemMIGInstanceTopo(gpuGetDBDF(pGpu), &pTopologySave),
                       NV_OK);

    // Check to see whether there was actually anything saved
    for (GIIdx = 0; GIIdx < NV_ARRAY_ELEMENTS(pTopologySave->saveGI); ++GIIdx)
    {
        GPUMGR_SAVE_GPU_INSTANCE *pGPUInstanceSave = &pTopologySave->saveGI[GIIdx];
        if (pGPUInstanceSave->bValid)
            break;
    }

    bTopologyValid = (GIIdx < NV_ARRAY_ELEMENTS(pTopologySave->saveGI));
    NV_CHECK_OR_RETURN(LEVEL_SILENT, bTopologyValid, NV_OK);

    if (!IS_MIG_ENABLED(pGpu))
    {
        NV_PRINTF(LEVEL_WARNING, "Skipping reinitialization of persistent MIG instances due to MIG disablement!\n");
        gpumgrUnregisterRmCapsForMIGGI(gpuGetDBDF(pGpu));
        return NV_OK;
    }

    bMemoryPartitioningNeeded = kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, pTopologySave->saveGI[0].swizzId);

    // Perform all initialization that must be done when MIG is first enabled
    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        kmigmgrSetMIGState(pGpu, pKernelMIGManager, bMemoryPartitioningNeeded, NV_TRUE, NV_FALSE));

    for (GIIdx = 0; GIIdx < NV_ARRAY_ELEMENTS(pTopologySave->saveGI); ++GIIdx)
    {
        KERNEL_MIG_GPU_INSTANCE *pKernelMIGGPUInstance;
        GPUMGR_SAVE_GPU_INSTANCE *pGPUInstanceSave = &pTopologySave->saveGI[GIIdx];
        KMIGMGR_CREATE_GPU_INSTANCE_PARAMS restore =
        {
            .type = KMIGMGR_CREATE_GPU_INSTANCE_PARAMS_TYPE_RESTORE,
            .inst.restore.pGPUInstanceSave = pGPUInstanceSave
        };
        NvU32 swizzId = pGPUInstanceSave->swizzId;
        NvUuid uuid;

        if (!pGPUInstanceSave->bValid)
            continue;

        NV_ASSERT_OK_OR_GOTO(status,
            kmigmgrGenerateGPUInstanceUuid_HAL(pGpu, pKernelMIGManager, swizzId, &uuid),
            fail);

        // Create a GPU instance using the saved data
        NV_CHECK_OK_OR_GOTO(status, LEVEL_WARNING,
            kmigmgrCreateGPUInstance(pGpu, pKernelMIGManager, swizzId, uuid.uuid, restore, NV_TRUE, NV_FALSE),
            fail);

        NV_ASSERT_OK_OR_GOTO(status,
            kmigmgrGetGPUInstanceInfo(pGpu, pKernelMIGManager, swizzId, &pKernelMIGGPUInstance),
            fail);

        // Restore capability caps
        pKernelMIGGPUInstance->pOsRmCaps = pGPUInstanceSave->pOsRmCaps;

        for (CIIdx = 0; CIIdx < NV_ARRAY_ELEMENTS(pGPUInstanceSave->saveCI); ++CIIdx)
        {
            GPUMGR_SAVE_COMPUTE_INSTANCE *pComputeInstanceSave = &pGPUInstanceSave->saveCI[CIIdx];
            KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS restore =
            {
                .type = KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS_TYPE_RESTORE,
                .inst.restore.pComputeInstanceSave = pComputeInstanceSave
            };
            //
            // This id variable doesn't actually need to be initialized, since
            // the callee does not read its value, but GCC 13 warns otherwise;
            // work around (WAR) the issue by initializing it.
            //
            NvU32 id = pComputeInstanceSave->id;

            if (!pComputeInstanceSave->bValid)
                continue;

            // Create a compute instance on this GPU instance using the saved data
            NV_CHECK_OK_OR_GOTO(status, LEVEL_WARNING,
                kmigmgrCreateComputeInstances_HAL(pGpu, pKernelMIGManager, pKernelMIGGPUInstance, NV_FALSE, restore, &id, NV_FALSE),
                fail);

            // Restore capability caps
            pKernelMIGGPUInstance->MIGComputeInstance[id].pOsRmCaps = pComputeInstanceSave->pOsRmCaps;
        }
    }

    return NV_OK;

fail:

    // Clean up anything we created and bail
    FOR_EACH_VALID_GPU_INSTANCE(pGpu, pKernelMIGManager, pKernelMIGGPUInstance)
    {
        for (CIIdx = 0; CIIdx < NV_ARRAY_ELEMENTS(pKernelMIGGPUInstance->MIGComputeInstance); ++CIIdx)
        {
            MIG_COMPUTE_INSTANCE *pMIGComputeInstance = &pKernelMIGGPUInstance->MIGComputeInstance[CIIdx];

            // Skip invalid compute instances
            if (!pMIGComputeInstance->bValid)
                continue;

            NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
                kmigmgrDeleteComputeInstance(pGpu, pKernelMIGManager, pKernelMIGGPUInstance, CIIdx, NV_TRUE));
        }

        NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
            kmigmgrInvalidateGPUInstance(pGpu, pKernelMIGManager, pKernelMIGGPUInstance->swizzId, NV_TRUE));
    }
    FOR_EACH_VALID_GPU_INSTANCE_END();

    NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
        kmigmgrSetMIGState(pGpu, pKernelMIGManager, bMemoryPartitioningNeeded, NV_FALSE, NV_FALSE));

    return status;
}

/*
 * @brief Initialize a MIG GPU instance
 */
void
kmigmgrInitGPUInstanceInfo_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
)
{
    NvU32 i;

    bitVectorClrAll(&pKernelMIGGpuInstance->exclusiveEngMask);
    bitVectorClrAll(&pKernelMIGGpuInstance->sharedEngMask);

    for (i = 0; i < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance); ++i)
    {
        NV_ASSERT(!pKernelMIGGpuInstance->MIGComputeInstance[i].bValid);
        pKernelMIGGpuInstance->MIGComputeInstance[i].pOsRmCaps = NULL;
        pKernelMIGGpuInstance->MIGComputeInstance[i].id = KMIGMGR_COMPUTE_INSTANCE_ID_INVALID;
    }

    pKernelMIGGpuInstance->swizzId = KMIGMGR_SWIZZID_INVALID;
    pKernelMIGGpuInstance->hMemory = NV01_NULL_OBJECT;
    pKernelMIGGpuInstance->pShare = NULL;
    pKernelMIGGpuInstance->pMemoryPartitionHeap = NULL;
    pKernelMIGGpuInstance->bValid = NV_FALSE;
    pKernelMIGGpuInstance->memRange = NV_RANGE_EMPTY;
    pKernelMIGGpuInstance->pMIGGpuInstance = NULL;
    pKernelMIGGpuInstance->pOsRmCaps = NULL;
    pKernelMIGGpuInstance->pProfile = NULL;

    portMemSet(&pKernelMIGGpuInstance->resourceAllocation, 0x0, sizeof(pKernelMIGGpuInstance->resourceAllocation));
}

/*!
 * @brief Checks the devinit-owned scratch bit to see whether MIG is enabled
 *
 * @return NV_TRUE if there is a valid GPU instance in VGPU static info
 */
NvBool
kmigmgrIsDevinitMIGBitSet_VF
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    VGPU_STATIC_INFO *pVSI = GPU_GET_STATIC_INFO(pGpu);
    NV_ASSERT_OR_RETURN(pVSI != NULL, NV_FALSE);

    return pVSI->gpuPartitionInfo.swizzId != KMIGMGR_SWIZZID_INVALID;
}

/*!
 * @brief Function to set device profiling in use
 */
NV_STATUS
kmigmgrSetDeviceProfilingInUse_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    NV_ASSERT_OR_RETURN(!kmigmgrIsDeviceProfilingInUse(pGpu, pKernelMIGManager),
                        NV_ERR_STATE_IN_USE);
    pKernelMIGManager->bDeviceProfilingInUse = NV_TRUE;
    return NV_OK;
}

/*!
 * @brief Function to clear device profiling in-use
 */
void
kmigmgrClearDeviceProfilingInUse_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    pKernelMIGManager->bDeviceProfilingInUse = NV_FALSE;
}

/*!
 * @brief Function to check if device profiling is in-use
 */
NvBool
kmigmgrIsDeviceProfilingInUse_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager
)
{
    return pKernelMIGManager->bDeviceProfilingInUse;
}
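
//
// Device profiling is an exclusive, device-wide mode, so it is tracked with a
// single boolean and claimed with a test-and-set style helper. A minimal
// usage sketch (illustrative only):
//
//     // Claim profiling; fails with NV_ERR_STATE_IN_USE if already claimed
//     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
//         kmigmgrSetDeviceProfilingInUse(pGpu, pKernelMIGManager));
//
//     // ... device-level profiling session runs here ...
//
//     kmigmgrClearDeviceProfilingInUse(pGpu, pKernelMIGManager);
//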

/*!
 * @brief Function to check if a specific device is subscribed to DeviceProfiling
 */
NvBool
kmigmgrIsDeviceUsingDeviceProfiling_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    Device *pDevice
)
{
    RsClient *pRsClient;
    GPUInstanceSubscription *pGPUInstanceSubscription;
    Subdevice *pSubdevice;
    NV_STATUS status;

    NV_CHECK_OR_RETURN(LEVEL_SILENT, IS_MIG_ENABLED(pGpu), NV_FALSE);

    if (!kmigmgrIsDeviceProfilingInUse(pGpu, pKernelMIGManager))
    {
        return NV_FALSE;
    }

    NV_ASSERT_OR_RETURN(pDevice != NULL, NV_FALSE);
    pRsClient = RES_GET_CLIENT(pDevice);

    NV_CHECK_OK_OR_ELSE(status, LEVEL_ERROR,
        subdeviceGetByInstance(pRsClient, RES_GET_HANDLE(pDevice), 0, &pSubdevice),
        return NV_FALSE; );

    NV_CHECK_OK_OR_ELSE(status, LEVEL_ERROR,
        gisubscriptionGetGPUInstanceSubscription(pRsClient, RES_GET_HANDLE(pSubdevice), &pGPUInstanceSubscription),
        return NV_FALSE; );

    return gisubscriptionIsDeviceProfiling(pGPUInstanceSubscription);
}

/*!
 * @brief Enable or disable all LCEs for use by GPU instances
 */
NV_STATUS
kmigmgrEnableAllLCEs_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    NvBool bEnableAllLCEs
)
{
    KernelCE *pKCe = NULL;

    //
    // AMODEL support of CEs is faked. No actual work needs to be done for
    // AMODEL here, so just return NV_OK early to avoid triggering assertions.
    //
    NV_CHECK_OR_RETURN(LEVEL_SILENT, !IsAMODEL(pGpu), NV_OK);

    NV_ASSERT_OK_OR_RETURN(kceFindFirstInstance(pGpu, &pKCe));

    if (bEnableAllLCEs)
        NV_ASSERT_OK_OR_RETURN(kceUpdateClassDB_HAL(pGpu, pKCe));
    else
        NV_ASSERT_OK_OR_RETURN(kceTopLevelPceLceMappingsUpdate(pGpu, pKCe));

    return NV_OK;
}

/*!
 * @brief Retrieves instance(s) associated with a device, if applicable
 */
NV_STATUS
kmigmgrGetInstanceRefFromDevice_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    Device *pDevice,
    MIG_INSTANCE_REF *pRef
)
{
    NV_STATUS status = NV_OK;
    RsClient *pRsClient;
    GPUInstanceSubscription *pGPUInstanceSubscription;
    ComputeInstanceSubscription *pComputeInstanceSubscription = NULL;
    Subdevice *pSubdevice;
    MIG_INSTANCE_REF ref;

    NV_ASSERT_OR_RETURN(pRef != NULL, NV_ERR_INVALID_ARGUMENT);
    *pRef = kmigmgrMakeNoMIGReference();

    if (!IS_MIG_IN_USE(pGpu))
    {
        return NV_ERR_INVALID_STATE;
    }

    NV_ASSERT_OR_RETURN(pDevice != NULL, NV_ERR_INVALID_ARGUMENT);
    pRsClient = RES_GET_CLIENT(pDevice);

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        subdeviceGetByInstance(pRsClient, RES_GET_HANDLE(pDevice), 0, &pSubdevice));

    NV_CHECK_OK_OR_RETURN(LEVEL_NOTICE,
        gisubscriptionGetGPUInstanceSubscription(pRsClient, RES_GET_HANDLE(pSubdevice),
                                                 &pGPUInstanceSubscription));

    ref.pKernelMIGGpuInstance = gisubscriptionGetMIGGPUInstance(pGPUInstanceSubscription);

    status = cisubscriptionGetComputeInstanceSubscription(pRsClient,
                                                          RES_GET_HANDLE(pGPUInstanceSubscription),
                                                          &pComputeInstanceSubscription);
    if (status == NV_OK)
    {
        ref = kmigmgrMakeCIReference(gisubscriptionGetMIGGPUInstance(pGPUInstanceSubscription),
                                     cisubscriptionGetMIGComputeInstance(pComputeInstanceSubscription));
    }
    else
    {
        ref = kmigmgrMakeGIReference(gisubscriptionGetMIGGPUInstance(pGPUInstanceSubscription));
        // Quash status, this is optional
        status = NV_OK;
    }

    NV_CHECK_OR_RETURN(LEVEL_SILENT, kmigmgrIsMIGReferenceValid(&ref), NV_ERR_INVALID_STATE);
    *pRef = ref;
    return status;
}
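
//
// A MIG_INSTANCE_REF resolved this way always names a GPU instance and, when
// the client is also subscribed to a compute instance, names that CI too. A
// short consumer sketch (illustrative only):
//
//     MIG_INSTANCE_REF ref;
//
//     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
//         kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager, pDevice, &ref));
//
//     NV_PRINTF(LEVEL_INFO, "Device is in swizzId %d%s\n",
//               ref.pKernelMIGGpuInstance->swizzId,
//               (ref.pMIGComputeInstance != NULL) ? " (CI subscribed)" : "");
//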

/*!
 * @brief Retrieves the GPU instance heap associated with a device, if applicable
 */
NV_STATUS
kmigmgrGetMemoryPartitionHeapFromDevice_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    Device *pDevice,
    Heap **ppMemoryPartitionHeap
)
{
    MIG_INSTANCE_REF ref;
    NV_STATUS rmStatus = NV_OK;
    NvHandle hClient;

    NV_ASSERT_OR_RETURN(IS_MIG_IN_USE(pGpu), NV_ERR_INVALID_STATE);

    NV_ASSERT_OR_RETURN(pDevice != NULL, NV_ERR_INVALID_ARGUMENT);
    hClient = RES_GET_CLIENT_HANDLE(pDevice);

    rmStatus = kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager, pDevice, &ref);
    if ((rmStatus != NV_OK) || !kmigmgrIsMIGReferenceValid(&ref))
    {
        RS_PRIV_LEVEL privLevel = rmclientGetCachedPrivilegeByHandle(hClient);

        // It's okay for kernel/root clients to not be associated with a GPU instance
        if (privLevel >= RS_PRIV_LEVEL_KERNEL)
        {
            rmStatus = NV_OK;
        }
        else
        {
            NV_PRINTF(LEVEL_ERROR,
                      "Failed to get GPU instance for non-privileged client hClient=0x%08x!\n",
                      hClient);

            // If we got here due to bogus GPU instance info, actually return an error
            if (rmStatus == NV_OK)
                rmStatus = NV_ERR_INVALID_STATE;
        }
    }
    else
    {
        NV_ASSERT_OR_RETURN(ppMemoryPartitionHeap != NULL, NV_ERR_INVALID_ARGUMENT);
        *ppMemoryPartitionHeap = ref.pKernelMIGGpuInstance->pMemoryPartitionHeap;
        NV_PRINTF(LEVEL_INFO,
                  "GPU instance heap found for hClient = 0x%08x with swizzId = %d!\n",
                  hClient, ref.pKernelMIGGpuInstance->swizzId);
    }

    return rmStatus;
}

/*!
 * @brief Retrieves the swizzId associated with a device, if applicable
 */
NV_STATUS
kmigmgrGetSwizzIdFromDevice_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    Device *pDevice,
    NvU32 *pSwizzId
)
{
    MIG_INSTANCE_REF ref;
    NV_ASSERT_OK_OR_RETURN(
        kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager, pDevice, &ref));

    *pSwizzId = ref.pKernelMIGGpuInstance->swizzId;
    return NV_OK;
}

/*!
 * @brief Print out properties of the specified MIG GPU instance
 */
void
kmigmgrPrintGPUInstanceInfo_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
)
{
#if NV_PRINTF_LEVEL_ENABLED(LEVEL_INFO)
    NV_STATUS status;
    const MIG_GPU_INSTANCE_MEMORY_CONFIG *pGPUInstanceMemConfig;
    MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
    KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
    NV_RANGE partitionableMemoryRange = memmgrGetMIGPartitionableMemoryRange(pGpu, pMemoryManager);

    NvU32 grCount = kmigmgrCountEnginesOfType(&pKernelMIGGpuInstance->resourceAllocation.engines,
                                              RM_ENGINE_TYPE_GR(0));
    NvU32 ceCount = kmigmgrCountEnginesOfType(&pKernelMIGGpuInstance->resourceAllocation.engines,
                                              RM_ENGINE_TYPE_COPY(0));
    NvU32 decCount = kmigmgrCountEnginesOfType(&pKernelMIGGpuInstance->resourceAllocation.engines,
                                               RM_ENGINE_TYPE_NVDEC(0));
    NvU32 encCount = kmigmgrCountEnginesOfType(&pKernelMIGGpuInstance->resourceAllocation.engines,
                                               RM_ENGINE_TYPE_NVENC(0));
    NvU32 jpgCount = kmigmgrCountEnginesOfType(&pKernelMIGGpuInstance->resourceAllocation.engines,
                                               RM_ENGINE_TYPE_NVJPG);
    NvU32 ofaCount = kmigmgrCountEnginesOfType(&pKernelMIGGpuInstance->resourceAllocation.engines,
                                               RM_ENGINE_TYPE_OFA(0));

#define PADDING_STR "-----------------------------------------------------------------"

    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %18s | %18s | %18s |\n",
              "SwizzId",
              "SwizzId Table Mask",
              "Gpc Count");
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %18d | %18s | %18d |\n",
              pKernelMIGGpuInstance->swizzId,
              "NOT IMPLEMENTED",
              pKernelMIGGpuInstance->resourceAllocation.gpcCount);
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %18s | %18s | %18s |\n",
              "OBJGR Count",
              "OBJCE Count",
              "NVDEC Count");
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %18d | %18d | %18d |\n",
              grCount,
              ceCount,
              decCount);
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %18s | %18s | %18s |\n",
              "NVENC Count",
              "NVJPG Count",
              "NVOFA Count");
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %18d | %18d | %18d |\n",
              encCount,
              jpgCount,
              ofaCount);
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %18s | %18s | %18s |\n",
              "VEID Offset",
              "VEID Count",
              "VEID-GR Map");
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %18d | %18d | %18llx |\n",
              pKernelMIGGpuInstance->resourceAllocation.veidOffset,
              pKernelMIGGpuInstance->resourceAllocation.veidCount,
              DRF_MASK64(pKernelMIGGpuInstance->resourceAllocation.veidCount : 0) << pKernelMIGGpuInstance->resourceAllocation.veidOffset);
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %29s | %29s |\n",
              "Partitionable",
              "Partitionable");
    NV_PRINTF(LEVEL_INFO, "| %29s | %29s |\n",
              "Memory Start Addr",
              "Memory End Addr");
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %29llx | %29llx |\n",
              partitionableMemoryRange.lo,
              partitionableMemoryRange.hi);
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %18s | %18s | %18s |\n",
              "Local Instance",
              "Local Instance",
              "Local Instance");
    NV_PRINTF(LEVEL_INFO, "| %18s | %18s | %18s |\n",
              "Memory Start Addr",
              "Memory End Addr",
              "Size in Bytes");
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %18llx | %18llx | %18llx |\n",
              pKernelMIGGpuInstance->memRange.lo,
              pKernelMIGGpuInstance->memRange.hi,
              rangeLength(pKernelMIGGpuInstance->memRange));
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
    NV_PRINTF(LEVEL_INFO, "| %18s | %18s | %18s |\n",
              "Local Instance",
              "Local Instance",
              "Local Instance");
    NV_PRINTF(LEVEL_INFO, "| %18s | %18s | %18s |\n",
              "Start VMMU Seg.",
              "End VMMU Seg.",
              "Size in VMMU Seg.");
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);

    status = kmemsysGetMIGGPUInstanceMemConfigFromSwizzId(pGpu, pKernelMemorySystem,
                                                          pKernelMIGGpuInstance->swizzId,
                                                          &pGPUInstanceMemConfig);
    if (status == NV_ERR_NOT_SUPPORTED)
    {
        // Guest does not populate VMMU segment details.
        NV_ASSERT_OR_RETURN_VOID(IS_VIRTUAL(pGpu));
        NV_PRINTF(LEVEL_INFO, "| %18s | %18s | %18s |\n", "N/A", "N/A", "N/A");
    }
    else
    {
        NV_ASSERT_OR_RETURN_VOID(status == NV_OK);
        NV_PRINTF(LEVEL_INFO, "| %18llx | %18llx | %18llx |\n",
                  pGPUInstanceMemConfig->startingVmmuSegment,
                  (pGPUInstanceMemConfig->startingVmmuSegment +
                   pGPUInstanceMemConfig->memSizeInVmmuSegment) - 1,
                  pGPUInstanceMemConfig->memSizeInVmmuSegment);
    }
    NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
#undef PADDING_STR
#endif // NV_PRINTF_LEVEL_ENABLED(LEVEL_INFO)
}

/*!
 * @brief Function to set GPU instance information representing the provided swizzId.
 */
NV_STATUS
kmigmgrSetGPUInstanceInfo_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    NvU32 swizzId,
    NvU8 *pUuid,
    KMIGMGR_CREATE_GPU_INSTANCE_PARAMS params
)
{
    NvU32 i;
    NvHandle hMemory = NV01_NULL_OBJECT;
    NV_RANGE addrRange = NV_RANGE_EMPTY;
    NV_STATUS rmStatus = NV_OK;
    Heap *pMemoryPartitionHeap = NULL;
    NvU32 partitionFlag = (params.type == KMIGMGR_CREATE_GPU_INSTANCE_PARAMS_TYPE_REQUEST)
        ? params.inst.request.partitionFlag
        : params.inst.restore.pGPUInstanceSave->giInfo.partitionFlags;

    if (swizzId >= KMIGMGR_MAX_GPU_SWIZZID)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    for (i = 0; i < KMIGMGR_MAX_GPU_INSTANCES; ++i)
    {
        KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance = &pKernelMIGManager->kernelMIGGpuInstance[i];

        // Find first invalid GPU instance and use it to save GPU instance data
        if (!pKernelMIGGpuInstance->bValid)
        {
            MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);

            rmStatus = memmgrAllocMIGGPUInstanceMemory_HAL(pGpu, pMemoryManager, swizzId,
                                                           &hMemory, &addrRange,
                                                           &pMemoryPartitionHeap);
            NV_CHECK_OR_RETURN(LEVEL_ERROR, rmStatus == NV_OK, rmStatus);

            // Mark GPU instance as valid as we use GPU instance invalidation for cleanup
            pKernelMIGGpuInstance->bValid = NV_TRUE;
            pKernelMIGGpuInstance->swizzId = swizzId;
            pKernelMIGGpuInstance->hMemory = hMemory;
            pKernelMIGGpuInstance->memRange = addrRange;
            pKernelMIGGpuInstance->pMemoryPartitionHeap = pMemoryPartitionHeap;
            pKernelMIGGpuInstance->partitionFlag = partitionFlag;
            portMemCopy(pKernelMIGGpuInstance->uuid.uuid, sizeof(pKernelMIGGpuInstance->uuid.uuid),
                        pUuid, NVC637_UUID_LEN);

            //
            // Offloading of VGPU to GSP requires that the memRange in KERNEL_MIG_GPU_INSTANCE
            // be populated, as the plugin will query only within GSP for GPU instance information.
            // CPU-RM is the entity which actually calculates and allocates memory, so with
            // VGPU offloaded, GSP-RM must be updated with the memRange info.
            //
            if (IS_GSP_CLIENT(pGpu) && !IS_VIRTUAL(pGpu) && IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
            {
                RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
                NV2080_CTRL_INTERNAL_KMIGMGR_PROMOTE_GPU_INSTANCE_MEM_RANGE_PARAMS memParams;

                memParams.swizzId = pKernelMIGGpuInstance->swizzId;
                memParams.memAddrRange.lo = pKernelMIGGpuInstance->memRange.lo;
                memParams.memAddrRange.hi = pKernelMIGGpuInstance->memRange.hi;
                NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
                    pRmApi->Control(pRmApi,
                                    pGpu->hInternalClient,
                                    pGpu->hInternalSubdevice,
                                    NV2080_CTRL_CMD_INTERNAL_KMIGMGR_PROMOTE_GPU_INSTANCE_MEM_RANGE,
                                    &memParams,
                                    sizeof(memParams)));
            }

            NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
                kmigmgrGetProfileByPartitionFlag(pGpu, pKernelMIGManager, partitionFlag, &pKernelMIGGpuInstance->pProfile));

            // Allocate RsShared for the GPU instance
            NV_ASSERT_OK_OR_RETURN(serverAllocShare(&g_resServ, classInfo(RsShared),
                                                    &pKernelMIGGpuInstance->pShare));

            // Get resources associated with this swizzId
            NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
                kmigmgrSwizzIdToResourceAllocation(pGpu, pKernelMIGManager, swizzId, params,
                                                   pKernelMIGGpuInstance,
                                                   &pKernelMIGGpuInstance->resourceAllocation));

            pKernelMIGGpuInstance->resourceAllocation.gfxGpcCount = pKernelMIGGpuInstance->pProfile->gfxGpcCount;

            // Set assigned engines as in use
            NV_ASSERT_OK_OR_RETURN(
                kmigmgrSetEnginesInUse(pGpu, pKernelMIGManager, &pKernelMIGGpuInstance->resourceAllocation.engines));

            // Update engine tracking bitmasks for CI management later
            bitVectorClrAll(&pKernelMIGGpuInstance->exclusiveEngMask);
            bitVectorClrAll(&pKernelMIGGpuInstance->sharedEngMask);

            // Print GPU instance info for debug
            NV_PRINTF(LEVEL_INFO, "CREATING GPU instance\n");
            kmigmgrPrintGPUInstanceInfo(pGpu, pKernelMIGManager, pKernelMIGGpuInstance);

            break;
        }
    }

    NV_ASSERT_OR_RETURN(i < KMIGMGR_MAX_GPU_INSTANCES, NV_ERR_INSUFFICIENT_RESOURCES);
    return rmStatus;
}

/*!
 * @brief Function to get GPU instance information representing the provided swizzId.
 */
NV_STATUS
kmigmgrGetGPUInstanceInfo_IMPL
(
    OBJGPU *pGpu,
    KernelMIGManager *pKernelMIGManager,
    NvU32 swizzId,
    KERNEL_MIG_GPU_INSTANCE **ppKernelMIGGpuInstance
)
{
    KERNEL_MIG_GPU_INSTANCE *pKernelMIGGPUInstance;

    if (swizzId >= KMIGMGR_MAX_GPU_SWIZZID)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    FOR_EACH_VALID_GPU_INSTANCE(pGpu, pKernelMIGManager, pKernelMIGGPUInstance)
    {
        if (pKernelMIGGPUInstance->swizzId == swizzId)
        {
            *ppKernelMIGGpuInstance = pKernelMIGGPUInstance;
            return NV_OK;
        }
    }
    FOR_EACH_VALID_GPU_INSTANCE_END();

    return NV_ERR_INVALID_ARGUMENT;
}
3186
3187 /*!
3188 * @brief Function to convert local RM_ENGINE_TYPE to global
3189 * RM_ENGINE_TYPE for partitionable engines
3190 * Currently It support GR, CE, NVDEC, NVENC, NVJPG
3191 */
3192 NV_STATUS
kmigmgrGetLocalToGlobalEngineType_IMPL(OBJGPU * pGpu,KernelMIGManager * pKernelMIGManager,MIG_INSTANCE_REF ref,RM_ENGINE_TYPE localEngType,RM_ENGINE_TYPE * pGlobalEngType)3193 kmigmgrGetLocalToGlobalEngineType_IMPL
3194 (
3195 OBJGPU *pGpu,
3196 KernelMIGManager *pKernelMIGManager,
3197 MIG_INSTANCE_REF ref,
3198 RM_ENGINE_TYPE localEngType,
3199 RM_ENGINE_TYPE *pGlobalEngType
3200 )
3201 {
3202 NV_ASSERT_OR_RETURN(kmigmgrIsMIGReferenceValid(&ref), NV_ERR_INVALID_ARGUMENT);
3203 NV_ASSERT_OR_RETURN(RM_ENGINE_TYPE_IS_VALID(localEngType),
3204 NV_ERR_INVALID_ARGUMENT);
3205
3206 if (!kmigmgrIsEnginePartitionable(pGpu, pKernelMIGManager, localEngType))
3207 {
3208 //
3209 // Return same engineId as local if called for non-partitioned
3210 // rm engine types like host engines, PMU SEC etc.
3211 //
3212 *pGlobalEngType = localEngType;
3213 return NV_OK;
3214 }
3215
3216 if (ref.pMIGComputeInstance != NULL)
3217 {
3218 // Replace the CI-local input index with GI-local
3219 if (kmigmgrEngineTypeXlate(&ref.pMIGComputeInstance->resourceAllocation.localEngines, localEngType,
3220 &ref.pMIGComputeInstance->resourceAllocation.engines, &localEngType) != NV_OK)
3221 {
3222 NV_PRINTF(LEVEL_INFO,
3223 "Compute instance Local Engine type 0x%x is not allocated to Compute instance\n",
3224 localEngType);
3225 return NV_ERR_INVALID_ARGUMENT;
3226 }
3227 }
3228
3229 // Replace the GI-local input index with global
3230 if (kmigmgrEngineTypeXlate(&ref.pKernelMIGGpuInstance->resourceAllocation.localEngines, localEngType,
3231 &ref.pKernelMIGGpuInstance->resourceAllocation.engines, &localEngType) != NV_OK)
3232 {
3233 NV_PRINTF(LEVEL_INFO,
3234 "GPU instance Local Engine type 0x%x is not allocated to GPU instance\n",
3235 localEngType);
3236 return NV_ERR_INVALID_ARGUMENT;
3237 }
3238
3239 *pGlobalEngType = localEngType;
3240 return NV_OK;
3241 }
3242
3243 /*!
3244 * @brief Function to convert global RM_ENGINE_TYPE to local
3245 * RM_ENGINE_TYPE for partitionable engines
3246 * Currently it supports GR, CE, NVDEC, NVENC, NVJPG
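 *
 * This is the inverse of the translation above: e.g., under the same
 * hypothetical mapping, global GR2 resolves to GI-local GR0, and then to
 * CI-local GR0 when a compute instance reference is supplied.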
3247 */
3248 NV_STATUS
3249 kmigmgrGetGlobalToLocalEngineType_IMPL
3250 (
3251 OBJGPU *pGpu,
3252 KernelMIGManager *pKernelMIGManager,
3253 MIG_INSTANCE_REF ref,
3254 RM_ENGINE_TYPE globalEngType,
3255 RM_ENGINE_TYPE *pLocalEngType
3256 )
3257 {
3258 NV_ASSERT_OR_RETURN(kmigmgrIsMIGReferenceValid(&ref), NV_ERR_INVALID_ARGUMENT);
3259 NV_ASSERT_OR_RETURN(RM_ENGINE_TYPE_IS_VALID(globalEngType),
3260 NV_ERR_INVALID_ARGUMENT);
3261
3262 if (!kmigmgrIsEnginePartitionable(pGpu, pKernelMIGManager, globalEngType))
3263 {
3264 //
3265 // Return same engineId as global if called for non-partitioned
3266 // rm engine types like host engines, PMU SEC etc.
3267 //
3268 if (pLocalEngType != NULL)
3269 {
3270 *pLocalEngType = globalEngType;
3271 }
3272 return NV_OK;
3273 }
3274
3275 // Replace the global input index with GI-local
3276 if (kmigmgrEngineTypeXlate(&ref.pKernelMIGGpuInstance->resourceAllocation.engines, globalEngType,
3277 &ref.pKernelMIGGpuInstance->resourceAllocation.localEngines, &globalEngType) != NV_OK)
3278 {
3279 if (pLocalEngType != NULL)
3280 {
3281 NV_PRINTF(LEVEL_INFO,
3282 "Global Engine type 0x%x is not allocated to GPU instance\n",
3283 globalEngType);
3284 }
3285 return NV_ERR_INVALID_ARGUMENT;
3286 }
3287
3288 if (ref.pMIGComputeInstance != NULL)
3289 {
3290 // Replace the GI-local input index with CI-local
3291 if (kmigmgrEngineTypeXlate(&ref.pMIGComputeInstance->resourceAllocation.engines, globalEngType,
3292 &ref.pMIGComputeInstance->resourceAllocation.localEngines, &globalEngType) != NV_OK)
3293 {
3294 if (pLocalEngType != NULL)
3295 {
3296 NV_PRINTF(LEVEL_ERROR,
3297 "GPU instance Local Engine type 0x%x is not allocated to compute instance\n",
3298 globalEngType);
3299 }
3300 return NV_ERR_INVALID_ARGUMENT;
3301 }
3302 }
3303
3304 if (pLocalEngType != NULL)
3305 {
3306 *pLocalEngType = globalEngType;
3307 }
3308 return NV_OK;
3309 }
3310
3311 /*!
3312 * @brief Function to retrieve list of engine types belonging to this
3313 * GPU instance. When MIG is enabled, GRCEs are filtered from the engine
3314 * list, as well as any local GR engine indices outside of the range
3315 * allocated to this GPU instance. When MIG is disabled, all non-legacy GR
3316 * engines are filtered from the engine list, but no CEs are filtered.
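 *
 * For example (hypothetical): with MIG disabled, GR1 and above are dropped
 * while all other engines, including every CE, are kept; with MIG enabled,
 * a subdevice whose instance owns global {GR2, CE3} receives the localized
 * list {GR0, CE0}.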
3317 *
3318 * @param[IN] pGpu
3319 * @param[IN] pKernelMIGManager
3320 * @param[IN] pSubdevice
3321 * @param[OUT] pEngineTypes Engine type list
3322 * @param[OUT] pEngineCount Engine type count
3323 *
3324 * @return NV_STATUS
3325 * NV_OK on success
3326 * NV_ERR_INVALID_ARGUMENT if invalid subdevice
3327 * NV_ERR_INVALID_STATE if subdevice is not partitioned
3328 */
3329 NV_STATUS
3330 kmigmgrFilterEngineList_IMPL
3331 (
3332 OBJGPU *pGpu,
3333 KernelMIGManager *pKernelMIGManager,
3334 Subdevice *pSubdevice,
3335 RM_ENGINE_TYPE *pEngineTypes,
3336 NvU32 *pEngineCount
3337 )
3338 {
3339 MIG_INSTANCE_REF ref;
3340 NvBool bMIGInUse = IS_MIG_IN_USE(pGpu);
3341 NvU32 i;
3342
3343 if (bMIGInUse)
3344 {
3345 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3346 kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager, GPU_RES_GET_DEVICE(pSubdevice), &ref));
3347 }
3348
3349 *pEngineCount = 0;
3350 for (i = 0; i < pGpu->engineDB.size; ++i)
3351 {
3352 RM_ENGINE_TYPE rmEngineType = pGpu->engineDB.pType[i];
3353 RM_ENGINE_TYPE newEngineType = rmEngineType;
3354 NvBool bAddEngine = NV_TRUE;
3355
3356 if (bMIGInUse)
3357 {
3358 if (kmigmgrIsEngineInInstance(pGpu, pKernelMIGManager, rmEngineType, ref))
3359 {
3360 // Override the engine type with the local engine idx
3361 NV_ASSERT_OK(kmigmgrGetGlobalToLocalEngineType(pGpu, pKernelMIGManager, ref,
3362 rmEngineType,
3363 &newEngineType));
3364 }
3365 else
3366 {
3367 bAddEngine = NV_FALSE;
3368 }
3369 }
3370 else if (RM_ENGINE_TYPE_IS_GR(rmEngineType) &&
3371 (0 != RM_ENGINE_TYPE_GR_IDX(rmEngineType)))
3372 {
3373 bAddEngine = NV_FALSE;
3374 }
3375
3376 if (bAddEngine)
3377 {
3378 pEngineTypes[(*pEngineCount)++] = newEngineType;
3379 }
3380 }
3381
3382 return NV_OK;
3383 }
3384
3385 /**
3386 * @brief Removes all engines which are not in this client's GPU instance from the
3387 * partner list.
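 *
 * For example (hypothetical): if the partner list is {CE0, CE1, CE2} and the
 * client's instance owns only CE1, the loop compacts the array in place,
 * leaving {CE1} with numPartners == 1.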
3388 *
3389 * @param[IN] pGpu
3390 * @param[IN] pKernelMIGManager
3391 * @param[IN] pSubdevice
3392 * @param[IN/OUT] pPartnerListParams Client Partner list params
3393 *
3394 * @return NV_STATUS
3395 * NV_OK on success or MIG disabled
3396 * NV_ERR_INVALID_ARGUMENT on bad pParams
3397 */
3398 NV_STATUS
3399 kmigmgrFilterEnginePartnerList_IMPL
3400 (
3401 OBJGPU *pGpu,
3402 KernelMIGManager *pKernelMIGManager,
3403 Subdevice *pSubdevice,
3404 NV2080_CTRL_GPU_GET_ENGINE_PARTNERLIST_PARAMS *pPartnerListParams
3405 )
3406 {
3407 NvU32 i, j;
3408 MIG_INSTANCE_REF ref;
3409
3410 NV_ASSERT_OR_RETURN(NULL != pPartnerListParams, NV_ERR_INVALID_ARGUMENT);
3411
3412 // MIG disabled, nothing to do
3413 if (!IS_MIG_IN_USE(pGpu))
3414 {
3415 return NV_OK;
3416 }
3417
3418 NV_ASSERT_OK_OR_RETURN(
3419 kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager, GPU_RES_GET_DEVICE(pSubdevice), &ref));
3420
3421 for (i = 0; i < pPartnerListParams->numPartners; ++i)
3422 {
3423 RM_ENGINE_TYPE rmEngineType = pPartnerListParams->partnerList[i];
3424
3425 if (!kmigmgrIsEngineInInstance(pGpu, pKernelMIGManager, rmEngineType, ref))
3426 {
3427 // Filter this entry from the partner list
3428 for (j = i; j < pPartnerListParams->numPartners - 1; ++j)
3429 {
3430 pPartnerListParams->partnerList[j] = pPartnerListParams->partnerList[j + 1];
3431 }
3432
3433 pPartnerListParams->numPartners--;
3434
3435 // Break early to prevent underflow of i
3436 if (0 == pPartnerListParams->numPartners)
3437 {
3438 break;
3439 }
3440
3441 i--;
3442 }
3443 }
3444
3445 return NV_OK;
3446 }
3447
3448 /*!
3449 * @brief Finds a GPU Instance profile matching the input request flag
3450 */
3451 NV_STATUS
3452 kmigmgrGetProfileByPartitionFlag_IMPL
3453 (
3454 OBJGPU *pGpu,
3455 KernelMIGManager *pKernelMIGManager,
3456 NvU32 partitionFlag,
3457 const NV2080_CTRL_INTERNAL_MIGMGR_PROFILE_INFO **ppProfile
3458 )
3459 {
3460 const KERNEL_MIG_MANAGER_STATIC_INFO *pStaticInfo = kmigmgrGetStaticInfo(pGpu, pKernelMIGManager);
3461 NvU32 i;
3462
3463 NV_ASSERT_OR_RETURN(pStaticInfo != NULL, NV_ERR_INVALID_STATE);
3464 NV_ASSERT_OR_RETURN(pStaticInfo->pProfiles != NULL, NV_ERR_INVALID_STATE);
3465
3466 for (i = 0; i < pStaticInfo->pProfiles->count; ++i)
3467 {
3468 if (pStaticInfo->pProfiles->table[i].partitionFlag == partitionFlag)
3469 {
3470 *ppProfile = &pStaticInfo->pProfiles->table[i];
3471 return NV_OK;
3472 }
3473 }
3474
3475 return NV_ERR_INVALID_STATE;
3476 }
3477
3478 /*!
3479 * @brief Determine illegal swizzIds based on global swizzId mask
3480 */
3481 NV_STATUS
3482 kmigmgrGetInvalidSwizzIdMask_IMPL
3483 (
3484 OBJGPU *pGpu,
3485 KernelMIGManager *pKernelMIGManager,
3486 NvU32 swizzId,
3487 NvU64 *pUnsupportedSwizzIdMask
3488 )
3489 {
3490 NvU64 i;
3491 NvU64 gpuSlice[KGRMGR_MAX_GR] =
3492 {
3493 (NVBIT64(0) | NVBIT64(1) | NVBIT64(3) | NVBIT64(7)),
3494 (NVBIT64(0) | NVBIT64(1) | NVBIT64(3) | NVBIT64(8)),
3495 (NVBIT64(0) | NVBIT64(1) | NVBIT64(4) | NVBIT64(9)),
3496 (NVBIT64(0) | NVBIT64(1) | NVBIT64(4) | NVBIT64(10)),
3497 (NVBIT64(0) | NVBIT64(2) | NVBIT64(5) | NVBIT64(11)),
3498 (NVBIT64(0) | NVBIT64(2) | NVBIT64(5) | NVBIT64(12)),
3499 (NVBIT64(0) | NVBIT64(2) | NVBIT64(6) | NVBIT64(13)),
3500 (NVBIT64(0) | NVBIT64(2) | NVBIT64(6) | NVBIT64(14))
3501 };
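//
// Each gpuSlice entry lists every swizzId whose footprint overlaps one of
// the eight GPU slices: swizzId 0 is the full GPU, 1-2 are halves, 3-6 are
// quarters, and 7-14 are eighths. As an illustration, swizzId 5 (a quarter)
// occupies slices 4 and 5, so the union of those two rows marks swizzIds
// {0, 2, 5, 11, 12} as conflicting with it.
//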
3502
3503 NV_ASSERT_OR_RETURN(NULL != pUnsupportedSwizzIdMask, NV_ERR_INVALID_ARGUMENT);
3504
3505 // All bits corresponding to nonexistent swizzids are invalid
3506 *pUnsupportedSwizzIdMask = DRF_SHIFTMASK64(63:KMIGMGR_MAX_GPU_SWIZZID);
3507
3508 for (i = 0; i < KGRMGR_MAX_GR; ++i)
3509 {
3510 if (0 != (gpuSlice[i] & NVBIT64(swizzId)))
3511 {
3512 *pUnsupportedSwizzIdMask |= gpuSlice[i];
3513 }
3514 }
3515
3516 return NV_OK;
3517 }
3518
3519 /*!
3520 * @brief Processes request to update partitioning mode to the given value.
3521 */
3522 NV_STATUS
3523 kmigmgrSetPartitioningMode_IMPL
3524 (
3525 OBJGPU *pGpu,
3526 KernelMIGManager *pKernelMIGManager
3527 )
3528 {
3529 RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
3530 NV2080_CTRL_INTERNAL_GPU_GET_SMC_MODE_PARAMS params;
3531 KernelCcu *pKccu = GPU_GET_KERNEL_CCU(pGpu);
3532
3533 portMemSet(&params, 0x0, sizeof(params));
3534 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3535 pRmApi->Control(pRmApi,
3536 pGpu->hInternalClient,
3537 pGpu->hInternalSubdevice,
3538 NV2080_CTRL_CMD_INTERNAL_GPU_GET_SMC_MODE,
3539 &params,
3540 sizeof(params)));
3541
3542 // Should never have reached this far
3543 NV_ASSERT_OR_RETURN(params.smcMode != NV2080_CTRL_GPU_INFO_GPU_SMC_MODE_UNSUPPORTED,
3544 NV_ERR_INVALID_STATE);
3545
3546 //
3547 // If pending state, do not update mode in response to request. Mode will be
3548 // updated on next GPU reset.
3549 //
3550 if ((params.smcMode == NV2080_CTRL_GPU_INFO_GPU_SMC_MODE_DISABLE_PENDING) ||
3551 (params.smcMode == NV2080_CTRL_GPU_INFO_GPU_SMC_MODE_ENABLE_PENDING))
3552 {
3553 return NV_OK;
3554 }
3555
3556 pKernelMIGManager->bMIGEnabled = (params.smcMode == NV2080_CTRL_GPU_INFO_GPU_SMC_MODE_ENABLED);
3557
3558 gpumgrCacheSetMIGEnabled(pGpu, pKernelMIGManager->bMIGEnabled);
3559
3560 // MIG mode might not have been enabled yet, so load static info only if it is enabled
3561 if (IS_MIG_ENABLED(pGpu))
3562 {
3563 // Initialize static info derived from physical RM
3564 NV_ASSERT_OK_OR_RETURN(kmigmgrLoadStaticInfo_HAL(pGpu, pKernelMIGManager));
3565
3566 //
3567 // Populate static GPU instance memory config which will be used to manage
3568 // GPU instance memory
3569 //
3570 KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
3571 NV_ASSERT_OK_OR_RETURN(kmemsysPopulateMIGGPUInstanceMemConfig_HAL(pGpu, pKernelMemorySystem));
3572
3573 NV_ASSERT_OK(gpuDisableAccounting(pGpu, NV_TRUE));
3574 }
3575
3576 kbusUpdateRusdStatistics(pGpu);
3577
3578 if (pKccu)
3579 {
3580 kccuMigShrBufHandler_HAL(pGpu, pKccu, pKernelMIGManager->bMIGEnabled);
3581 }
3582 return NV_OK;
3583 }
3584
3585 /**
3586 * @brief Function to get a reference to the GPU / compute instance which
3587 * contains the given engine. If no instance is found, an error is returned.
3588 */
3589 NV_STATUS
3590 kmigmgrGetMIGReferenceFromEngineType_IMPL
3591 (
3592 OBJGPU *pGpu,
3593 KernelMIGManager *pKernelMIGManager,
3594 RM_ENGINE_TYPE rmEngineType,
3595 MIG_INSTANCE_REF *pRef
3596 )
3597 {
3598 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGPUInstance;
3599 MIG_COMPUTE_INSTANCE *pMIGComputeInstance;
3600 NvU32 CIIdx;
3601
3602 NV_ASSERT_OR_RETURN(pRef != NULL, NV_ERR_INVALID_ARGUMENT);
3603 // Default to non-attributed channel
3604 *pRef = kmigmgrMakeNoMIGReference();
3605
3606 // Bail out early if there are no instances to attribute to
3607 if (!IS_MIG_IN_USE(pGpu))
3608 return NV_ERR_NOT_SUPPORTED;
3609
3610 //
3611 // if this happens to be an RM internal channel not bound to an engine,
3612 // attribute it to no instance
3613 //
3614 if (!RM_ENGINE_TYPE_IS_VALID(rmEngineType))
3615 return NV_ERR_INVALID_ARGUMENT;
3616
3617 // Engine is not partitionable, attribute to no instance
3618 if (!kmigmgrIsEnginePartitionable(pGpu, pKernelMIGManager, rmEngineType))
3619 return NV_ERR_INVALID_ARGUMENT;
3620
3621 pKernelMIGGPUInstance = NULL;
3622 FOR_EACH_VALID_GPU_INSTANCE(pGpu, pKernelMIGManager, pKernelMIGGPUInstance)
3623 {
3624 if (kmigmgrIsEngineInInstance(pGpu, pKernelMIGManager, rmEngineType,
3625 kmigmgrMakeGIReference(pKernelMIGGPUInstance)))
3626 {
3627 break;
3628 }
3629 }
3630 FOR_EACH_VALID_GPU_INSTANCE_END();
3631
3632 // Engine was partitionable, but not in any of our GPU instances.
3633 if ((pKernelMIGGPUInstance == NULL) || !pKernelMIGGPUInstance->bValid)
3634 return NV_ERR_INVALID_STATE;
3635
3636 *pRef = kmigmgrMakeGIReference(pKernelMIGGPUInstance);
3637
3638 // Attempt to find a compute instance which contains this engine
3639 for (CIIdx = 0;
3640 CIIdx < NV_ARRAY_ELEMENTS(pKernelMIGGPUInstance->MIGComputeInstance);
3641 ++CIIdx)
3642 {
3643 pMIGComputeInstance = &pKernelMIGGPUInstance->MIGComputeInstance[CIIdx];
3644
3645 if (!pMIGComputeInstance->bValid)
3646 continue;
3647
3648 if (kmigmgrIsEngineInInstance(pGpu, pKernelMIGManager, rmEngineType,
3649 kmigmgrMakeCIReference(pKernelMIGGPUInstance, pMIGComputeInstance)))
3650 {
3651 break;
3652 }
3653 }
3654
3655 if (CIIdx < NV_ARRAY_ELEMENTS(pKernelMIGGPUInstance->MIGComputeInstance))
3656 *pRef = kmigmgrMakeCIReference(pKernelMIGGPUInstance, pMIGComputeInstance);
3657
3658 return NV_OK;
3659 }
3660
3661 /*!
3662 * @brief Check if we are running on a reduced config GPU and, if so, set the corresponding flag
3663 */
3664 void
3665 kmigmgrDetectReducedConfig_KERNEL
3666 (
3667 OBJGPU *pGpu,
3668 KernelMIGManager *pKernelMIGManager
3669 )
3670 {
3671 const KERNEL_MIG_MANAGER_STATIC_INFO *pStaticInfo = kmigmgrGetStaticInfo(pGpu, pKernelMIGManager);
3672 NvU32 i;
3673
3674 NV_ASSERT_OR_RETURN_VOID(pStaticInfo != NULL);
3675
3676 for (i = 0; i < pStaticInfo->pCIProfiles->profileCount; ++i)
3677 {
3678 // Reduced config A100 does not support 1/8 compute size
3679 if (pStaticInfo->pCIProfiles->profiles[i].computeSize == NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_EIGHTH)
3680 {
3681 return;
3682 }
3683 }
3684
3685 pKernelMIGManager->bIsA100ReducedConfig = NV_TRUE;
3686 }
3687
3688 /*!
3689 * @brief Get the CE in GI that can be used for scrubbing
3690 *
3691 * @param[IN] pGpu
3692 * @param[IN] pKernelMIGManager
3693 * @param[IN] pDevice Device subscribed to GI
3694 * @param[OUT] ceInst Scrubber CE index
3695 */
3696 NV_STATUS
3697 kmigmgrGetGPUInstanceScrubberCe_IMPL
3698 (
3699 OBJGPU *pGpu,
3700 KernelMIGManager *pKernelMIGManager,
3701 Device *pDevice,
3702 NvU32 *ceInst
3703 )
3704 {
3705 MIG_INSTANCE_REF ref;
3706 ENGTYPE_BIT_VECTOR ces;
3707
3708 NV_ASSERT_OK_OR_RETURN(
3709 kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager, pDevice, &ref));
3710
3711 bitVectorClrAll(&ces);
3712 bitVectorSetRange(&ces, RM_ENGINE_RANGE_COPY());
3713 bitVectorAnd(&ces, &ces, &ref.pKernelMIGGpuInstance->resourceAllocation.engines);
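// "ces" now holds only the copy engines assigned to this GPU instance; e.g.,
// if the instance owns global CE4 and CE5, the trailing-zero scan below
// resolves to CE4 and ceInst becomes 4.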
3714
3715 NV_ASSERT_OR_RETURN(!bitVectorTestAllCleared(&ces), NV_ERR_INSUFFICIENT_RESOURCES);
3716
3717 // Pick the first CE in the instance
3718 *ceInst = RM_ENGINE_TYPE_COPY_IDX(bitVectorCountTrailingZeros(&ces));
3719
3720 return NV_OK;
3721 }
3722
3723 /*!
3724 * @brief Copy the GPU instance type cache to the user-provided params for
3725 * DESCRIBE_PARTITIONS
3726 */
3727 NV_STATUS
3728 kmigmgrDescribeGPUInstances_IMPL
3729 (
3730 OBJGPU *pGpu,
3731 KernelMIGManager *pKernelMIGManager,
3732 NV2080_CTRL_GPU_DESCRIBE_PARTITIONS_PARAMS *pParams
3733 )
3734 {
3735 const KERNEL_MIG_MANAGER_STATIC_INFO *pStaticInfo = kmigmgrGetStaticInfo(pGpu, pKernelMIGManager);
3736 NvU32 i;
3737 NvU32 entryCount;
3738
3739 if ((pStaticInfo == NULL) || (pStaticInfo->pProfiles == NULL))
3740 return NV_ERR_NOT_SUPPORTED;
3741
3742 entryCount = 0;
3743 for (i = 0; i < pStaticInfo->pProfiles->count; ++i)
3744 {
3745 if (IS_VIRTUAL(pGpu))
3746 {
3747 VGPU_STATIC_INFO *pVSI = GPU_GET_STATIC_INFO(pGpu);
3748 NV_ASSERT_OR_RETURN(pVSI != NULL, NV_ERR_INVALID_STATE);
3749 pParams->partitionDescs[entryCount].memorySize = pVSI->gpuPartitionInfo.memSize;
3750 }
3751 else
3752 {
3753 KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
3754 NV_RANGE addrRange = NV_RANGE_EMPTY;
3755 NvU32 swizzId;
3756 NvU32 memorySize = DRF_VAL(2080_CTRL_GPU, _PARTITION_FLAG, _MEMORY_SIZE,
3757 pStaticInfo->pProfiles->table[i].partitionFlag);
3758
3759 // Retrieve a valid id for this flag combination
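// (By convention of the swizzId hierarchy, IDs 0/1/3/7 are the first
// instance of each memory size: full, half, quarter, and eighth.)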
3760 switch (memorySize)
3761 {
3762 case NV2080_CTRL_GPU_PARTITION_FLAG_MEMORY_SIZE_FULL:
3763 swizzId = 0;
3764 break;
3765 case NV2080_CTRL_GPU_PARTITION_FLAG_MEMORY_SIZE_HALF:
3766 swizzId = 1;
3767 break;
3768 case NV2080_CTRL_GPU_PARTITION_FLAG_MEMORY_SIZE_QUARTER:
3769 swizzId = 3;
3770 break;
3771 case NV2080_CTRL_GPU_PARTITION_FLAG_MEMORY_SIZE_EIGHTH:
3772 swizzId = 7;
3773 break;
3774 default:
3775 NV_ASSERT(0);
3776 continue;
3777 }
3778
3779 NV_ASSERT_OK(kmemsysGetMIGGPUInstanceMemInfo(pGpu, pKernelMemorySystem, swizzId, &addrRange));
3780 pParams->partitionDescs[entryCount].memorySize = rangeLength(addrRange);
3781 }
3782
3783 pParams->partitionDescs[entryCount].partitionFlag = pStaticInfo->pProfiles->table[i].partitionFlag;
3784 pParams->partitionDescs[entryCount].grCount = pStaticInfo->pProfiles->table[i].grCount;
3785 pParams->partitionDescs[entryCount].gfxGrCount = pStaticInfo->pProfiles->table[i].gfxGrCount;
3786 pParams->partitionDescs[entryCount].gpcCount = pStaticInfo->pProfiles->table[i].gpcCount;
3787 pParams->partitionDescs[entryCount].gfxGpcCount = pStaticInfo->pProfiles->table[i].gfxGpcCount;
3788 pParams->partitionDescs[entryCount].virtualGpcCount = pStaticInfo->pProfiles->table[i].virtualGpcCount;
3789 pParams->partitionDescs[entryCount].veidCount = pStaticInfo->pProfiles->table[i].veidCount;
3790 pParams->partitionDescs[entryCount].smCount = pStaticInfo->pProfiles->table[i].smCount;
3791 pParams->partitionDescs[entryCount].ceCount = pStaticInfo->pProfiles->table[i].ceCount;
3792 pParams->partitionDescs[entryCount].nvEncCount = pStaticInfo->pProfiles->table[i].nvEncCount;
3793 pParams->partitionDescs[entryCount].nvDecCount = pStaticInfo->pProfiles->table[i].nvDecCount;
3794 pParams->partitionDescs[entryCount].nvJpgCount = pStaticInfo->pProfiles->table[i].nvJpgCount;
3795 pParams->partitionDescs[entryCount].nvOfaCount = pStaticInfo->pProfiles->table[i].nvOfaCount;
3796
3797 entryCount++;
3798 }
3799 pParams->descCount = entryCount;
3800
3801 return NV_OK;
3802 }
3803
3804 /*!
3805 * @brief Saves MIG compute instance topology in provided structure
3806 */
3807 NV_STATUS
3808 kmigmgrSaveComputeInstances_IMPL
3809 (
3810 OBJGPU *pGpu,
3811 KernelMIGManager *pKernelMIGManager,
3812 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
3813 GPUMGR_SAVE_COMPUTE_INSTANCE *pComputeInstanceSaves
3814 )
3815 {
3816 NvU32 CIIdx;
3817 NvU32 ciCount = 0;
3818
3819 // Sanity checks
3820 NV_ASSERT_OR_RETURN((pKernelMIGGpuInstance != NULL) && (pComputeInstanceSaves != NULL),
3821 NV_ERR_INVALID_ARGUMENT);
3822
3823 for (CIIdx = 0; CIIdx < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance); ++CIIdx)
3824 {
3825 MIG_COMPUTE_INSTANCE *pMIGComputeInstance = &pKernelMIGGpuInstance->MIGComputeInstance[CIIdx];
3826 GPUMGR_SAVE_COMPUTE_INSTANCE *pComputeInstanceSave = &pComputeInstanceSaves[ciCount];
3827 NvU32 gpcIdx;
3828
3829 // Skip invalid compute instances
3830 if (!pMIGComputeInstance->bValid)
3831 continue;
3832
3833 portMemSet(pComputeInstanceSave, 0, sizeof(*pComputeInstanceSave));
3834 pComputeInstanceSave->bValid = NV_TRUE;
3835 pComputeInstanceSave->ciInfo.sharedEngFlags = pMIGComputeInstance->sharedEngFlag;
3836 pComputeInstanceSave->id = CIIdx;
3837 pComputeInstanceSave->pOsRmCaps = pMIGComputeInstance->pOsRmCaps;
3838 bitVectorToRaw(&pMIGComputeInstance->resourceAllocation.engines,
3839 &pComputeInstanceSave->ciInfo.enginesMask,
3840 sizeof(pComputeInstanceSave->ciInfo.enginesMask));
3841 if (IS_GSP_CLIENT(pGpu))
3842 {
3843 for (gpcIdx = 0; gpcIdx < pMIGComputeInstance->resourceAllocation.gpcCount; ++gpcIdx)
3844 {
3845 pComputeInstanceSave->ciInfo.gpcMask |=
3846 NVBIT32(pMIGComputeInstance->resourceAllocation.gpcIds[gpcIdx]);
3847 }
3848 }
3849 else
3850 {
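// Physical GPC IDs are not tracked here, so assume a contiguous mask of
// gpcCount bits starting at bit 0.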
3851 pComputeInstanceSave->ciInfo.gpcMask = DRF_MASK(pMIGComputeInstance->resourceAllocation.gpcCount - 1 : 0);
3852 }
3853
3854 pComputeInstanceSave->ciInfo.gfxGpcCount = pMIGComputeInstance->resourceAllocation.gfxGpcCount;
3855 pComputeInstanceSave->ciInfo.veidOffset = pMIGComputeInstance->resourceAllocation.veidOffset;
3856 pComputeInstanceSave->ciInfo.veidCount = pMIGComputeInstance->resourceAllocation.veidCount;
3857 pComputeInstanceSave->ciInfo.smCount = pMIGComputeInstance->resourceAllocation.smCount;
3858 pComputeInstanceSave->ciInfo.spanStart = pMIGComputeInstance->spanStart;
3859 pComputeInstanceSave->ciInfo.computeSize = pMIGComputeInstance->computeSize;
3860
3861 portMemCopy(pComputeInstanceSave->ciInfo.uuid, sizeof(pComputeInstanceSave->ciInfo.uuid),
3862 pMIGComputeInstance->uuid.uuid, sizeof(pMIGComputeInstance->uuid.uuid));
3863
3864 ++ciCount;
3865 }
3866
3867 return NV_OK;
3868 }
3869
3870 /*!
3871 * @brief Function to map a swizzId to the allowed GrIdx, physical GPC_IDs,
3872 * physical CE_IDs, and VEIDs in a GPU instance
3873 *
3874 * @param[IN] swizzId SwizzId used by the GPU instance
3875 * @param[OUT] pResourceAllocation Structure containing engine configs for a
3876 * GPU instance. This contains engineCount and
3877 * engine Ids.
3878 */
3879 NV_STATUS
3880 kmigmgrSwizzIdToResourceAllocation_IMPL
3881 (
3882 OBJGPU *pGpu,
3883 KernelMIGManager *pKernelMIGManager,
3884 NvU32 swizzId,
3885 KMIGMGR_CREATE_GPU_INSTANCE_PARAMS params,
3886 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
3887 MIG_RESOURCE_ALLOCATION *pResourceAllocation
3888 )
3889 {
3890 NV2080_CTRL_INTERNAL_KMIGMGR_EXPORTED_GPU_INSTANCE_INFO info;
3891 NvU32 tempGpcMask;
3892
3893 NV_CHECK_OR_RETURN(LEVEL_ERROR, swizzId < KMIGMGR_MAX_GPU_SWIZZID, NV_ERR_INVALID_ARGUMENT);
3894
3895 if (params.type == KMIGMGR_CREATE_GPU_INSTANCE_PARAMS_TYPE_REQUEST)
3896 {
3897 NV2080_CTRL_INTERNAL_KMIGMGR_IMPORT_EXPORT_GPU_INSTANCE_PARAMS export;
3898 RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
3899
3900 portMemSet(&export, 0, sizeof(export));
3901 export.swizzId = swizzId;
3902
3903 // Retrieve the info of the gpu instance GSP just created
3904 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
3905 pRmApi->Control(pRmApi,
3906 pGpu->hInternalClient,
3907 pGpu->hInternalSubdevice,
3908 NV2080_CTRL_CMD_INTERNAL_MIGMGR_EXPORT_GPU_INSTANCE,
3909 &export,
3910 sizeof(export)));
3911 info = export.info;
3912 }
3913 else
3914 {
3915 info = params.inst.restore.pGPUInstanceSave->giInfo;
3916 }
3917
3918 pResourceAllocation->gpcCount = 0;
3919 tempGpcMask = info.gpcMask;
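// Unpack the mask into the gpcIds array; e.g., a gpcMask of 0x0B (bits
// 0, 1, 3) yields gpcIds {0, 1, 3} and gpcCount 3.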
3920 while (tempGpcMask != 0x0)
3921 {
3922 NvU32 gpcIdx = portUtilCountTrailingZeros32(tempGpcMask);
3923 pResourceAllocation->gpcIds[(pResourceAllocation->gpcCount)++] = gpcIdx;
3924 tempGpcMask &= ~(NVBIT32(gpcIdx));
3925 }
3926
3927 pResourceAllocation->veidCount = info.veidCount;
3928 pResourceAllocation->veidOffset = info.veidOffset;
3929 pResourceAllocation->virtualGpcCount = info.virtualGpcCount;
3930
3931 // Use profile SM count for filling the resource allocation
3932 pResourceAllocation->smCount = pKernelMIGGpuInstance->pProfile->smCount;
3933
3934 bitVectorFromRaw(&pResourceAllocation->engines, info.enginesMask, sizeof(info.enginesMask));
3935
3936 // Cache the local engine mask for this instance
3937 kmigmgrGetLocalEngineMask(&pResourceAllocation->engines, &pResourceAllocation->localEngines);
3938
3939 return NV_OK;
3940 }
3941
3942 // Create client and subdevice handles to make calls into this compute instance
3943 NV_STATUS
3944 kmigmgrAllocComputeInstanceHandles_IMPL
3945 (
3946 OBJGPU *pGpu,
3947 KernelMIGManager *pKernelMIGManager,
3948 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
3949 MIG_COMPUTE_INSTANCE *pMIGComputeInstance
3950 )
3951 {
3952 RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
3953 NvHandle hGPUInstanceSubscription = NV01_NULL_OBJECT;
3954 NvHandle hComputeInstanceSubscription = NV01_NULL_OBJECT;
3955 NvHandle hClient;
3956 NvHandle hDevice;
3957 NvHandle hSubdevice;
3958 NV_STATUS status;
3959
3960 NV_ASSERT_OK_OR_RETURN(
3961 rmapiutilAllocClientAndDeviceHandles(pRmApi, pGpu, &hClient, &hDevice, &hSubdevice));
3962
3963 {
3964 NVC637_ALLOCATION_PARAMETERS params;
3965 portMemSet(&params, 0, sizeof(params));
3966 params.swizzId = pKernelMIGGpuInstance->swizzId;
3967 NV_ASSERT_OK_OR_GOTO(status,
3968 pRmApi->Alloc(pRmApi, hClient, hSubdevice, &hGPUInstanceSubscription, AMPERE_SMC_PARTITION_REF, &params, sizeof(params)),
3969 failed);
3970 }
3971
3972 {
3973 NVC638_ALLOCATION_PARAMETERS params;
3974 portMemSet(&params, 0, sizeof(params));
3975 params.execPartitionId = pMIGComputeInstance->id;
3976 NV_ASSERT_OK_OR_GOTO(status,
3977 pRmApi->Alloc(pRmApi, hClient, hGPUInstanceSubscription, &hComputeInstanceSubscription, AMPERE_SMC_EXEC_PARTITION_REF, &params, sizeof(params)),
3978 failed);
3979 }
3980
3981 pMIGComputeInstance->instanceHandles.hClient = hClient;
3982 pMIGComputeInstance->instanceHandles.hSubdevice = hSubdevice;
3983 pMIGComputeInstance->instanceHandles.hSubscription = hComputeInstanceSubscription;
3984
3985 return NV_OK;
3986
3987 failed:
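// Freeing the client also frees the device, subdevice, and any subscription
// objects allocated under it above.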
3988 pRmApi->Free(pRmApi, hClient, hClient);
3989 return status;
3990 }
3991
3992 /*!
3993 * @brief Get Compute Instance UUID
3994 *
3995 * @param[IN] pGpu
3996 * @param[IN] pKernelMIGManager
3997 * @param[IN] swizzId GPU instance swizz ID
3998 * @param[IN] globalGrIdx physical syspipe ID
3999 * @param[OUT] pUuid Compute Instance UUID
4000 */
4001 NV_STATUS
4002 kmigmgrGenerateComputeInstanceUuid_VF
4003 (
4004 OBJGPU *pGpu,
4005 KernelMIGManager *pKernelMIGManager,
4006 NvU32 swizzId,
4007 NvU32 globalGrIdx,
4008 NvUuid *pUuid
4009 )
4010 {
4011 VGPU_STATIC_INFO *pVSI = GPU_GET_STATIC_INFO(pGpu);
4012 NvU16 chipId = DRF_VAL(_PMC, _BOOT_42, _CHIP_ID, pGpu->chipId1);
4013 NvU64 gid;
4014
4015 NV_ASSERT_OR_RETURN(pVSI != NULL, NV_ERR_INVALID_STATE);
4016
4017 portMemCopy(&gid, sizeof(gid), pVSI->gidInfo.data, sizeof(gid));
4018
4019 //
4020 // We can't use PDI for the vGPU use-case. We need a unique ID per VM.
4021 // So, for vGPU, read the first 64-bits from the host generated UUID.
4022 // These bits represent a timestamp, which should be unique per VM.
4023 //
4024 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
4025 nvGenerateSmcUuid(chipId, gid, swizzId, globalGrIdx, pUuid));
4026
4027 return NV_OK;
4028 }
4029
4030 /*!
4031 * @brief Get GPU Instance UUID
4032 *
4033 * @param[IN] pGpu
4034 * @param[IN] pKernelMIGManager
4035 * @param[IN] swizzId GPU instance swizz ID
4036 * @param[OUT] pUuid GPU Instance UUID
4037 */
4038 NV_STATUS
4039 kmigmgrGenerateGPUInstanceUuid_VF
4040 (
4041 OBJGPU *pGpu,
4042 KernelMIGManager *pKernelMIGManager,
4043 NvU32 swizzId,
4044 NvUuid *pUuid
4045 )
4046 {
4047 return kmigmgrGenerateComputeInstanceUuid_HAL(
4048 pGpu, pKernelMIGManager, swizzId, GR_INDEX_INVALID, pUuid);
4049 }
4050
4051 /*!
4052 * @brief Create compute instances
4053 *
4054 * @param[IN] pGpu
4055 * @param[IN] pKernelMIGManager
4056 * @param[IN] pKernelMIGGpuInstance
4057 * @param[IN] bQuery If NV_TRUE, don't save created instances
4058 * @param[IN] params List of requested compute instances to create
4059 * @param[OUT] pCIIDs IDs of created instances
4060 * @param[IN] bCreateCap Flag stating if MIG CI capabilities need to be created
4061 */
4062 NV_STATUS
4063 kmigmgrCreateComputeInstances_VF
4064 (
4065 OBJGPU *pGpu,
4066 KernelMIGManager *pKernelMIGManager,
4067 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
4068 NvBool bQuery,
4069 KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS params,
4070 NvU32 *pCIIDs,
4071 NvBool bCreateCap
4072 )
4073 {
4074 NV_STATUS status = NV_OK;
4075 NvU32 count;
4076 ENGTYPE_BIT_VECTOR shadowExclusiveEngMask;
4077 ENGTYPE_BIT_VECTOR shadowSharedEngMask;
4078 MIG_COMPUTE_INSTANCE *pComputeInstanceInfo;
4079 NvU32 CIIdx;
4080 NvU32 freeSlots;
4081 NvU32 createdInstances;
4082 NvU32 inUseGpcCount;
4083 NvU32 remainingGpcCount;
4084 NvU32 i;
4085 NvU64 shadowCTSInUseMask;
4086 NvU64 shadowVeidInUseMask;
4087 KernelGraphicsManager *pKernelGraphicsManager = GPU_GET_KERNEL_GRAPHICS_MANAGER(pGpu);
4088 KMIGMGR_CONFIGURE_INSTANCE_REQUEST *pConfigRequestPerCi = NULL;
4089 NvBool bIsCTSRequired = kmigmgrIsCTSAlignmentRequired_HAL(pGpu, pKernelMIGManager);
4090
4091 NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance != NULL, NV_ERR_INVALID_ARGUMENT);
4092
4093 count = (params.type == KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS_TYPE_REQUEST)
4094 ? params.inst.request.count
4095 : 1;
4096
4097 NV_CHECK_OR_RETURN(LEVEL_SILENT, count != 0, NV_ERR_INVALID_ARGUMENT);
4098
4099 pComputeInstanceInfo = portMemAllocNonPaged(sizeof(*pComputeInstanceInfo) *
4100 KMIGMGR_MAX_COMPUTE_INSTANCES);
4101 NV_CHECK_OR_RETURN(LEVEL_NOTICE, pComputeInstanceInfo != NULL, NV_ERR_NO_MEMORY);
4102
4103 portMemSet(pComputeInstanceInfo, 0, sizeof(*pComputeInstanceInfo) *
4104 KMIGMGR_MAX_COMPUTE_INSTANCES);
4105
4106 pConfigRequestPerCi = portMemAllocStackOrHeap(sizeof(*pConfigRequestPerCi) * KMIGMGR_MAX_COMPUTE_INSTANCES);
4107 NV_ASSERT_OR_ELSE(pConfigRequestPerCi != NULL, status = NV_ERR_NO_MEMORY; goto done;);
4108
4109 portMemSet(pConfigRequestPerCi, 0, sizeof(*pConfigRequestPerCi) * KMIGMGR_MAX_COMPUTE_INSTANCES);
4110
4111 // Check that there's enough open compute instance slots, and count used GPCs
4112 freeSlots = 0;
4113 inUseGpcCount = 0;
4114 for (CIIdx = 0;
4115 CIIdx < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance);
4116 ++CIIdx)
4117 {
4118 MIG_COMPUTE_INSTANCE *pMIGComputeInstance = &pKernelMIGGpuInstance->MIGComputeInstance[CIIdx];
4119
4120 if (pMIGComputeInstance->bValid)
4121 {
4122 NvU32 smCount = pMIGComputeInstance->resourceAllocation.smCount;
4123 NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE ciProfile;
4124
4125 NV_CHECK_OK_OR_ELSE(status, LEVEL_ERROR,
4126 kmigmgrGetComputeProfileFromSmCount(pGpu, pKernelMIGManager, smCount, &ciProfile),
4127 goto done; );
4128
4129 inUseGpcCount += ciProfile.gpcCount;
4130 }
4131 else
4132 {
4133 freeSlots++;
4134 }
4135 }
4136 NV_CHECK_OR_ELSE(LEVEL_SILENT, freeSlots >= count,
4137 status = NV_ERR_INSUFFICIENT_RESOURCES; goto done);
4138
4139 //
4140 // Check that we have enough spare GPCs. We're going to reuse the GPU Instance
4141 // configuration logic later on to do the actual allocation, so for now just
4142 // check the count.
4143 //
4144 NV_ASSERT_OR_ELSE(pKernelMIGGpuInstance->resourceAllocation.virtualGpcCount >= inUseGpcCount,
4145 status = NV_ERR_INVALID_STATE; goto done);
4146 remainingGpcCount = pKernelMIGGpuInstance->resourceAllocation.virtualGpcCount - inUseGpcCount;
4147
4148 //
4149 // Cache local copies of the resource pools, we'll commit them later if we
4150 // have to
4151 //
4152 bitVectorCopy(&shadowExclusiveEngMask, &pKernelMIGGpuInstance->exclusiveEngMask);
4153 bitVectorCopy(&shadowSharedEngMask, &pKernelMIGGpuInstance->sharedEngMask);
4154 shadowCTSInUseMask = pKernelMIGGpuInstance->ctsIdsInUseMask;
4155 shadowVeidInUseMask = kgrmgrGetVeidInUseMask(pGpu, pKernelGraphicsManager);
4156 for (CIIdx = 0; CIIdx < count; ++CIIdx)
4157 {
4158 NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE *pCIProfile;
4159 MIG_COMPUTE_INSTANCE *pMIGComputeInstance = &pComputeInstanceInfo[CIIdx];
4160 MIG_RESOURCE_ALLOCATION *pResourceAllocation = &pMIGComputeInstance->resourceAllocation;
4161 NvU32 smCount =
4162 (params.type == KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS_TYPE_REQUEST)
4163 ? params.inst.request.pReqComputeInstanceInfo[CIIdx].smCount
4164 : params.inst.restore.pComputeInstanceSave->ciInfo.smCount;
4165 NvU32 gpcCount =
4166 (params.type == KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS_TYPE_REQUEST)
4167 ? params.inst.request.pReqComputeInstanceInfo[CIIdx].gpcCount
4168 : nvPopCount32(params.inst.restore.pComputeInstanceSave->ciInfo.gpcMask);
4169 pMIGComputeInstance->bValid = NV_TRUE;
4170 pMIGComputeInstance->sharedEngFlag =
4171 (params.type == KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS_TYPE_REQUEST)
4172 ? params.inst.request.pReqComputeInstanceInfo[CIIdx].sharedEngFlag
4173 : params.inst.restore.pComputeInstanceSave->ciInfo.sharedEngFlags;
4174 NvU32 spanStart;
4175 NvU32 ctsId;
4176
4177 if (params.type == KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS_TYPE_REQUEST)
4178 {
4179 spanStart = KMIGMGR_SPAN_OFFSET_INVALID;
4180 if (FLD_TEST_REF(NVC637_CTRL_DMA_EXEC_PARTITIONS_CREATE_REQUEST_AT_SPAN, _TRUE, params.inst.request.requestFlags))
4181 {
4182 NvU32 veidStepSize;
4183
4184 NV_ASSERT_OK_OR_GOTO(status,
4185 kgrmgrGetVeidStepSize(pGpu, pKernelGraphicsManager, &veidStepSize),
4186 done);
4187
4188 //
4189 // Select spanStart from the spanStart field if set; otherwise calculate it using the veid offset passed in.
4190 // This is done specifically to accommodate legacy flows which don't have knowledge of the new spanStart field
4191 //
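// (e.g., with a hypothetical veidStepSize of 8, a veidStartOffset of 16
// resolves to spanStart 2)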
4192 spanStart = (params.inst.request.pReqComputeInstanceInfo[CIIdx].spanStart != 0)
4193 ? params.inst.request.pReqComputeInstanceInfo[CIIdx].spanStart
4194 : params.inst.request.pReqComputeInstanceInfo[CIIdx].veidStartOffset / veidStepSize;
4195 }
4196 }
4197 else
4198 {
4199 spanStart = params.inst.restore.pComputeInstanceSave->ciInfo.spanStart;
4200 }
4201
4202 pConfigRequestPerCi[CIIdx].veidSpanStart = spanStart;
4203 pCIProfile = &pConfigRequestPerCi[CIIdx].profile;
4204 ctsId = KMIGMGR_CTSID_INVALID;
4205 if (kmigmgrGetComputeProfileForRequest(pGpu, pKernelMIGManager, pKernelMIGGpuInstance, smCount, gpcCount, pCIProfile) == NV_OK)
4206 {
4207 // CTS and Span allocation is done early to help prevent spurious requests
4208 if (bIsCTSRequired)
4209 {
4210 if (spanStart != KMIGMGR_SPAN_OFFSET_INVALID)
4211 {
4212 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
4213 kmigmgrXlateSpanStartToCTSId(pGpu, pKernelMIGManager,
4214 pCIProfile->computeSize,
4215 spanStart,
4216 &ctsId),
4217 done);
4218
4219 NV_CHECK_OR_ELSE(LEVEL_ERROR,
4220 kmigmgrIsCTSIdAvailable(pGpu, pKernelMIGManager,
4221 pKernelMIGGpuInstance->pProfile->validCTSIdMask,
4222 shadowCTSInUseMask,
4223 ctsId),
4224 status = NV_ERR_STATE_IN_USE; goto done; );
4225 }
4226 else
4227 {
4228 // Don't know how to allocate GfxGpc in VF yet
4229 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
4230 kmigmgrGetFreeCTSId(pGpu, pKernelMIGManager,
4231 &ctsId,
4232 pKernelMIGGpuInstance->pProfile->validCTSIdMask,
4233 0x0,
4234 shadowCTSInUseMask,
4235 pCIProfile->computeSize,
4236 NV_FALSE,
4237 NV_FALSE),
4238 done);
4239 }
4240
4241 NV_CHECK_OR_ELSE(LEVEL_ERROR, ctsId < KMIGMGR_MAX_GPU_CTSID,
4242 status = NV_ERR_INVALID_STATE; goto done; );
4243
4244 pConfigRequestPerCi[CIIdx].veidSpanStart = kmigmgrGetSpanStartFromCTSId(pGpu, pKernelMIGManager, ctsId);
4245 shadowCTSInUseMask |= NVBIT64(ctsId);
4246 }
4247 }
4248 else
4249 {
4250 // If no CI profile was available, populate one with the bare necessities
4251 pCIProfile->computeSize = KMIGMGR_COMPUTE_SIZE_INVALID;
4252 pCIProfile->gpcCount = gpcCount;
4253 pCIProfile->smCount = gpcCount * (pKernelMIGGpuInstance->pProfile->smCount / pKernelMIGGpuInstance->pProfile->gpcCount);
4254
4255 // Force non-profile requests to go through VEID allocator
4256 pConfigRequestPerCi[CIIdx].veidSpanStart = KMIGMGR_SPAN_OFFSET_INVALID;
4257 }
4258
4259 pConfigRequestPerCi[CIIdx].ctsId = ctsId;
4260
4261 // Perform VEID request checks or use the best fit allocator to find a slot
4262 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
4263 kgrmgrCheckVeidsRequest(pGpu, pKernelGraphicsManager,
4264 &shadowVeidInUseMask,
4265 pCIProfile->veidCount,
4266 &pConfigRequestPerCi[CIIdx].veidSpanStart,
4267 pKernelMIGGpuInstance),
4268 done);
4269
4270 // Perform checks and VEID allocation
4271 if (!bIsCTSRequired)
4272 {
4273 //
4274 // Only perform explicit GPC checks if CTS alignment isn't required. A similar case
4275 // is covered by CTS requirements.
4276 //
4277 if (remainingGpcCount < pCIProfile->gpcCount)
4278 {
4279 NV_PRINTF(LEVEL_ERROR,
4280 "Not enough remaining GPCs (%d) for compute instance request (%d).\n",
4281 remainingGpcCount, pCIProfile->gpcCount);
4282 status = NV_ERR_INSUFFICIENT_RESOURCES;
4283 goto done;
4284 }
4285 remainingGpcCount -= pCIProfile->gpcCount;
4286 }
4287
4288 if (params.type == KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS_TYPE_RESTORE)
4289 {
4290 ENGTYPE_BIT_VECTOR engines;
4291 bitVectorClrAll(&pResourceAllocation->engines);
4292
4293 // Set engines requested directly in resource allocation mask
4294 bitVectorFromRaw(&pResourceAllocation->engines,
4295 params.inst.restore.pComputeInstanceSave->ciInfo.enginesMask,
4296 sizeof(params.inst.restore.pComputeInstanceSave->ciInfo.enginesMask));
4297
4298 // Sanity check that all engines requested exist in the GI engine mask
4299 bitVectorClrAll(&engines);
4300 bitVectorAnd(&engines, &pResourceAllocation->engines, &pKernelMIGGpuInstance->resourceAllocation.localEngines);
4301 NV_CHECK_OR_ELSE(LEVEL_ERROR,
4302 bitVectorTestEqual(&engines, &pResourceAllocation->engines),
4303 status = NV_ERR_INVALID_ARGUMENT; goto done;);
4304
4305 // Set Shared/Exclusive Engine Masks for GRs restored
4306 bitVectorClrAll(&engines);
4307 bitVectorSetRange(&engines, RM_ENGINE_RANGE_GR());
4308 bitVectorAnd(&engines, &engines, &pResourceAllocation->engines);
4309
4310 // Only 1 GR can be requested per compute instance
4311 NV_CHECK_OR_ELSE(LEVEL_ERROR,
4312 (kmigmgrCountEnginesOfType(&engines, RM_ENGINE_TYPE_GR(0)) == 1),
4313 status = NV_ERR_INVALID_ARGUMENT; goto done;);
4314
4315 if ((pMIGComputeInstance->sharedEngFlag & NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_NONE) != 0x0)
4316 bitVectorOr(&shadowSharedEngMask, &shadowSharedEngMask, &engines);
4317 else
4318 {
4319 ENGTYPE_BIT_VECTOR tempVector;
4320
4321 // Exclusive engine mask should not intersect with the current exclusive mask
4322 bitVectorAnd(&tempVector, &engines, &shadowExclusiveEngMask);
4323 NV_CHECK_OR_ELSE(LEVEL_ERROR,
4324 bitVectorTestAllCleared(&tempVector),
4325 status = NV_ERR_STATE_IN_USE; goto done;);
4326 bitVectorOr(&shadowExclusiveEngMask, &shadowExclusiveEngMask, &engines);
4327 }
4328
4329 // Set Shared/Exclusive Engine Masks for CEs restored
4330 bitVectorClrAll(&engines);
4331 bitVectorSetRange(&engines, RM_ENGINE_RANGE_COPY());
4332 bitVectorAnd(&engines, &engines, &pResourceAllocation->engines);
4333 if ((pMIGComputeInstance->sharedEngFlag & NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_CE) != 0x0)
4334 bitVectorOr(&shadowSharedEngMask, &shadowSharedEngMask, &engines);
4335 else
4336 {
4337 ENGTYPE_BIT_VECTOR tempVector;
4338
4339 // Exclusive engine mask should not intersect with the current exclusive mask
4340 bitVectorAnd(&tempVector, &engines, &shadowExclusiveEngMask);
4341 NV_CHECK_OR_ELSE(LEVEL_ERROR,
4342 bitVectorTestAllCleared(&tempVector),
4343 status = NV_ERR_STATE_IN_USE; goto done;);
4344 bitVectorOr(&shadowExclusiveEngMask, &shadowExclusiveEngMask, &engines);
4345 }
4346
4347 // Set Shared/Exclusive Engine Masks for NVDECs restored
4348 bitVectorClrAll(&engines);
4349 bitVectorSetRange(&engines, RM_ENGINE_RANGE_NVDEC());
4350 bitVectorAnd(&engines, &engines, &pResourceAllocation->engines);
4351 if ((pMIGComputeInstance->sharedEngFlag & NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_NVDEC) != 0x0)
4352 bitVectorOr(&shadowSharedEngMask, &shadowSharedEngMask, &engines);
4353 else
4354 {
4355 ENGTYPE_BIT_VECTOR tempVector;
4356
4357 // Exclusive engine mask should not intersect with the current exclusive mask
4358 bitVectorAnd(&tempVector, &engines, &shadowExclusiveEngMask);
4359 NV_CHECK_OR_ELSE(LEVEL_ERROR,
4360 bitVectorTestAllCleared(&tempVector),
4361 status = NV_ERR_STATE_IN_USE; goto done;);
4362 bitVectorOr(&shadowExclusiveEngMask, &shadowExclusiveEngMask, &engines);
4363 }
4364
4365 // Set Shared/Exclusive Engine Masks for NVENCs restored
4366 bitVectorClrAll(&engines);
4367 bitVectorSetRange(&engines, RM_ENGINE_RANGE_NVENC());
4368 bitVectorAnd(&engines, &engines, &pResourceAllocation->engines);
4369 if ((pMIGComputeInstance->sharedEngFlag & NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_NVENC) != 0x0)
4370 bitVectorOr(&shadowSharedEngMask, &shadowSharedEngMask, &engines);
4371 else
4372 {
4373 ENGTYPE_BIT_VECTOR tempVector;
4374
4375 // Exclusive engine mask should not intersect with the current exclusive mask
4376 bitVectorAnd(&tempVector, &engines, &shadowExclusiveEngMask);
4377 NV_CHECK_OR_ELSE(LEVEL_ERROR,
4378 bitVectorTestAllCleared(&tempVector),
4379 status = NV_ERR_STATE_IN_USE; goto done;);
4380 bitVectorOr(&shadowExclusiveEngMask, &shadowExclusiveEngMask, &engines);
4381 }
4382
4383 // Set Shared/Exclusive Engine Masks for NVJPEGs restored
4384 bitVectorClrAll(&engines);
4385 bitVectorSetRange(&engines, RM_ENGINE_RANGE_NVJPEG());
4386 bitVectorAnd(&engines, &engines, &pResourceAllocation->engines);
4387 if ((pMIGComputeInstance->sharedEngFlag & NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_NVJPG) != 0x0)
4388 bitVectorOr(&shadowSharedEngMask, &shadowSharedEngMask, &engines);
4389 else
4390 {
4391 ENGTYPE_BIT_VECTOR tempVector;
4392
4393 // Exclusive engine mask should not intersect with the current exclusive mask
4394 bitVectorAnd(&tempVector, &engines, &shadowExclusiveEngMask);
4395 NV_CHECK_OR_ELSE(LEVEL_ERROR,
4396 bitVectorTestAllCleared(&tempVector),
4397 status = NV_ERR_STATE_IN_USE; goto done;);
4398 bitVectorOr(&shadowExclusiveEngMask, &shadowExclusiveEngMask, &engines);
4399 }
4400
4401 // Set Shared/Exclusive Engine Masks for OFAs restored
4402 bitVectorClrAll(&engines);
4403 bitVectorSetRange(&engines, RM_ENGINE_RANGE_OFA());
4404 bitVectorAnd(&engines, &engines, &pResourceAllocation->engines);
4405 if ((pMIGComputeInstance->sharedEngFlag & NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_OFA) != 0x0)
4406 bitVectorOr(&shadowSharedEngMask, &shadowSharedEngMask, &engines);
4407 else
4408 {
4409 ENGTYPE_BIT_VECTOR tempVector;
4410
4411 // Exclusive engine mask should not intersect with the current exclusive mask
4412 bitVectorAnd(&tempVector, &engines, &shadowExclusiveEngMask);
4413 NV_CHECK_OR_ELSE(LEVEL_ERROR,
4414 bitVectorTestAllCleared(&tempVector),
4415 status = NV_ERR_STATE_IN_USE; goto done;);
4416 bitVectorOr(&shadowExclusiveEngMask, &shadowExclusiveEngMask, &engines);
4417 }
4418 }
4419 else
4420 {
4421 NvU32 grCount = 1;
4422 NvU32 ceCount = params.inst.request.pReqComputeInstanceInfo[CIIdx].ceCount;
4423 NvU32 decCount = params.inst.request.pReqComputeInstanceInfo[CIIdx].nvDecCount;
4424 NvU32 encCount = params.inst.request.pReqComputeInstanceInfo[CIIdx].nvEncCount;
4425 NvU32 jpgCount = params.inst.request.pReqComputeInstanceInfo[CIIdx].nvJpgCount;
4426 NvU32 ofaCount = params.inst.request.pReqComputeInstanceInfo[CIIdx].ofaCount;
4427
4428 bitVectorClrAll(&pResourceAllocation->engines);
4429
4430 // Allocate the GR engines for this compute instance
4431 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
4432 kmigmgrAllocateInstanceEngines(&pKernelMIGGpuInstance->resourceAllocation.engines,
4433 ((pMIGComputeInstance->sharedEngFlag &
4434 NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_NONE) != 0x0),
4435 RM_ENGINE_RANGE_GR(),
4436 grCount,
4437 &pResourceAllocation->engines,
4438 &shadowExclusiveEngMask,
4439 &shadowSharedEngMask,
4440 &pKernelMIGGpuInstance->resourceAllocation.engines), done);
4441
4442 // Allocate the Copy engines for this compute instance
4443 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
4444 kmigmgrAllocateInstanceEngines(&pKernelMIGGpuInstance->resourceAllocation.engines,
4445 ((pMIGComputeInstance->sharedEngFlag &
4446 NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_CE) != 0x0),
4447 RM_ENGINE_RANGE_COPY(),
4448 ceCount,
4449 &pResourceAllocation->engines,
4450 &shadowExclusiveEngMask,
4451 &shadowSharedEngMask,
4452 &pKernelMIGGpuInstance->resourceAllocation.engines), done);
4453
4454 // Allocate the NVDEC engines for this compute instance
4455 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
4456 kmigmgrAllocateInstanceEngines(&pKernelMIGGpuInstance->resourceAllocation.engines,
4457 ((pMIGComputeInstance->sharedEngFlag &
4458 NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_NVDEC) != 0x0),
4459 RM_ENGINE_RANGE_NVDEC(),
4460 decCount,
4461 &pResourceAllocation->engines,
4462 &shadowExclusiveEngMask,
4463 &shadowSharedEngMask,
4464 &pKernelMIGGpuInstance->resourceAllocation.engines), done);
4465
4466 // Allocate the NVENC engines for this compute instance
4467 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
4468 kmigmgrAllocateInstanceEngines(&pKernelMIGGpuInstance->resourceAllocation.engines,
4469 ((pMIGComputeInstance->sharedEngFlag &
4470 NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_NVENC) != 0x0),
4471 RM_ENGINE_RANGE_NVENC(),
4472 encCount,
4473 &pResourceAllocation->engines,
4474 &shadowExclusiveEngMask,
4475 &shadowSharedEngMask,
4476 &pKernelMIGGpuInstance->resourceAllocation.engines), done);
4477
4478 // Allocate the NVJPG engines for this compute instance
4479 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
4480 kmigmgrAllocateInstanceEngines(&pKernelMIGGpuInstance->resourceAllocation.engines,
4481 ((pMIGComputeInstance->sharedEngFlag &
4482 NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_NVJPG) != 0x0),
4483 RM_ENGINE_RANGE_NVJPEG(),
4484 jpgCount,
4485 &pResourceAllocation->engines,
4486 &shadowExclusiveEngMask,
4487 &shadowSharedEngMask,
4488 &pKernelMIGGpuInstance->resourceAllocation.engines), done);
4489
4490 // Allocate the NVOFA engines for this compute instance
4491 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
4492 kmigmgrAllocateInstanceEngines(&pKernelMIGGpuInstance->resourceAllocation.engines,
4493 ((pMIGComputeInstance->sharedEngFlag &
4494 NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_OFA) != 0x0),
4495 RM_ENGINE_RANGE_OFA(),
4496 ofaCount,
4497 &pResourceAllocation->engines,
4498 &shadowExclusiveEngMask,
4499 &shadowSharedEngMask,
4500 &pKernelMIGGpuInstance->resourceAllocation.engines), done);
4501 }
4502
4503 // Cache local mask of engine IDs for this compute instance
4504 kmigmgrGetLocalEngineMask(&pResourceAllocation->engines,
4505 &pResourceAllocation->localEngines);
4506 }
4507
4508 // Commit the allocations to the instance
4509 if (!bQuery)
4510 {
4511 NvU32 swizzId = pKernelMIGGpuInstance->swizzId;
4512 KMIGMGR_CONFIGURE_INSTANCE_REQUEST configRequestsPerCiOrdered[KMIGMGR_MAX_COMPUTE_INSTANCES] = {0};
4513 NvU32 updateEngMask;
4514 NvU32 updateEngMaskShadow;
4515
4516 // Populate configure GPU instance parameters with compute instance info
4517 updateEngMask = 0x0;
4518
4519 for (CIIdx = 0; CIIdx < count; ++CIIdx)
4520 {
4521 MIG_COMPUTE_INSTANCE *pMIGComputeInstance = &pComputeInstanceInfo[CIIdx];
4522 MIG_RESOURCE_ALLOCATION *pComputeResourceAllocation = &pMIGComputeInstance->resourceAllocation;
4523 RM_ENGINE_TYPE localEngineType;
4524
4525 //
4526 // Xlate from CI-local GR 0 to GI-local GR idx
4527 // We can't use kmigmgrGetLocalToGlobalEngineType because these
4528 // compute instances aren't committed yet
4529 //
4530 NV_ASSERT_OK(
4531 kmigmgrEngineTypeXlate(&pComputeResourceAllocation->localEngines, RM_ENGINE_TYPE_GR(0),
4532 &pComputeResourceAllocation->engines, &localEngineType));
4533
4534 updateEngMask |= NVBIT32(RM_ENGINE_TYPE_GR_IDX(localEngineType));
4535 }
4536
4537 //
4538 // Reorder the entries in pConfigRequestPerCi per the GR engine assigned to each CI
4539 // (sorted from lower GR to higher GR), so kmigmgrConfigureGPUInstance can configure
4540 // each CI with the correct GR.
4541 //
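// (For example, if CI0 was assigned GR3 and CI1 was assigned GR1, the
// ordered array becomes {CI1's request, CI0's request}.)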
4542 updateEngMaskShadow = updateEngMask;
4543 i = 0;
4544 while (updateEngMaskShadow != 0)
4545 {
4546 for (CIIdx = 0; CIIdx < count; ++CIIdx)
4547 {
4548 RM_ENGINE_TYPE localRmEngineType;
4549 MIG_COMPUTE_INSTANCE *pMIGComputeInstance = &pComputeInstanceInfo[CIIdx];
4550 MIG_RESOURCE_ALLOCATION *pComputeResourceAllocation = &pMIGComputeInstance->resourceAllocation;
4551 NV_ASSERT_OK(
4552 kmigmgrEngineTypeXlate(&pComputeResourceAllocation->localEngines, RM_ENGINE_TYPE_GR(0),
4553 &pComputeResourceAllocation->engines, &localRmEngineType));
4554
4555 if (portUtilCountTrailingZeros32(updateEngMaskShadow) == RM_ENGINE_TYPE_GR_IDX(localRmEngineType))
4556 {
4557 configRequestsPerCiOrdered[i] = pConfigRequestPerCi[CIIdx];
4558 updateEngMaskShadow &= ~NVBIT32(RM_ENGINE_TYPE_GR_IDX(localRmEngineType));
4559 i++;
4560 break;
4561 }
4562 }
4563 NV_ASSERT(CIIdx < count);
4564 }
4565
4566 // Configure the GR engines for each compute instance
4567 status = kmigmgrConfigureGPUInstance(pGpu, pKernelMIGManager, swizzId,
4568 configRequestsPerCiOrdered,
4569 updateEngMask);
4570
4571 // Do our best to deconfigure the engines we configured so far, then bail
4572 if (status != NV_OK)
4573 {
4574 portMemSet(pConfigRequestPerCi, 0x0, sizeof(*pConfigRequestPerCi) * KMIGMGR_MAX_COMPUTE_INSTANCES);
4575 // Quash status. This is best-effort cleanup
4576 (void)kmigmgrConfigureGPUInstance(pGpu, pKernelMIGManager, swizzId,
4577 pConfigRequestPerCi,
4578 updateEngMask);
4579
4580 goto done;
4581 }
4582
4583 // Update the GI pools with the result of this allocation
4584 bitVectorCopy(&pKernelMIGGpuInstance->exclusiveEngMask, &shadowExclusiveEngMask);
4585 bitVectorCopy(&pKernelMIGGpuInstance->sharedEngMask, &shadowSharedEngMask);
4586
4587 // update each compute instance gpc ids and veid info
4588 for (CIIdx = 0; CIIdx < count; ++CIIdx)
4589 {
4590 MIG_RESOURCE_ALLOCATION *pResourceAllocation = &pKernelMIGGpuInstance->resourceAllocation;
4591 MIG_COMPUTE_INSTANCE *pMIGComputeInstance = &pComputeInstanceInfo[CIIdx];
4592 MIG_RESOURCE_ALLOCATION *pComputeResourceAllocation = &pMIGComputeInstance->resourceAllocation;
4593 NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE *pCIProfile;
4594 RM_ENGINE_TYPE globalEngineType;
4595 NvU32 globalGrIdx;
4596
4597 //
4598 // Xlate from CI-local GR 0 to global GR idx
4599 // We can't use kmigmgrGetLocalToGlobalEngineType because these
4600 // compute instances aren't committed yet
4601 //
4602 NV_ASSERT_OK(
4603 kmigmgrEngineTypeXlate(&pComputeResourceAllocation->localEngines, RM_ENGINE_TYPE_GR(0),
4604 &pComputeResourceAllocation->engines, &globalEngineType));
4605
4606 NV_ASSERT_OK(
4607 kmigmgrEngineTypeXlate(&pResourceAllocation->localEngines, globalEngineType,
4608 &pResourceAllocation->engines, &globalEngineType));
4609 globalGrIdx = RM_ENGINE_TYPE_GR_IDX(globalEngineType);
4610 pCIProfile = &pConfigRequestPerCi[CIIdx].profile;
4611
4612 pComputeResourceAllocation->gpcCount = pCIProfile->gpcCount;
4613 pComputeResourceAllocation->smCount = pCIProfile->smCount;
4614 if (pCIProfile->computeSize != KMIGMGR_COMPUTE_SIZE_INVALID)
4615 {
4616 pComputeResourceAllocation->veidCount = pCIProfile->veidCount;
4617 }
4618 else
4619 {
4620 pComputeResourceAllocation->veidCount = (pResourceAllocation->veidCount / pResourceAllocation->gpcCount) *
4621 pComputeResourceAllocation->virtualGpcCount;
4622 }
4623
4624 pMIGComputeInstance->spanStart = pConfigRequestPerCi[CIIdx].veidSpanStart;
4625 pMIGComputeInstance->computeSize = pConfigRequestPerCi[CIIdx].profile.computeSize;
4626
4627 kgrmgrGetVeidBaseForGrIdx(pGpu, pKernelGraphicsManager, globalGrIdx, &pComputeResourceAllocation->veidOffset);
4628
4629 pComputeResourceAllocation->veidOffset = pComputeResourceAllocation->veidOffset - pResourceAllocation->veidOffset;
4630 }
4631
4632 // Copy over the local cached compute instance info
4633 createdInstances = 0;
4634 for (CIIdx = 0;
4635 CIIdx < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance);
4636 ++CIIdx)
4637 {
4638 if (pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].bValid)
4639 continue;
4640
4641 if ((params.type == KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS_TYPE_RESTORE) &&
4642 (params.inst.restore.pComputeInstanceSave->id != CIIdx))
4643 {
4644 continue;
4645 }
4646
4647 if (FLD_TEST_REF(NVC637_CTRL_DMA_EXEC_PARTITIONS_CREATE_REQUEST_WITH_PART_ID, _TRUE, params.inst.request.requestFlags) &&
4648 (pCIIDs[0] != CIIdx))
4649 {
4650 continue;
4651 }
4652
4653 NV_ASSERT(pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].id ==
4654 KMIGMGR_COMPUTE_INSTANCE_ID_INVALID);
4655
4656 portMemCopy(&pKernelMIGGpuInstance->MIGComputeInstance[CIIdx],
4657 sizeof(pKernelMIGGpuInstance->MIGComputeInstance[CIIdx]),
4658 &pComputeInstanceInfo[createdInstances],
4659 sizeof(pKernelMIGGpuInstance->MIGComputeInstance[CIIdx]));
4660
4661 pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].id = CIIdx;
4662
4663 pCIIDs[createdInstances++] = CIIdx;
4664
4665 if (createdInstances == count)
4666 break;
4667 }
4668
4669 for (i = 0; i < createdInstances; ++i)
4670 {
4671 MIG_RESOURCE_ALLOCATION *pResourceAllocation;
4672 MIG_RESOURCE_ALLOCATION *pComputeResourceAllocation;
4673 MIG_COMPUTE_INSTANCE *pMIGComputeInstance;
4674 RM_ENGINE_TYPE globalEngineType;
4675 NvU32 globalGrIdx;
4676
4677 //
4678 // As per the current design, the index into the pMIGComputeInstance
4679 // array is the same as the compute instance ID.
4680 //
4681 CIIdx = pCIIDs[i];
4682
4683 pResourceAllocation = &pKernelMIGGpuInstance->resourceAllocation;
4684
4685 pMIGComputeInstance = &pKernelMIGGpuInstance->MIGComputeInstance[CIIdx];
4686 pComputeResourceAllocation = &pMIGComputeInstance->resourceAllocation;
4687
4688 NV_ASSERT_OK(
4689 kmigmgrEngineTypeXlate(&pComputeResourceAllocation->localEngines, RM_ENGINE_TYPE_GR(0),
4690 &pComputeResourceAllocation->engines, &globalEngineType));
4691 NV_ASSERT_OK(
4692 kmigmgrEngineTypeXlate(&pResourceAllocation->localEngines, globalEngineType,
4693 &pResourceAllocation->engines, &globalEngineType));
4694 globalGrIdx = RM_ENGINE_TYPE_GR_IDX(globalEngineType);
4695
4696 NV_ASSERT(pMIGComputeInstance->id == CIIdx);
4697
4698 //
4699 // Register the instance with the capability framework only if explicitly
4700 // requested. Otherwise, we rely on the persistent state.
4701 //
4702 if (bCreateCap)
4703 {
4704 // Register compute instance with the capability framework
4705 NV_ASSERT_OK_OR_GOTO(status,
4706 osRmCapRegisterSmcExecutionPartition(pKernelMIGGpuInstance->pOsRmCaps,
4707 &pMIGComputeInstance->pOsRmCaps,
4708 pMIGComputeInstance->id),
4709 cleanup_created_instances);
4710 }
4711
4712 // Populate UUID
4713 NV_ASSERT_OK_OR_GOTO(status,
4714 kmigmgrGenerateComputeInstanceUuid_HAL(pGpu, pKernelMIGManager, swizzId, globalGrIdx,
4715 &pMIGComputeInstance->uuid),
4716 cleanup_created_instances);
4717
4718 // Allocate RsShared for the instance
4719 NV_ASSERT_OK_OR_GOTO(
4720 status,
4721 serverAllocShare(&g_resServ, classInfo(RsShared),
4722 &pMIGComputeInstance->pShare),
4723 cleanup_created_instances);
4724
4725 // Allocate subscribed handles for this instance
4726 NV_ASSERT_OK_OR_GOTO(status,
4727 kmigmgrAllocComputeInstanceHandles(pGpu, pKernelMIGManager, pKernelMIGGpuInstance, pMIGComputeInstance),
4728 cleanup_created_instances);
4729
4730 {
4731 KernelGraphics *pKernelGraphics = GPU_GET_KERNEL_GRAPHICS(pGpu, globalGrIdx);
4732 fecsSetRoutingInfo(pGpu,
4733 pKernelGraphics,
4734 pMIGComputeInstance->instanceHandles.hClient,
4735 pMIGComputeInstance->instanceHandles.hSubdevice,
4736 0);
4737
4738 NV_ASSERT_OK_OR_GOTO(status,
4739 kgraphicsCreateGoldenImageChannel(pGpu, pKernelGraphics),
4740 cleanup_created_instances);
4741 }
4742 }
4743 }
4744
4745 status = NV_OK;
4746 goto done;
4747
4748 cleanup_created_instances:
4749 for (i = 0; i < createdInstances; ++i)
4750 {
4751 (void)kmigmgrDeleteComputeInstance(pGpu, pKernelMIGManager, pKernelMIGGpuInstance,
4752 pCIIDs[i], NV_FALSE);
4753 }
4754
4755 done:
4756 portMemFree(pComputeInstanceInfo);
4757 portMemFreeStackOrHeap(pConfigRequestPerCi);
4758
4759 return status;
4760 }
4761
4762 /*!
4763 * @brief create compute instances for CPU-RM
4764 *
4765 * @param[IN] pGpu
4766 * @param[IN] pKernelMIGManager
4767 * @param[IN] pKernelMIGGpuInstance
4768 * @param[IN] bQuery If NV_TRUE, don't save created instances
4769 * @param[IN] params List of requested compute instances to create
4770 * @param[OUT] pCIIDs IDs of created instances
4771 * @param[IN] bCreateCap Flag stating if MIG CI capabilities need to be created
4772 */
4773 NV_STATUS
4774 kmigmgrCreateComputeInstances_FWCLIENT
4775 (
4776 OBJGPU *pGpu,
4777 KernelMIGManager *pKernelMIGManager,
4778 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
4779 NvBool bQuery,
4780 KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS params,
4781 NvU32 *pCIIDs,
4782 NvBool bCreateCap
4783 )
4784 {
4785 KernelGraphicsManager *pKernelGraphicsManager = GPU_GET_KERNEL_GRAPHICS_MANAGER(pGpu);
4786 NV_STATUS status = NV_OK;
4787 KernelGraphics *pKernelGraphics;
4788 MIG_COMPUTE_INSTANCE *pMIGComputeInstance;
4789 MIG_RESOURCE_ALLOCATION *pResourceAllocation;
4790 MIG_RESOURCE_ALLOCATION *pComputeResourceAllocation;
4791 NVC637_CTRL_EXEC_PARTITIONS_EXPORTED_INFO info;
4792 NvU32 CIIdx = pCIIDs[0];
4793 NvU32 tempGpcMask;
4794 KMIGMGR_CONFIGURE_INSTANCE_REQUEST *pConfigRequestPerCi;
4795 RM_ENGINE_TYPE localEngineType;
4796 RM_ENGINE_TYPE globalEngineType;
4797 NvU32 globalGrIdx;
4798 NvU64 shadowVeidInUseMask;
4799
4800 NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance != NULL, NV_ERR_INVALID_ARGUMENT);
4801 NV_ASSERT_OR_RETURN(params.type == KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS_TYPE_RESTORE, NV_ERR_INVALID_ARGUMENT);
4802 NV_ASSERT_OR_RETURN(params.inst.restore.pComputeInstanceSave != NULL, NV_ERR_INVALID_ARGUMENT);
4803 NV_ASSERT_OR_RETURN(params.inst.restore.pComputeInstanceSave->bValid, NV_ERR_INVALID_ARGUMENT);
4804
4805 // CPU-RM always restores the CI state created by GSP-RM, so this is always a commit operation
4806 NV_ASSERT_OR_RETURN(!bQuery, NV_ERR_INVALID_ARGUMENT);
4807
4808 pMIGComputeInstance = portMemAllocNonPaged(sizeof(*pMIGComputeInstance));
4809 NV_CHECK_OR_RETURN(LEVEL_NOTICE, pMIGComputeInstance != NULL, NV_ERR_NO_MEMORY);
4810
4811 portMemSet(pMIGComputeInstance, 0, sizeof(*pMIGComputeInstance));
4812
4813 pResourceAllocation = &pKernelMIGGpuInstance->resourceAllocation;
4814 pComputeResourceAllocation = &pMIGComputeInstance->resourceAllocation;
4815
4816 NV_ASSERT_OR_RETURN(!pMIGComputeInstance->bValid, NV_ERR_INVALID_STATE);
4817
4818 pConfigRequestPerCi = portMemAllocStackOrHeap(sizeof(*pConfigRequestPerCi) * KMIGMGR_MAX_COMPUTE_INSTANCES);
4819 NV_ASSERT_OR_RETURN(pConfigRequestPerCi != NULL, NV_ERR_NO_MEMORY);
4820
4821 portMemSet(pConfigRequestPerCi, 0x0, sizeof(*pConfigRequestPerCi) * KMIGMGR_MAX_COMPUTE_INSTANCES);
4822
4823 info = params.inst.restore.pComputeInstanceSave->ciInfo;
4824
4825 if (kmigmgrIsCTSAlignmentRequired_HAL(pGpu, pKernelMIGManager))
4826 {
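//
// Re-derive the CTS ID from the saved span start and verify it is still
// available within this GPU instance before committing to it.
//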
4827
4828 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
4829 kmigmgrXlateSpanStartToCTSId(pGpu, pKernelMIGManager,
4830 info.computeSize,
4831 info.spanStart,
4832 &pConfigRequestPerCi[0].ctsId),
4833 done);
4834
4835 NV_CHECK_OR_ELSE(LEVEL_ERROR,
4836 kmigmgrIsCTSIdAvailable(pGpu, pKernelMIGManager,
4837 pKernelMIGGpuInstance->pProfile->validCTSIdMask,
4838 pKernelMIGGpuInstance->ctsIdsInUseMask,
4839 pConfigRequestPerCi[0].ctsId),
4840 status = NV_ERR_STATE_IN_USE; goto done; );
4841 }
4842 else
4843 {
4844 pConfigRequestPerCi[0].ctsId = KMIGMGR_CTSID_INVALID;
4845 }
4846
4847 portMemCopy(pMIGComputeInstance->uuid.uuid, sizeof(pMIGComputeInstance->uuid.uuid),
4848 info.uuid, sizeof(info.uuid));
4849 pMIGComputeInstance->sharedEngFlag = info.sharedEngFlags;
4850
4851 pComputeResourceAllocation->gpcCount = 0;
4852 tempGpcMask = info.gpcMask;
4853 while (tempGpcMask != 0x0)
4854 {
4855 NvU32 gpcIdx = portUtilCountTrailingZeros32(tempGpcMask);
4856 pComputeResourceAllocation->gpcIds[(pComputeResourceAllocation->gpcCount)++] = gpcIdx;
4857 tempGpcMask &= ~(NVBIT32(gpcIdx));
4858 }
4859 pComputeResourceAllocation->gfxGpcCount = info.gfxGpcCount;
4860 pComputeResourceAllocation->veidCount = info.veidCount;
4861 pComputeResourceAllocation->veidOffset = info.veidOffset;
4862 pComputeResourceAllocation->smCount = info.smCount;
4863 pMIGComputeInstance->computeSize = info.computeSize;
4864
4865 bitVectorFromRaw(&pComputeResourceAllocation->engines, info.enginesMask, sizeof(info.enginesMask));
4866
4867 // Cache the local engine mask for this CI
4868 kmigmgrGetLocalEngineMask(&pComputeResourceAllocation->engines, &pComputeResourceAllocation->localEngines);
4869
4870 pMIGComputeInstance->bValid = NV_TRUE;
4871 pMIGComputeInstance->id = CIIdx;
4872
4873 // Populate configure GPU instance parameters with compute instance info
4874
4875 //
4876 // Xlate from CI-local GR 0 to GI-local GR idx
4877 // We can't use kmigmgrGetLocalToGlobalEngineType because these
4878 // compute instances aren't committed yet
4879 //
4880 NV_ASSERT_OK(
4881 kmigmgrEngineTypeXlate(&pComputeResourceAllocation->localEngines, RM_ENGINE_TYPE_GR(0),
4882 &pComputeResourceAllocation->engines, &localEngineType));
4883
4884 // Create a pseudo-profile based upon info retrieved from GSP-RM
4885 pConfigRequestPerCi[0].profile.computeSize = info.computeSize;
4886 pConfigRequestPerCi[0].profile.smCount = pComputeResourceAllocation->smCount;
4887 pConfigRequestPerCi[0].profile.gpcCount = pComputeResourceAllocation->gpcCount;
4888 pConfigRequestPerCi[0].profile.veidCount = pComputeResourceAllocation->veidCount;
4889 pConfigRequestPerCi[0].veidSpanStart = info.spanStart;
4890
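//
// Check the restored VEID span against a shadow copy of the VEID in-use
// mask so the live mask is left untouched if the request is rejected.
//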
4891 shadowVeidInUseMask = kgrmgrGetVeidInUseMask(pGpu, pKernelGraphicsManager);
4892 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
4893 kgrmgrCheckVeidsRequest(pGpu, pKernelGraphicsManager,
4894 &shadowVeidInUseMask,
4895 pConfigRequestPerCi[0].profile.veidCount,
4896 &pConfigRequestPerCi[0].veidSpanStart,
4897 pKernelMIGGpuInstance),
4898 done);
4899
4900
4901 NV_ASSERT(pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].id == KMIGMGR_COMPUTE_INSTANCE_ID_INVALID);
4902
4903 pMIGComputeInstance->spanStart = pConfigRequestPerCi[0].veidSpanStart;
4904
4905 portMemCopy(&pKernelMIGGpuInstance->MIGComputeInstance[CIIdx],
4906 sizeof(pKernelMIGGpuInstance->MIGComputeInstance[CIIdx]),
4907 pMIGComputeInstance,
4908 sizeof(*pMIGComputeInstance));
4909
4910 // Configure the GR engines for each compute instance
4911 status = kmigmgrConfigureGPUInstance(pGpu, pKernelMIGManager, pKernelMIGGpuInstance->swizzId,
4912 pConfigRequestPerCi,
4913 NVBIT32(RM_ENGINE_TYPE_GR_IDX(localEngineType)));
4914
4915 // Do our best to deconfigure the engines we configured so far, then bail
4916 if (status != NV_OK)
4917 {
4918 portMemSet(&pKernelMIGGpuInstance->MIGComputeInstance[CIIdx], 0x0, sizeof(pKernelMIGGpuInstance->MIGComputeInstance[CIIdx]));
4919 portMemSet(pConfigRequestPerCi, 0x0, sizeof(*pConfigRequestPerCi) * KMIGMGR_MAX_COMPUTE_INSTANCES);
4920 // Quash status. This is best-effort cleanup
4921 (void)kmigmgrConfigureGPUInstance(pGpu, pKernelMIGManager, pKernelMIGGpuInstance->swizzId,
4922 pConfigRequestPerCi,
4923 NVBIT32(RM_ENGINE_TYPE_GR_IDX(localEngineType)));
4924
4925 goto done;
4926 }
4927
4928 //
4929 // Register the instance with the capability framework only if it is
4930 // explicitly requested. Otherwise, we rely on the persistent state.
4931 //
4932 if (bCreateCap)
4933 {
4934 // Register compute instance with the capability framework
4935 NV_ASSERT_OK_OR_GOTO(status,
4936 osRmCapRegisterSmcExecutionPartition(pKernelMIGGpuInstance->pOsRmCaps,
4937 &pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].pOsRmCaps,
4938 pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].id),
4939 cleanup_created_instances);
4940 }
4941
4942 // Allocate RsShared for the instance
4943 NV_ASSERT_OK_OR_GOTO(status,
4944 serverAllocShare(&g_resServ, classInfo(RsShared),
4945 &pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].pShare),
4946 cleanup_created_instances);
4947
4948 // Allocate subscribed handles for this instance
4949 if (!IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu))
4950 {
4951 NV_ASSERT_OK_OR_GOTO(status,
4952 kmigmgrAllocComputeInstanceHandles(pGpu, pKernelMIGManager, pKernelMIGGpuInstance, &pKernelMIGGpuInstance->MIGComputeInstance[CIIdx]),
4953 cleanup_created_instances);
4954
4955 NV_ASSERT_OK(
4956 kmigmgrEngineTypeXlate(&pComputeResourceAllocation->localEngines, RM_ENGINE_TYPE_GR(0),
4957 &pComputeResourceAllocation->engines, &globalEngineType));
4958 NV_ASSERT_OK(
4959 kmigmgrEngineTypeXlate(&pResourceAllocation->localEngines, globalEngineType,
4960 &pResourceAllocation->engines, &globalEngineType));
4961 globalGrIdx = RM_ENGINE_TYPE_GR_IDX(globalEngineType);
4962
4963 pKernelGraphics = GPU_GET_KERNEL_GRAPHICS(pGpu, globalGrIdx);
4964 fecsSetRoutingInfo(pGpu,
4965 pKernelGraphics,
4966 pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].instanceHandles.hClient,
4967 pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].instanceHandles.hSubdevice,
4968 0);
4969
4970 NV_ASSERT_OK_OR_GOTO(status,
4971 kgraphicsCreateGoldenImageChannel(pGpu, pKernelGraphics),
4972 cleanup_created_instances);
4973 }
4974
4975 status = NV_OK;
4976 goto done;
4977
4978 cleanup_created_instances:
4979 (void)kmigmgrDeleteComputeInstance(pGpu, pKernelMIGManager, pKernelMIGGpuInstance,
4980 CIIdx, NV_FALSE);
4981 done:
4982 portMemFreeStackOrHeap(pConfigRequestPerCi);
4983 portMemFree(pMIGComputeInstance);
4984 return status;
4985 }
4986
4987 // Delete created instance handles if they exist
4988 void
4989 kmigmgrFreeComputeInstanceHandles_IMPL
4990 (
4991 OBJGPU *pGpu,
4992 KernelMIGManager *pKernelMIGManager,
4993 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
4994 MIG_COMPUTE_INSTANCE *pMIGComputeInstance
4995 )
4996 {
4997 if (pMIGComputeInstance->instanceHandles.hClient != NV01_NULL_OBJECT)
4998 {
4999 RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
5000
5001 pRmApi->Free(pRmApi, pMIGComputeInstance->instanceHandles.hClient, pMIGComputeInstance->instanceHandles.hClient);
5002 pMIGComputeInstance->instanceHandles.hClient = NV01_NULL_OBJECT;
5003 pMIGComputeInstance->instanceHandles.hSubdevice = NV01_NULL_OBJECT;
5004 pMIGComputeInstance->instanceHandles.hSubscription = NV01_NULL_OBJECT;
5005 }
5006 }
5007
5008 /*!
5009 * @brief Releases the engines owned by this Compute Instance back to the
5010 * GPU Instance's resource pools.
5011 */
5012 void
5013 kmigmgrReleaseComputeInstanceEngines_IMPL
5014 (
5015 OBJGPU *pGpu,
5016 KernelMIGManager *pKernelMIGManager,
5017 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
5018 MIG_COMPUTE_INSTANCE *pMIGComputeInstance
5019 )
5020 {
5021 RM_ENGINE_TYPE globalEngineType;
5022 RM_ENGINE_TYPE localEngineType;
5023 ENGTYPE_BIT_VECTOR *pGlobalMask;
5024 ENGTYPE_BIT_VECTOR *pLocalMask;
5025
5026 NV_ASSERT_OR_RETURN_VOID(pKernelMIGGpuInstance != NULL);
5027 NV_ASSERT_OR_RETURN_VOID(pMIGComputeInstance != NULL);
5028
5029 pGlobalMask = &pKernelMIGGpuInstance->resourceAllocation.engines;
5030 pLocalMask = &pKernelMIGGpuInstance->resourceAllocation.localEngines;
5031
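//
// The global mask holds GPU-scope engine types while the local mask holds
// their GI-relative equivalents; the paired iterator yields both views of
// the same engine on each step.
//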
5032 // Iterate over both global/local masks at the same time
5033 FOR_EACH_IN_BITVECTOR_PAIR(pGlobalMask, globalEngineType, pLocalMask, localEngineType)
5034 {
5035 NvU32 CIIdx;
5036
5037 // Skip anything not owned by this compute instance
5038 if (!bitVectorTest(&pMIGComputeInstance->resourceAllocation.engines, localEngineType))
5039 continue;
5040
5041 //
5042 // Clear this engine from the exclusive ownership mask. If it was being
5043 // shared, it already isn't in the exclusive ownership mask, so doing
5044 // this for all engines in this compute instance isn't harmful.
5045 //
5046 bitVectorClr(&pKernelMIGGpuInstance->exclusiveEngMask, globalEngineType);
5047
5048 // If this engine was exclusively owned, nothing else to do
5049 if (!bitVectorTest(&pKernelMIGGpuInstance->sharedEngMask, globalEngineType))
5050 continue;
5051
5052 // Determine if any other compute instance owns this engine
5053 for (CIIdx = 0;
5054 CIIdx < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance);
5055 ++CIIdx)
5056 {
5057 if (!pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].bValid)
5058 continue;
5059
5060 if (bitVectorTest(&pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].resourceAllocation.engines,
5061 localEngineType))
5062 {
5063 break;
5064 }
5065 }
5066
5067 // If engine is still owned by someone, don't mark it unused
5068 if (CIIdx < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance))
5069 continue;
5070
5071 // mark this engine as no longer being shared by anyone
5072 bitVectorClr(&pKernelMIGGpuInstance->sharedEngMask, globalEngineType);
5073 }
5074 FOR_EACH_IN_BITVECTOR_PAIR_END();
5075 }
5076
5077 /*!
5078 * @brief Function to delete Compute Instance
5079 *
5080 * @param[IN] pGpu
5081 * @param[IN] pKernelMIGManager
5082 * @param[IN] pKernelMIGGpuInstance
5083 * @param[IN] CIID Compute Instance ID
5084 * @param[IN] bUnload NV_TRUE if called during gpu state unload path
5085 */
5086 NV_STATUS
5087 kmigmgrDeleteComputeInstance_IMPL
5088 (
5089 OBJGPU *pGpu,
5090 KernelMIGManager *pKernelMIGManager,
5091 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
5092 NvU32 CIID,
5093 NvBool bUnload
5094 )
5095 {
5096 MIG_COMPUTE_INSTANCE *pMIGComputeInstance;
5097 MIG_RESOURCE_ALLOCATION *pComputeResourceAllocation;
5098 ENGTYPE_BIT_VECTOR grEngines;
5099 NvU32 swizzId;
5100 KMIGMGR_CONFIGURE_INSTANCE_REQUEST *pConfigRequestPerCi;
5101 NvU32 updateEngMask;
5102 NV_STATUS status = NV_OK;
5103
5104 NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance != NULL, NV_ERR_INVALID_ARGUMENT);
5105 NV_ASSERT_OR_RETURN(CIID < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance),
5106 NV_ERR_INVALID_ARGUMENT);
5107
5108 // Make sure that the targeted compute instance is still valid
5109 NV_CHECK_OR_RETURN(LEVEL_SILENT,
5110 pKernelMIGGpuInstance->MIGComputeInstance[CIID].bValid,
5111 NV_WARN_NOTHING_TO_DO);
5112
5113 pMIGComputeInstance = &pKernelMIGGpuInstance->MIGComputeInstance[CIID];
5114 pComputeResourceAllocation = &pMIGComputeInstance->resourceAllocation;
5115
5116 //
5117 // The initial refCount is set to "1" when the instance is created, and
5118 // every subscription by a client increases the refcount further
5119 //
5120 if ((pMIGComputeInstance->pShare != NULL) &&
5121 (serverGetShareRefCount(&g_resServ, pMIGComputeInstance->pShare) > 2))
5122 {
5123 NV_PRINTF(LEVEL_ERROR,
5124 "Compute Instance with id - %d still in use by other clients\n",
5125 CIID);
5126
5127 return NV_ERR_STATE_IN_USE;
5128 }
5129
5130 if (!bUnload)
5131 {
5132 //
5133 // Unregister instance from the capability framework only if
5134 // it is explicitly destroyed i.e. not during GPU state unload path.
5135 //
5136 // Note that the saved instance persistent state will be freed by
5137 // _gpumgrUnregisterRmCapsForMIGCI during driver unload.
5138 //
5139 osRmCapUnregister(&pMIGComputeInstance->pOsRmCaps);
5140 }
5141
5142 // Deconfigure the GR engine for this compute instance
5143 swizzId = pKernelMIGGpuInstance->swizzId;
5144
5145 pConfigRequestPerCi = portMemAllocStackOrHeap(sizeof(*pConfigRequestPerCi) * KMIGMGR_MAX_COMPUTE_INSTANCES);
5146 NV_ASSERT_OR_RETURN(pConfigRequestPerCi != NULL, NV_ERR_NO_MEMORY);
5147
5148 portMemSet(pConfigRequestPerCi, 0x0, sizeof(*pConfigRequestPerCi) * KMIGMGR_MAX_COMPUTE_INSTANCES);
5149
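//
// Isolate this CI's GR engine by intersecting the GR range with the CI's
// engine mask; submitting the all-zero config request for that GR below
// deconfigures it.
//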
5150 bitVectorClrAll(&grEngines);
5151 bitVectorSetRange(&grEngines, RM_ENGINE_RANGE_GR());
5152 bitVectorAnd(&grEngines, &grEngines, &pComputeResourceAllocation->engines);
5153 NV_ASSERT_OR_ELSE(!bitVectorTestAllCleared(&grEngines), status = NV_ERR_INVALID_STATE; goto done;);
5154 updateEngMask = NVBIT32(RM_ENGINE_TYPE_GR_IDX(bitVectorCountTrailingZeros(&grEngines)));
5155 NV_ASSERT_OK_OR_GOTO(status,
5156 kmigmgrConfigureGPUInstance(pGpu, pKernelMIGManager, swizzId, pConfigRequestPerCi, updateEngMask),
5157 done);
5158
5159 {
5160 RM_ENGINE_TYPE globalRmEngType;
5161 MIG_INSTANCE_REF ref = kmigmgrMakeCIReference(pKernelMIGGpuInstance, pMIGComputeInstance);
5162 NV_ASSERT_OK_OR_GOTO(status,
5163 kmigmgrGetLocalToGlobalEngineType(pGpu, pKernelMIGManager, ref,
5164 RM_ENGINE_TYPE_GR(0),
5165 &globalRmEngType),
5166 done);
5167
5168 // Free up the internal handles for this compute instance
5169 kmigmgrFreeComputeInstanceHandles(pGpu, pKernelMIGManager, pKernelMIGGpuInstance, pMIGComputeInstance);
5170
5171 fecsSetRoutingInfo(pGpu,
5172 GPU_GET_KERNEL_GRAPHICS(pGpu, RM_ENGINE_TYPE_GR_IDX(globalRmEngType)),
5173 pKernelMIGGpuInstance->instanceHandles.hClient,
5174 pKernelMIGGpuInstance->instanceHandles.hSubdevice,
5175 RM_ENGINE_TYPE_GR_IDX(bitVectorCountTrailingZeros(&grEngines)));
5176
5177 if (pMIGComputeInstance->pShare != NULL)
5178 {
5179 serverFreeShare(&g_resServ, pMIGComputeInstance->pShare);
5180 pMIGComputeInstance->pShare = NULL;
5181 }
5182 }
5183
5184 // Mark this compute instance as invalid
5185 pMIGComputeInstance->bValid = NV_FALSE;
5186
5187 // Release this compute instance's engines
5188 kmigmgrReleaseComputeInstanceEngines(pGpu, pKernelMIGManager, pKernelMIGGpuInstance, pMIGComputeInstance);
5189
5190 // Now that we no longer need it, clear the shared engine flag
5191 pMIGComputeInstance->sharedEngFlag = 0x0;
5192 pMIGComputeInstance->id = KMIGMGR_COMPUTE_INSTANCE_ID_INVALID;
5193
5194 pMIGComputeInstance->pOsRmCaps = NULL;
5195
5196 done:
5197 portMemFreeStackOrHeap(pConfigRequestPerCi);
5198
5199 return status;
5200 }
5201
5202 /*!
5203 * @brief print out the CI configuration of this GI
5204 */
5205 static void
5206 _kmigmgrPrintComputeInstances
5207 (
5208 OBJGPU *pGpu,
5209 KernelMIGManager *pKernelMIGManager,
5210 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
5211 )
5212 {
5213 #if NV_PRINTF_LEVEL_ENABLED(LEVEL_INFO)
5214 #define PADDING_STR "----------------------------------------------------"
5215 RM_ENGINE_TYPE rmEngineType;
5216 NvU32 CIIdx;
5217
5218 NV_PRINTF(LEVEL_INFO, "\n");
5219 NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
5220 NV_PRINTF(LEVEL_INFO, "| %14s | %14s | %14s |\n",
5221 "SwizzId",
5222 "GR Count",
5223 "Gpc Count");
5224 NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
5225 NV_PRINTF(LEVEL_INFO, "| %14d | %14d | %14d |\n",
5226 pKernelMIGGpuInstance->swizzId,
5227 kmigmgrCountEnginesOfType(&pKernelMIGGpuInstance->resourceAllocation.engines, RM_ENGINE_TYPE_GR(0)),
5228 pKernelMIGGpuInstance->resourceAllocation.gpcCount);
5229
5230 for (CIIdx = 0;
5231 CIIdx < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance);
5232 ++CIIdx)
5233 {
5234 MIG_RESOURCE_ALLOCATION *pComputeResourceAllocation;
5235
5236 if (!pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].bValid)
5237 {
5238 continue;
5239 }
5240
5241 pComputeResourceAllocation = &pKernelMIGGpuInstance->MIGComputeInstance[CIIdx].resourceAllocation;
5242
5243 NV_ASSERT_OK(
5244 kmigmgrEngineTypeXlate(&pComputeResourceAllocation->localEngines, RM_ENGINE_TYPE_GR(0),
5245 &pComputeResourceAllocation->engines, &rmEngineType));
5246
5247 NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
5248 if (IS_GSP_CLIENT(pGpu))
5249 {
5250 NvU32 gpcIdx;
5251 NvU32 gpcMask = 0x0;
5252
5253 for (gpcIdx = 0; gpcIdx < pComputeResourceAllocation->gpcCount; ++gpcIdx)
5254 {
5255 gpcMask |= NVBIT32(pComputeResourceAllocation->gpcIds[gpcIdx]);
5256 }
5257 NV_PRINTF(LEVEL_INFO, "| %23s | %23s |\n",
5258 "Gr Engine IDX",
5259 "GPC Mask");
5260 NV_PRINTF(LEVEL_INFO, "| %23d | %23X |\n",
5261 RM_ENGINE_TYPE_GR_IDX(rmEngineType),
5262 gpcMask);
5263 }
5264 else
5265 {
5266 // gpcMask is not meaningful in VGPU, thus only printing gpcCount
5267 NV_PRINTF(LEVEL_INFO, "| %23s | %23s |\n",
5268 "Gr Engine IDX",
5269 "GPC Count");
5270 NV_PRINTF(LEVEL_INFO, "| %23d | %23X |\n",
5271 RM_ENGINE_TYPE_GR_IDX(rmEngineType),
5272 pComputeResourceAllocation->gpcCount);
5273 }
5274 }
5275 NV_PRINTF(LEVEL_INFO, "%s\n", PADDING_STR);
5276
5277 #undef PADDING_STR
5278 #endif // NV_PRINTF_LEVEL_ENABLED(LEVEL_INFO)
5279 }
5280
5281 /*!
5282 * @brief Function to configure a specific GPU instance by setting available
5283 * GPCs with requested GR Engines
5284 *
5285 * @param[IN] pGpu
5286 * @param[IN] pKernelMIGManager
5287 * @param[IN] swizzId SwizzId for this GPU instance
5288 * @param[IN] pConfigRequestsPerCi Requested per-CI profile/CTS configuration
5289 * for every GR engine in this instance
5290 * @param[IN] updateEngMask Entry valid flag for each engine in instance
5291 *
5292 * @return Returns NV_STATUS
5293 * NV_OK
5294 * NV_ERR_INVALID_ARGUMENT
5295 * NV_WARN_NOTHING_TO_DO
5296 * NV_ERR_INSUFFICIENT_RESOURCES
5297 */
5298 NV_STATUS
5299 kmigmgrConfigureGPUInstance_IMPL
5300 (
5301 OBJGPU *pGpu,
5302 KernelMIGManager *pKernelMIGManager,
5303 NvU32 swizzId,
5304 const KMIGMGR_CONFIGURE_INSTANCE_REQUEST *pConfigRequestsPerCi,
5305 NvU32 updateEngMask
5306 )
5307 {
5308 KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
5309 NV_STATUS status = NV_OK;
5310 NvU32 i;
5311 NvU32 j;
5312 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance = NULL;
5313 NvBool bAssigning;
5314 RM_ENGINE_TYPE checkGrs[RM_ENGINE_TYPE_GR_SIZE];
5315 NvU32 checkGrCount = 0;
5316 RM_ENGINE_TYPE rmEngineType;
5317 KernelGraphicsManager *pKernelGraphicsManager = GPU_GET_KERNEL_GRAPHICS_MANAGER(pGpu);
5318 NvBool bIsCTSRequired = kmigmgrIsCTSAlignmentRequired_HAL(pGpu, pKernelMIGManager);
5319 NvU32 localIdx;
5320
5321 // Sanity check the GPU instance requested to be configured
5322 if (!kmigmgrIsSwizzIdInUse(pGpu, pKernelMIGManager, swizzId))
5323 {
5324 NV_PRINTF(LEVEL_ERROR, "Invalid swizzId - %d.\n", swizzId);
5325 return NV_ERR_INVALID_ARGUMENT;
5326 }
5327
5328 status = kmigmgrGetGPUInstanceInfo(pGpu, pKernelMIGManager, swizzId, &pKernelMIGGpuInstance);
5329 NV_CHECK_OR_RETURN(LEVEL_SILENT, status == NV_OK, status);
5330
5331 bAssigning = NV_FALSE;
5332 portMemSet(checkGrs, 0, sizeof(checkGrs));
5333
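//
// Walk the GI's GR engines with two indices: 'i' counts every GR engine so
// it lines up with bits in updateEngMask, while 'localIdx' advances only
// for engines selected by the mask and indexes pConfigRequestsPerCi.
//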
5334 i = 0;
5335 localIdx = 0;
5336 FOR_EACH_IN_BITVECTOR(&pKernelMIGGpuInstance->resourceAllocation.engines, rmEngineType)
5337 {
5338 NvU32 engineIdx;
5339 if (!RM_ENGINE_TYPE_IS_GR(rmEngineType))
5340 continue;
5341
5342 engineIdx = RM_ENGINE_TYPE_GR_IDX(rmEngineType);
5343
5344 // Skip over invalid entries
5345 if (!(updateEngMask & NVBIT32(i)))
5346 {
5347 i++;
5348 continue;
5349 }
5350
5351 // Resource checks are verified by CTS ID assignment when required, else use GPC count
5352 if (bIsCTSRequired)
5353 {
5354 NV_CHECK_OR_RETURN(LEVEL_ERROR,
5355 pConfigRequestsPerCi[localIdx].ctsId != KMIGMGR_CTSID_INVALID,
5356 NV_ERR_INSUFFICIENT_RESOURCES);
5357 }
5358 else
5359 {
5360 // Make sure no requested GPC count is greater than instance GPC count
5361 if (pConfigRequestsPerCi[localIdx].profile.gpcCount > pKernelMIGGpuInstance->resourceAllocation.gpcCount)
5362 {
5363 NV_PRINTF(LEVEL_ERROR,
5364 "Invalid GPC count - %d requested for GrIdx - %d.\n",
5365 pConfigRequestsPerCi[localIdx].profile.gpcCount,
5366 engineIdx);
5367 return NV_ERR_INVALID_ARGUMENT;
5368 }
5369 }
5370
5371 bAssigning = bAssigning || pConfigRequestsPerCi[localIdx].profile.gpcCount > 0;
5372 checkGrs[checkGrCount++] = rmEngineType;
5373
5374 localIdx++;
5375 i++;
5376 }
5377 FOR_EACH_IN_BITVECTOR_END();
5378
5379 //
5380 // Return an error if there are any channels on any engines targeted by this
5381 // request
5382 //
5383 NV_CHECK_OR_RETURN(LEVEL_SILENT,
5384 !kfifoEngineListHasChannel(pGpu, pKernelFifo, checkGrs, checkGrCount),
5385 NV_ERR_STATE_IN_USE);
5386
5387 if (!bAssigning)
5388 {
5389 // Invalidate targeted engines
5390 i = 0;
5391 FOR_EACH_IN_BITVECTOR(&pKernelMIGGpuInstance->resourceAllocation.engines, rmEngineType)
5392 {
5393 NvU32 engineIdx;
5394
5395 if (!RM_ENGINE_TYPE_IS_GR(rmEngineType))
5396 continue;
5397
5398 engineIdx = RM_ENGINE_TYPE_GR_IDX(rmEngineType);
5399
5400 if (updateEngMask & NVBIT32(i))
5401 {
5402 NV_ASSERT_OK_OR_RETURN(
5403 kmigmgrInvalidateGr(pGpu, pKernelMIGManager, pKernelMIGGpuInstance, engineIdx));
5404 }
5405
5406 i++;
5407 }
5408 FOR_EACH_IN_BITVECTOR_END();
5409
5410 return NV_OK;
5411 }
5412
5413 //
5414 // Client passes the logical GR-IDs while RM works with physical GR-IDs
5415 // Walk the list of physical GRs associated with this GPU instance and then
5416 // set GPCs as requested
5417 //
5418 i = 0;
5419 localIdx = 0;
5420 FOR_EACH_IN_BITVECTOR(&pKernelMIGGpuInstance->resourceAllocation.engines, rmEngineType)
5421 {
5422 NvU32 engineIdx;
5423 NvU32 gpcCount = pConfigRequestsPerCi[localIdx].profile.gpcCount;
5424
5425 if (!RM_ENGINE_TYPE_IS_GR(rmEngineType))
5426 continue;
5427
5428 engineIdx = RM_ENGINE_TYPE_GR_IDX(rmEngineType);
5429
5430 if (!(updateEngMask & NVBIT32(i)))
5431 {
5432 i++;
5433 continue;
5434 }
5435
5436 if (gpcCount == 0)
5437 {
5438 localIdx++;
5439 i++;
5440 continue;
5441 }
5442
5443 // Update the GR to VEID mapping
5444 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
5445 kgrmgrAllocVeidsForGrIdx(pGpu,
5446 pKernelGraphicsManager,
5447 engineIdx,
5448 pConfigRequestsPerCi[localIdx].veidSpanStart,
5449 pConfigRequestsPerCi[localIdx].profile.veidCount,
5450 pKernelMIGGpuInstance),
5451 cleanup);
5452
5453 localIdx++;
i++;
5454 }
5455 FOR_EACH_IN_BITVECTOR_END();
5456
5457 _kmigmgrPrintComputeInstances(pGpu, pKernelMIGManager, pKernelMIGGpuInstance);
5458
5459 i = 0;
5460 localIdx = 0;
5461 FOR_EACH_IN_BITVECTOR(&pKernelMIGGpuInstance->resourceAllocation.engines, rmEngineType)
5462 {
5463 NvU32 engineIdx;
5464 NvU32 gpcCount = pConfigRequestsPerCi[localIdx].profile.gpcCount;
5465 KernelGraphics *pKGr;
5466
5467 if (!RM_ENGINE_TYPE_IS_GR(rmEngineType))
5468 continue;
5469
5470 engineIdx = RM_ENGINE_TYPE_GR_IDX(rmEngineType);
5471
5472 if (!(updateEngMask & NVBIT32(i)))
5473 {
5474 i++;
5475 continue;
5476 }
5477
5478 if (gpcCount == 0)
5479 {
5480 localIdx++;
5481 i++;
5482 continue;
5483 }
5484
5485 if (bIsCTSRequired)
5486 kmigmgrSetCTSIdInUse(pKernelMIGGpuInstance, pConfigRequestsPerCi[localIdx].ctsId, engineIdx, NV_TRUE);
5487
5488 pKGr = GPU_GET_KERNEL_GRAPHICS(pGpu, engineIdx);
5489 // Re-pull public static data for kernel graphics
5490 status = kgraphicsLoadStaticInfo_HAL(pGpu, pKGr, pKernelMIGGpuInstance->swizzId);
5491 if (status != NV_OK)
5492 goto cleanup;
5493
5494 // record sizes of local GR ctx buffers for this GR
5495 status = kgrmgrDiscoverMaxLocalCtxBufInfo(pGpu, pKernelGraphicsManager, pKGr, swizzId);
5496 if (status != NV_OK)
5497 goto cleanup;
5498
5499 localIdx++;
i++;
5500 }
5501 FOR_EACH_IN_BITVECTOR_END();
5502
5503 return status;
5504
5505 cleanup:
5506
5507 j = 0;
5508 FOR_EACH_IN_BITVECTOR(&pKernelMIGGpuInstance->resourceAllocation.engines, rmEngineType)
5509 {
5510 NvU32 engineIdx;
5511
5512 // Rollback all previous validations
5513 if (j == i)
5514 break;
5515
5516 if (!RM_ENGINE_TYPE_IS_GR(rmEngineType))
5517 continue;
5518
5519 engineIdx = RM_ENGINE_TYPE_GR_IDX(rmEngineType);
5520
5521 if (updateEngMask & NVBIT32(j))
5522 {
5523 NV_PRINTF(LEVEL_ERROR,
5524 "Failed to configure GPU instance. Invalidating GRID - %d\n",
5525 engineIdx);
5526
5527 // Invalidate assignments to this GR, clear global state
5528 kmigmgrInvalidateGr(pGpu, pKernelMIGManager, pKernelMIGGpuInstance, engineIdx);
5529 }
5530
5531 j++;
5532 }
5533 FOR_EACH_IN_BITVECTOR_END();
5534
5535 return status;
5536 }
5537
5538 // invalidate GR to GPC mappings
5539 NV_STATUS
5540 kmigmgrInvalidateGrGpcMapping_IMPL
5541 (
5542 OBJGPU *pGpu,
5543 KernelMIGManager *pKernelMIGManager,
5544 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
5545 NvU32 grIdx
5546 )
5547 {
5548 NV_STATUS status = NV_OK;
5549 NvU32 gfid;
5550 NvBool bCallingContextPlugin;
5551 KernelGraphics *pKernelGraphics;
5552
5553 NV_ASSERT_OK_OR_RETURN(vgpuGetCallingContextGfid(pGpu, &gfid));
5554 NV_ASSERT_OK_OR_RETURN(vgpuIsCallingContextPlugin(pGpu, &bCallingContextPlugin));
5555 if (bCallingContextPlugin)
5556 {
5557 gfid = GPU_GFID_PF;
5558 }
5559
5560 // Release CTS-ID fields
5561 if (kmigmgrIsCTSAlignmentRequired_HAL(pGpu, pKernelMIGManager))
5562 kmigmgrSetCTSIdInUse(pKernelMIGGpuInstance, KMIGMGR_CTSID_INVALID, grIdx, NV_FALSE);
5563
5564 // Free global ctx buffers, this will need to be regenerated
5565 pKernelGraphics = GPU_GET_KERNEL_GRAPHICS(pGpu, grIdx);
5566 fecsBufferTeardown(pGpu, pKernelGraphics);
5567 kgraphicsFreeGlobalCtxBuffers(pGpu, pKernelGraphics, gfid);
5568
5569 // clear cached ctx buf sizes
5570 kgraphicsClearCtxBufferInfo(pGpu, pKernelGraphics);
5571
5572 return status;
5573 }
5574
5575 // invalidate a GR engine
5576 NV_STATUS
5577 kmigmgrInvalidateGr_IMPL
5578 (
5579 OBJGPU *pGpu,
5580 KernelMIGManager *pKernelMIGManager,
5581 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
5582 NvU32 grIdx
5583 )
5584 {
5585 KernelGraphics *pKGr = GPU_GET_KERNEL_GRAPHICS(pGpu, grIdx);
5586 KernelGraphicsManager *pKernelGraphicsManager = GPU_GET_KERNEL_GRAPHICS_MANAGER(pGpu);
5587
5588 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
5589 kmigmgrInvalidateGrGpcMapping(pGpu, pKernelMIGManager, pKernelMIGGpuInstance, grIdx));
5590
5591 kgrmgrClearVeidsForGrIdx(pGpu, pKernelGraphicsManager, grIdx);
5592
5593 kgraphicsInvalidateStaticInfo(pGpu, pKGr);
5594 return NV_OK;
5595 }
5596
5597 /*!
5598 * @brief Function to invalidate a gpu instance
5599 *
5600 * @param[IN] pGpu
5601 * @param[IN] pKernelMIGManager
5602 * @param[IN] swizzId swizzId which is getting invalidated
5603 * @param[IN] bUnload NV_TRUE if called from gpu state unload path
5604 *
5605 * @return Returns NV_STATUS
5606 * NV_OK
5607 * NV_ERR_INVALID_ARGUMENT No GPC associated with Gr
5608 */
5609 NV_STATUS
5610 kmigmgrInvalidateGPUInstance_IMPL
5611 (
5612 OBJGPU *pGpu,
5613 KernelMIGManager *pKernelMIGManager,
5614 NvU32 swizzId,
5615 NvBool bUnload
5616 )
5617 {
5618 NV_STATUS rmStatus = NV_OK;
5619 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
5620 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance = NULL;
5621 NvU32 i;
5622 RM_ENGINE_TYPE rmEngineType;
5623 KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
5624
5625 // Sanity checks
5626 rmStatus = kmigmgrGetGPUInstanceInfo(pGpu, pKernelMIGManager, swizzId, &pKernelMIGGpuInstance);
5627 if (rmStatus != NV_OK)
5628 {
5629 // Didn't find requested gpu instance
5630 NV_PRINTF(LEVEL_ERROR, "No valid gpu instance with SwizzId - %d found\n",
5631 swizzId);
5632 return rmStatus;
5633 }
5634
5635 // Make sure that no client is using this gpu instance
5636 if (!kmigmgrIsGPUInstanceReadyToBeDestroyed(pKernelMIGGpuInstance))
5637 {
5638 NV_PRINTF(LEVEL_ERROR,
5639 "Gpu instance with SwizzId - %d still in use by other clients\n",
5640 swizzId);
5641
5642 kmigmgrPrintSubscribingClients(pGpu, pKernelMIGManager, swizzId);
5643 return NV_ERR_STATE_IN_USE;
5644 }
5645
5646 for (i = 0; i < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance); ++i)
5647 {
5648 if (pKernelMIGGpuInstance->MIGComputeInstance[i].bValid)
5649 {
5650 NV_PRINTF(LEVEL_ERROR,
5651 "Cannot destroy gpu instance %u with valid compute instance %d \n",
5652 swizzId, i);
5653
5654 return NV_ERR_STATE_IN_USE;
5655 }
5656 }
5657
5658 NV_PRINTF(LEVEL_INFO, "FREEING GPU INSTANCE\n");
5659 kmigmgrPrintGPUInstanceInfo(pGpu, pKernelMIGManager, pKernelMIGGpuInstance);
5660
5661 if (!bUnload)
5662 {
5663 //
5664 // Unregister gpu instance from the capability framework only if
5665 // it is explicitly destroyed i.e. not during GPU state unload path.
5666 //
5667 // Note that the saved gpu instance persistent state will be freed by
5668 // _gpumgrUnregisterRmCapsForSmcPartitions during driver unload.
5669 //
5670 osRmCapUnregister(&pKernelMIGGpuInstance->pOsRmCaps);
5671 }
5672
5673 // Remove GR->GPC mappings in GPU instance Info
5674 FOR_EACH_IN_BITVECTOR(&pKernelMIGGpuInstance->resourceAllocation.engines, rmEngineType)
5675 {
5676 NvU32 engineIdx;
5677 KernelGraphics *pKernelGraphics;
5678
5679 if (!RM_ENGINE_TYPE_IS_GR(rmEngineType))
5680 continue;
5681
5682 engineIdx = RM_ENGINE_TYPE_GR_IDX(rmEngineType);
5683
5684 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
5685 kmigmgrInvalidateGr(pGpu, pKernelMIGManager, pKernelMIGGpuInstance, engineIdx));
5686
5687 pKernelGraphics = GPU_GET_KERNEL_GRAPHICS(pGpu, engineIdx);
5688 fecsClearRoutingInfo(pGpu, pKernelGraphics);
5689 }
5690 FOR_EACH_IN_BITVECTOR_END();
5691
5692 // Delete client handle after all GR's are invalidated
5693 kmigmgrFreeGPUInstanceHandles(pKernelMIGGpuInstance);
5694
5695 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
5696 kmigmgrClearEnginesInUse(pGpu, pKernelMIGManager, &pKernelMIGGpuInstance->resourceAllocation.engines));
5697
5698 // Destroy runlist buffer pools
5699 kmigmgrDestroyGPUInstanceGrBufPools(pGpu, pKernelMIGManager, pKernelMIGGpuInstance);
5700
5701 if (kmigmgrIsSwizzIdInUse(pGpu, pKernelMIGManager, swizzId))
5702 {
5703 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
5704 kmigmgrClearSwizzIdInUse(pGpu, pKernelMIGManager, swizzId));
5705 }
5706
5707 // Sanity check that requested swizzID is not set in swizzIdMask
5708 NV_ASSERT_OR_ELSE(!(NVBIT64(swizzId) & pKernelMIGManager->swizzIdInUseMask), rmStatus = NV_ERR_INVALID_STATE);
5709
5710 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
5711 kmemsysInitMIGMemoryPartitionTable_HAL(pGpu, pKernelMemorySystem));
5712
5713 // Destroy gpu instance scrubber
5714 kmigmgrDestroyGPUInstanceScrubber(pGpu, pKernelMIGManager, pKernelMIGGpuInstance);
5715
5716 // Destroy gpu instance pool for page table mem
5717 kmigmgrDestroyGPUInstancePool(pGpu, pKernelMIGManager, pKernelMIGGpuInstance);
5718
5719 // Delete gpu instance engine runlists
5720 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
5721 kmigmgrDeleteGPUInstanceRunlists_HAL(pGpu, pKernelMIGManager, pKernelMIGGpuInstance));
5722
5723 // Destroy runlist buffer pools
5724 kmigmgrDestroyGPUInstanceRunlistBufPools(pGpu, pKernelMIGManager, pKernelMIGGpuInstance);
5725
5726 // Free gpu instance memory
5727 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
5728 memmgrFreeMIGGPUInstanceMemory(pGpu, pMemoryManager, swizzId, pKernelMIGGpuInstance->hMemory, &pKernelMIGGpuInstance->pMemoryPartitionHeap));
5729
5730 if (pKernelMIGGpuInstance->pShare != NULL)
5731 {
5732 serverFreeShare(&g_resServ, pKernelMIGGpuInstance->pShare);
5733 pKernelMIGGpuInstance->pShare = NULL;
5734 }
5735
5736 // Initialize gpu instance info to initial value
5737 kmigmgrInitGPUInstanceInfo(pGpu, pKernelMIGManager, pKernelMIGGpuInstance);
5738
5739 //
5740 // Only partitions in which VGPU guests are booted require changing
5741 // engine interrupt vectors to deterministic values for migration.
5742 //
5743 if (IS_GSP_CLIENT(pGpu) && gpuIsSriovEnabled(pGpu))
5744 {
5745 Intr *pIntr = GPU_GET_INTR(pGpu);
5746
5747 //
5748 // When running in GSP offload mode, KernelRM must re-fetch the
5749 // interrupt table on every change to the MIG partitioning layout.
5750 //
5751 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
5752 intrStateUnload_HAL(pGpu, pIntr, GPU_STATE_FLAGS_PRESERVING));
5753
5754 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
5755 intrInitInterruptTable_HAL(pGpu, pIntr));
5756
5757 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
5758 intrStateLoad_HAL(pGpu, pIntr, GPU_STATE_FLAGS_PRESERVING));
5759 }
5760
5761 return rmStatus;
5762 }
5763
5764 /*!
5765 * @brief Init gpu instance scrubber
5766 */
5767 NV_STATUS
5768 kmigmgrInitGPUInstanceScrubber_IMPL
5769 (
5770 OBJGPU *pGpu,
5771 KernelMIGManager *pKernelMIGManager,
5772 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
5773 )
5774 {
5775 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
5776
5777 if (!IsSLIEnabled(pGpu) &&
5778 memmgrIsScrubOnFreeEnabled(pMemoryManager) &&
5779 memmgrIsPmaInitialized(pMemoryManager))
5780 {
5781 NV_ASSERT_OK_OR_RETURN(scrubberConstruct(pGpu, pKernelMIGGpuInstance->pMemoryPartitionHeap));
5782 pKernelMIGGpuInstance->bMemoryPartitionScrubberInitialized = NV_TRUE;
5783 }
5784
5785 return NV_OK;
5786 }
5787
5788 /*!
5789 * @brief Destroy gpu instance scrubber
5790 */
5791 void
5792 kmigmgrDestroyGPUInstanceScrubber_IMPL
5793 (
5794 OBJGPU *pGpu,
5795 KernelMIGManager *pKernelMIGManager,
5796 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
5797 )
5798 {
5799 OBJMEMSCRUB *pMemscrub = NULL;
5800 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
5801
5802 if (!pKernelMIGGpuInstance->bMemoryPartitionScrubberInitialized)
5803 return;
5804
5805 if (!IsSLIEnabled(pGpu) &&
5806 memmgrIsScrubOnFreeEnabled(pMemoryManager) &&
5807 memmgrIsPmaInitialized(pMemoryManager))
5808 {
5809 pMemscrub = pKernelMIGGpuInstance->pMemoryPartitionHeap->pmaObject.pScrubObj;
5810 scrubberDestruct(pGpu, pKernelMIGGpuInstance->pMemoryPartitionHeap, pMemscrub);
5811 pKernelMIGGpuInstance->bMemoryPartitionScrubberInitialized = NV_FALSE;
5812 }
5813 }
5814
5815 /*!
5816 * @brief Releases GR buffer memory back from global buffer pools and destroys
5817 * these pools for all GR engines that belong to this gpu instance.
5818 */
5819 void
5820 kmigmgrDestroyGPUInstanceGrBufPools_IMPL
5821 (
5822 OBJGPU *pGpu,
5823 KernelMIGManager *pKernelMIGManager,
5824 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
5825 )
5826 {
5827 RM_ENGINE_TYPE rmEngineType;
5828
5829 if (!ctxBufPoolIsSupported(pGpu))
5830 return;
5831
5832 NV_ASSERT(pKernelMIGGpuInstance != NULL);
5833
5834 FOR_EACH_IN_BITVECTOR(&pKernelMIGGpuInstance->resourceAllocation.engines, rmEngineType)
5835 {
5836 NvU32 engineIdx;
5837 KernelGraphics *pKernelGraphics;
5838
5839 if (!RM_ENGINE_TYPE_IS_GR(rmEngineType))
5840 continue;
5841
5842 engineIdx = RM_ENGINE_TYPE_GR_IDX(rmEngineType);
5843 pKernelGraphics = GPU_GET_KERNEL_GRAPHICS(pGpu, engineIdx);
5844
5845 kgraphicsDestroyCtxBufPool(pGpu, pKernelGraphics);
5846 }
5847 FOR_EACH_IN_BITVECTOR_END();
5848 }
5849
5850 /*!
5851 * @brief Destroy per-gpu instance memory pool for client page tables
5852 */
5853 void
5854 kmigmgrDestroyGPUInstancePool_IMPL
5855 (
5856 OBJGPU *pGpu,
5857 KernelMIGManager *pKernelMIGManager,
5858 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
5859 )
5860 {
5861 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
5862
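//
// In configurations where the pool should not exist, assert that it is
// NULL; if the assert fires, fall through and destroy the stale pool
// anyway so it cannot leak.
//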
5863 if (!memmgrIsPmaInitialized(pMemoryManager) ||
5864 !memmgrAreClientPageTablesPmaManaged(pMemoryManager))
5865 {
5866 NV_ASSERT_OR_GOTO((pKernelMIGGpuInstance->pPageTableMemPool == NULL), destroy_pool);
5867 return;
5868 }
5869
5870 if (!kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, pKernelMIGGpuInstance->swizzId))
5871 {
5872 NV_ASSERT_OR_GOTO((pKernelMIGGpuInstance->pPageTableMemPool == NULL), destroy_pool);
5873 return;
5874 }
5875
5876 if (pKernelMIGGpuInstance->pPageTableMemPool == NULL)
5877 {
5878 NV_PRINTF(LEVEL_INFO, "page table memory pool not setup\n");
5879 return;
5880 }
5881
5882 destroy_pool:
5883 rmMemPoolDestroy(pKernelMIGGpuInstance->pPageTableMemPool);
5884 pKernelMIGGpuInstance->pPageTableMemPool = NULL;
5885 }
5886
5887 /*!
5888 * @brief Releases runlist buffer memory back from runlist buffer pools and destroys the
5889 * runlist buffer pools for engines that belong to this gpu instance.
5890 */
5891 void
5892 kmigmgrDestroyGPUInstanceRunlistBufPools_IMPL
5893 (
5894 OBJGPU *pGpu,
5895 KernelMIGManager *pKernelMIGManager,
5896 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
5897 )
5898 {
5899 RM_ENGINE_TYPE rmEngineType;
5900 KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
5901
5902 if (!kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, pKernelMIGGpuInstance->swizzId))
5903 return;
5904
5905 if (!ctxBufPoolIsSupported(pGpu))
5906 return;
5907
5908 for (rmEngineType = 0; rmEngineType < RM_ENGINE_TYPE_LAST; rmEngineType++)
5909 {
5910 if (!RM_ENGINE_TYPE_IS_VALID(rmEngineType) ||
5911 !kmigmgrIsEnginePartitionable(pGpu, pKernelMIGManager, rmEngineType) ||
5912 !kmigmgrIsEngineInInstance(pGpu, pKernelMIGManager, rmEngineType, kmigmgrMakeGIReference(pKernelMIGGpuInstance)))
5913 {
5914 continue;
5915 }
5916
5917 if (pKernelFifo->pRunlistBufPool[rmEngineType] != NULL)
5918 {
5919 ctxBufPoolRelease(pKernelFifo->pRunlistBufPool[rmEngineType]);
5920 ctxBufPoolDestroy(&pKernelFifo->pRunlistBufPool[rmEngineType]);
5921 }
5922 }
5923 }
5924
5925 /*!
5926 * @brief Print out clients subscribing to specified gpu instance
5927 */
5928 void
5929 kmigmgrPrintSubscribingClients_IMPL
5930 (
5931 OBJGPU *pGpu,
5932 KernelMIGManager *pKernelMIGManager,
5933 NvU32 swizzId
5934 )
5935 {
5936 RmClient **ppClient;
5937 for (ppClient = serverutilGetFirstClientUnderLock();
5938 ppClient != NULL;
5939 ppClient = serverutilGetNextClientUnderLock(ppClient))
5940 {
5941 RmClient *pClient = *ppClient;
5942 RsClient *pRsClient = staticCast(pClient, RsClient);
5943 MIG_INSTANCE_REF ref;
5944 RS_PRIV_LEVEL privLevel = rmclientGetCachedPrivilege(pClient);
5945 RS_ITERATOR it = clientRefIter(pRsClient, NULL, classId(Device), RS_ITERATE_CHILDREN, NV_TRUE);
5946
5947 while (clientRefIterNext(pRsClient, &it))
5948 {
5949 NV_STATUS status;
5950 Device *pDevice = dynamicCast(it.pResourceRef->pResource, Device);
5951
5952 if (pDevice == NULL)
5953 continue;
5954
5955 status = kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager,
5956 pDevice, &ref);
5957 if (status != NV_OK)
5958 continue;
5959
5960 if (ref.pKernelMIGGpuInstance->swizzId != swizzId)
5961 continue;
5962
5963 (void)privLevel;
5964 NV_PRINTF(LEVEL_INFO, "%s client %x device %x currently subscribed to swizzId %u\n",
5965 (privLevel >= RS_PRIV_LEVEL_KERNEL) ? "Kernel" : "Usermode",
5966 pRsClient->hClient, RES_GET_HANDLE(pDevice), swizzId);
5967 }
5968 }
5969 }
5970
5971 /*!
5972 * @brief Function to enable/disable MIG mode
5973 *
5974 * @param[IN] pGpu
5975 * @param[IN] pKernelMIGManager
5976 * @param[IN] bMemoryPartitioningNeeded Is Memory partitioning required?
5977 * @param[IN] bEnable Enable/Disable MIG
5978 * @param[IN] bUnload RM unload path
5979 *
5980 * @return Returns NV_STATUS
5981 * NV_OK
5982 * NV_WARN_NOTHING_TO_DO
5983 * NV_ERR_INVALID_STATE
5984 */
5985 NV_STATUS
5986 kmigmgrSetMIGState_VF
5987 (
5988 OBJGPU *pGpu,
5989 KernelMIGManager *pKernelMIGManager,
5990 NvBool bMemoryPartitioningNeeded,
5991 NvBool bEnable,
5992 NvBool bUnload
5993 )
5994 {
5995 if (bEnable)
5996 {
5997 KernelGraphics *pKGr = GPU_GET_KERNEL_GRAPHICS(pGpu, 0);
5998
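//
// On VF, the only kernel-RM cached state invalidated on MIG enable is the
// legacy GR0 static info.
//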
5999 kgraphicsInvalidateStaticInfo(pGpu, pKGr);
6000 }
6001
6002 return NV_OK;
6003 }
6004
6005 /*!
6006 * @brief Function to enable/disable MIG mode
6007 *
6008 * @param[IN] pGpu
6009 * @param[IN] pKernelMIGManager
6010 * @param[IN] bMemoryPartitioningNeeded Is Memory partitioning required?
6011 * @param[IN] bEnable Enable/Disable MIG
6012 * @param[IN] bUnload RM unload path
6013 *
6014 * @return Returns NV_STATUS
6015 * NV_OK
6016 * NV_WARN_NOTHING_TO_DO
6017 * NV_ERR_INVALID_STATE
6018 */
6019 NV_STATUS
6020 kmigmgrSetMIGState_FWCLIENT
6021 (
6022 OBJGPU *pGpu,
6023 KernelMIGManager *pKernelMIGManager,
6024 NvBool bMemoryPartitioningNeeded,
6025 NvBool bEnable,
6026 NvBool bUnload
6027 )
6028 {
6029 KernelGraphicsManager *pKernelGraphicsManager = GPU_GET_KERNEL_GRAPHICS_MANAGER(pGpu);
6030 NV_STATUS rmStatus = NV_OK;
6031 KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
6032 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
6033 NvBool bPrevMIGState = pKernelMIGManager->bMIGEnabled;
6034
6035 if (bEnable)
6036 {
6037 KernelGraphics *pKGr = GPU_GET_KERNEL_GRAPHICS(pGpu, 0);
6038
6039 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6040 kgrmgrDiscoverMaxGlobalCtxBufSizes(pGpu, pKernelGraphicsManager, pKGr, bMemoryPartitioningNeeded),
6041 done);
6042
6043 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6044 kmigmgrDisableWatchdog(pGpu, pKernelMIGManager),
6045 cleanup_disableWatchdog);
6046
6047 // Before enabling MIG, deconfigure GR0 in legacy mode
6048 kgraphicsInvalidateStaticInfo(pGpu, pKGr);
6049
6050 //
6051 // Destroy all global ctx buffers, we will need to recreate them in
6052 // partitionable memory later.
6053 //
6054 fecsBufferTeardown(pGpu, pKGr);
6055
6056 kgraphicsFreeGlobalCtxBuffers(pGpu, pKGr, GPU_GFID_PF);
6057
6058 //
6059 // Save the pre-MIG top-level scrubber status for later
6060 // Destroy the top level scrubber if it exists
6061 //
6062 NV_ASSERT_OK_OR_GOTO(rmStatus,
6063 memmgrDestroyInternalChannels(pGpu, pMemoryManager),
6064 cleanup_destroyInternalChannels);
6065
6066 //
6067 // Preexisting channel and memory allocation checks should be done after
6068 // all buffers(like global Gr buffers) and pre-created channels(like scrubber, watchdog etc.)
6069 // are destroyed.
6070 //
6071 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6072 kmigmgrCreateGPUInstanceCheck_HAL(pGpu, pKernelMIGManager, bMemoryPartitioningNeeded),
6073 cleanup_createPartitionCheck);
6074
6075 // On Nvswitch based systems, suspend gpu fabric probe on nvlink inband
6076 gpuFabricProbeSuspend(pGpu->pGpuFabricProbeInfoKernel);
6077
6078 // Ensure NVLINK is shutdown before enabling MIG
6079 if (!kmigmgrIsMIGNvlinkP2PSupportOverridden(pGpu, pKernelMIGManager) ||
6080 bMemoryPartitioningNeeded)
6081 {
6082 #if (defined(DEBUG) || defined(DEVELOP))
6083 KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu);
6084
6085 if (pKernelNvlink != NULL)
6086 {
6087 NvU32 linkId;
6088
6089 //TODO: Remove below code once a more robust SRT is available to test for this condition
6090 FOR_EACH_INDEX_IN_MASK(32, linkId, knvlinkGetEnabledLinkMask(pGpu, pKernelNvlink))
6091 {
6092 NV2080_CTRL_NVLINK_CORE_CALLBACK_PARAMS params;
6093
6094 params.linkId = linkId;
6095 params.callbackType.type = NV2080_CTRL_NVLINK_CALLBACK_TYPE_GET_DL_LINK_MODE;
6096 NV_CHECK_OK(rmStatus, LEVEL_ERROR,
6097 knvlinkExecGspRmRpc(pGpu, pKernelNvlink,
6098 NV2080_CTRL_CMD_NVLINK_CORE_CALLBACK,
6099 (void *)¶ms, sizeof(params)));
6100
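// The link is expected to be either asleep or off before MIG entry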
6101 if ((params.callbackType.callbackParams.getDlLinkMode.mode != NV2080_NVLINK_CORE_LINK_STATE_SLEEP) &&
6102 (params.callbackType.callbackParams.getDlLinkMode.mode != NV2080_NVLINK_CORE_LINK_STATE_OFF))
6103 {
6104 NV_PRINTF(LEVEL_ERROR, "Nvlink %d is not asleep upon entering MIG mode!\n", linkId);
6105 }
6106 }
6107 FOR_EACH_INDEX_IN_MASK_END
6108 }
6109 rmStatus = NV_OK;
6110 #endif
6111 NV_ASSERT_OK_OR_GOTO(rmStatus,
6112 gpuDeleteClassFromClassDBByClassId(pGpu, NV50_P2P),
6113 cleanup_disableNvlink);
6114 }
6115
6116 // Enable ctx buf pool before allocating any resources that uses it.
6117 if (bMemoryPartitioningNeeded)
6118 {
6119 pGpu->setProperty(pGpu, PDB_PROP_GPU_MOVE_CTX_BUFFERS_TO_PMA, NV_TRUE);
6120 }
6121
6122 // Add the MIG-specific classes
6123 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
6124 gpuAddClassToClassDBByClassId(pGpu, AMPERE_SMC_PARTITION_REF));
6125
6126 if (rmStatus != NV_OK)
6127 goto cleanup_addClassToClassDB;
6128
6129 // Allocate handles for memory partitioning if needed
6130 if (bMemoryPartitioningNeeded)
6131 {
6132 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6133 memmgrAllocMIGMemoryAllocationInternalHandles(pGpu, pMemoryManager),
6134 cleanup_memsysConfigL2EvictLast);
6135 }
6136
6137 // initialize pKernelFifo->pppRunlistBufMemDesc based on max possible # of runlists.
6138 {
6139 MEMORY_DESCRIPTOR ***pppMemDesc = NULL;
6140 NvU32 maxRunlists = kfifoGetMaxNumRunlists_HAL(pGpu, pKernelFifo);
6141 NvU32 rowSize = sizeof(pppMemDesc) * maxRunlists;
6142 NvU32 arrSize = rowSize * NV2080_CTRL_INTERNAL_FIFO_MAX_RUNLIST_BUFFERS;
6143 NvU32 i;
6144
6145 // Should not have already been initialized
6146 NV_ASSERT(pKernelFifo->pppRunlistBufMemDesc == NULL);
6147
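//
// Layout: pppMemDesc is an array of per-runlist row pointers; the rows
// live in a single contiguous allocation holding
// maxRunlists * NV2080_CTRL_INTERNAL_FIFO_MAX_RUNLIST_BUFFERS memdesc
// pointers, carved up by the pointer-fixup loop below.
//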
6148 pppMemDesc = portMemAllocNonPaged(rowSize);
6149 NV_ASSERT_OR_ELSE(pppMemDesc != NULL, rmStatus = NV_ERR_NO_MEMORY; goto cleanup_initialize_runlistBufMemDesc;);
6150 portMemSet(pppMemDesc, 0, rowSize);
6151
6152 *pppMemDesc = portMemAllocNonPaged(arrSize);
6153 NV_ASSERT_OR_ELSE(*pppMemDesc != NULL, rmStatus = NV_ERR_NO_MEMORY; goto cleanup_initialize_runlistBufMemDesc;);
6154 portMemSet(*pppMemDesc, 0, arrSize);
6155
6156 // Set up pointers for the 2D array
6157 for (i = 0; i < maxRunlists; i++)
6158 {
6159 pppMemDesc[i] = *pppMemDesc + (NV2080_CTRL_INTERNAL_FIFO_MAX_RUNLIST_BUFFERS * i);
6160 }
6161
6162 pKernelFifo->pppRunlistBufMemDesc = pppMemDesc;
6163 }
6164
6165 //
6166 // Populate static GPU instance memory config which will be used to manage
6167 // GPU instance memory
6168 //
6169 KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
6170 NV_ASSERT_OK_OR_RETURN(kmemsysPopulateMIGGPUInstanceMemConfig_HAL(pGpu, pKernelMemorySystem));
6171 }
6172 else
6173 {
6174 if (bMemoryPartitioningNeeded)
6175 {
6176 memmgrFreeMIGMemoryAllocationInternalHandles(pGpu, pMemoryManager);
6177 }
6178
6179 cleanup_initialize_runlistBufMemDesc:
6180
6181 if (pKernelFifo->pppRunlistBufMemDesc != NULL)
6182 {
6183 portMemFree(*(pKernelFifo->pppRunlistBufMemDesc));
6184 portMemFree(pKernelFifo->pppRunlistBufMemDesc);
6185 }
6186
6187 pKernelFifo->pppRunlistBufMemDesc = NULL;
6188
6189 cleanup_memsysConfigL2EvictLast:
6190
6191 cleanup_addClassToClassDB:
6192 // Delete the MIG GR classes as MIG is disabled
6193 NV_ASSERT_OK(
6194 gpuDeleteClassFromClassDBByClassId(pGpu, AMPERE_SMC_PARTITION_REF));
6195
6196 //
6197 // Disable ctx buf pool after freeing any resources that uses it.
6198 // Leave enabled on platforms that support it outside MIG.
6199 //
6200 pGpu->setProperty(pGpu, PDB_PROP_GPU_MOVE_CTX_BUFFERS_TO_PMA,
6201 gpuIsCtxBufAllocInPmaSupported_HAL(pGpu));
6202
6203 //
6204 // HACK: GSP-RM always enables/disables LCEs during MIG enable/disable.
6205 // Client-RM must always follow it to update its settings accordingly,
6206 // so it should only call it for MIG disable (and not as part of MIG
6207 // enable).
6208 //
6209 if (!bEnable)
6210 {
6211 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
6212 kmigmgrEnableAllLCEs(pGpu, pKernelMIGManager, NV_FALSE));
6213 }
6214
6215 cleanup_disableNvlink:
6216 // Add P2P class back to class DB as memory partitioning is disabled
6217 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
6218 gpuAddClassToClassDBByClassId(pGpu, NV50_P2P));
6219
6220 //
6221 // On Nvswitch based systems, resume the gpu fabric probe
6222 // request on nvlink inband to register the GPU with the nvswitch fabric
6223 //
6224 if (pGpu->pGpuFabricProbeInfoKernel != NULL)
6225 {
6226 NV_ASSERT_OK(gpuFabricProbeResume(pGpu->pGpuFabricProbeInfoKernel));
6227 }
6228
6229 cleanup_createPartitionCheck:
6230 if (!bUnload)
6231 {
6232 // Init top level scrubber if it existed before
6233 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
6234 memmgrInitInternalChannels(pGpu, pMemoryManager));
6235 }
6236 cleanup_destroyInternalChannels:
6237
6238 // Set kmigmgr state to reflect MIG disabled while reconfiguring for NON-MIG
6239 pKernelMIGManager->bMIGEnabled = NV_FALSE;
6240
6241 if (!bUnload)
6242 {
6243 KernelGraphics *pKGr = GPU_GET_KERNEL_GRAPHICS(pGpu, 0);
6244
6245 // Since MIG is now disabled, reconfigure GR0 in legacy mode
6246 NV_ASSERT_OK_OR_CAPTURE_FIRST_ERROR(rmStatus,
6247 kgraphicsLoadStaticInfo(pGpu, pKGr, KMIGMGR_SWIZZID_INVALID));
6248 NV_ASSERT_OK(
6249 kmigmgrRestoreWatchdog(pGpu, pKernelMIGManager));
6250 }
6251
6252 //
6253 // Restore previous kmigmgr MIG state. kmigmgrSetMIGState should not
6254 // permanently modify bMIGEnabled. Restore the value to whatever was
6255 // present on entry to this function.
6256 //
6257 pKernelMIGManager->bMIGEnabled = bPrevMIGState;
6258
6259 cleanup_disableWatchdog:
6260 goto done;
6261 }
6262
6263 done:
6264 //
6265 // Restore previous kmigmgr MIG state. kmigmgrSetMIGState should not
6266 // permanently modify bMIGEnabled. Restore the value to whatever was
6267 // present on entry to this function.
6268 //
6269 pKernelMIGManager->bMIGEnabled = bPrevMIGState;
6270 return rmStatus;
6271 }
6272
6273 /*!
6274 * @brief Function to create or destroy GPU instance
6275 *
6276 * @param[IN] pGpu
6277 * @param[IN] pKernelMIGManager
6278 * @param[IN] swizzId SwizzId allocated for this gpu instance
6279 * @param[IN] pUuid UUID of the GPU instance
6280 * @param[IN] params Gpu instance creation parameters
6281 * @param[IN] bValid Flag stating if gpu instance is created or destroyed
6282 * @param[IN] bCreateCap Flag stating if MIG capabilities need to be created
6283 */
6284 NV_STATUS
6285 kmigmgrCreateGPUInstance_IMPL
6286 (
6287 OBJGPU *pGpu,
6288 KernelMIGManager *pKernelMIGManager,
6289 NvU32 swizzId,
6290 NvU8 *pUuid,
6291 KMIGMGR_CREATE_GPU_INSTANCE_PARAMS params,
6292 NvBool bValid,
6293 NvBool bCreateCap
6294 )
6295 {
6296 NV_STATUS rmStatus = NV_OK;
6297
6298 // If making a gpu instance valid, memory should be allocated accordingly
6299 if (bValid)
6300 {
6301 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance = NULL;
6302 KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
6303 RM_ENGINE_TYPE rmEngineType;
6304
6305 //
6306 // Determine SwizzID for this gpu instance. If this isn't a restore, this
6307 // has already been determined by physical RM.
6308 //
6309 if (params.type == KMIGMGR_CREATE_GPU_INSTANCE_PARAMS_TYPE_RESTORE)
6310 {
6311 NV_ASSERT_OR_RETURN(!kmigmgrIsSwizzIdInUse(pGpu, pKernelMIGManager, swizzId),
6312 NV_ERR_INVALID_STATE);
6313 }
6314
6315 //
6316 // HACK: GSP-RM updated the PCE-LCE mappings while setting MIG state.
6317 // The Client-RM hasn't had an opportunity to refresh its mappings
6318 // yet until the first gpu instance creation, so do it now.
6319 //
6320 if ((pKernelMIGManager->swizzIdInUseMask == 0x0) && IS_GSP_CLIENT(pGpu))
6321 {
6322 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6323 kmigmgrEnableAllLCEs(pGpu, pKernelMIGManager, NV_TRUE), invalidate);
6324 }
6325
6326 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6327 kmigmgrSetGPUInstanceInfo(pGpu, pKernelMIGManager, swizzId, pUuid, params), invalidate);
6328
6329 // Mark swizzId as "in-use" in cached mask
6330 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6331 kmigmgrSetSwizzIdInUse(pGpu, pKernelMIGManager, swizzId), invalidate);
6332
6333 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6334 kmigmgrGetGPUInstanceInfo(pGpu, pKernelMIGManager, swizzId, &pKernelMIGGpuInstance), invalidate);
6335
6336 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6337 kmigmgrAllocGPUInstanceHandles(pGpu, swizzId, pKernelMIGGpuInstance), invalidate);
6338
6339 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6340 kmigmgrInitGPUInstanceBufPools(pGpu, pKernelMIGManager, pKernelMIGGpuInstance), invalidate);
6341
6342 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_SILENT,
6343 kmigmgrCreateGPUInstanceRunlists_HAL(pGpu, pKernelMIGManager, pKernelMIGGpuInstance), invalidate);
6344
6345 NV_ASSERT_OK_OR_GOTO(rmStatus,
6346 kmemsysInitMIGMemoryPartitionTable_HAL(pGpu, pKernelMemorySystem), invalidate);
6347
6348 FOR_EACH_IN_BITVECTOR(&pKernelMIGGpuInstance->resourceAllocation.engines, rmEngineType)
6349 {
6350 NvU32 engineIdx;
6351 KernelGraphics *pKernelGraphics;
6352 RM_ENGINE_TYPE localEngineType;
6353
6354 if (!RM_ENGINE_TYPE_IS_GR(rmEngineType))
6355 continue;
6356
6357 engineIdx = RM_ENGINE_TYPE_GR_IDX(rmEngineType);
6358 pKernelGraphics = GPU_GET_KERNEL_GRAPHICS(pGpu, engineIdx);
6359
6360 NV_ASSERT_OK_OR_GOTO(rmStatus,
6361 kmigmgrGetGlobalToLocalEngineType(pGpu,
6362 pKernelMIGManager,
6363 kmigmgrMakeGIReference(pKernelMIGGpuInstance),
6364 rmEngineType,
6365 &localEngineType),
6366 invalidate);
6367
6368 fecsSetRoutingInfo(pGpu,
6369 pKernelGraphics,
6370 pKernelMIGGpuInstance->instanceHandles.hClient,
6371 pKernelMIGGpuInstance->instanceHandles.hSubdevice,
6372 RM_ENGINE_TYPE_GR_IDX(localEngineType));
6373 }
6374 FOR_EACH_IN_BITVECTOR_END();
6375
6376 // Init gpu instance pool for page table mem
6377 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6378 kmigmgrInitGPUInstancePool(pGpu, pKernelMIGManager, pKernelMIGGpuInstance), invalidate);
6379
6380 // Init gpu instance scrubber
6381 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6382 kmigmgrInitGPUInstanceScrubber(pGpu, pKernelMIGManager, pKernelMIGGpuInstance), invalidate);
6383
6384 //
6385 // Only partitions in which VGPU guests are booted require changing
6386 // engine interrupt vectors to deterministic values for migration.
6387 //
6388 if (IS_GSP_CLIENT(pGpu) && gpuIsSriovEnabled(pGpu))
6389 {
6390 Intr *pIntr = GPU_GET_INTR(pGpu);
6391
6392 //
6393 // Making changes to MIG partition structure reassigns engine interrupts
6394 // of the engines assigned to a partition. This is done so that a guest
6395             // VM sees the same vectors across a migration (suspend/resume, cloning,
6396             // resuming from a snapshot, etc.).
6397 //
6398 // Physical RM will update the interrupt table to reflect the actual
6399 // changes made to engine interrupts.
6400 //
6401 // When running in GSP offload mode, KernelRM must re-fetch the
6402 // interrupt table on every change to the MIG partitioning layout.
6403 //
6404 // The changes to partitions as well as preparing an existing partition
6405 // to boot a VM are done together on Physical RM in
6406 // NV2080_CTRL_CMD_VGPU_MGR_INTERNAL_BOOTLOAD_GSP_VGPU_PLUGIN_TASK.
6407 //
6408 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6409 intrStateUnload_HAL(pGpu, pIntr, GPU_STATE_FLAGS_PRESERVING),
6410 invalidate);
6411
6412 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6413 intrInitInterruptTable_HAL(pGpu, pIntr),
6414 invalidate);
6415
6416 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6417 intrStateLoad_HAL(pGpu, pIntr, GPU_STATE_FLAGS_PRESERVING),
6418 invalidate);
6419 }
6420
6421 //
6422         // Register gpu instance with the capability framework only if it is
6423         // explicitly requested. Otherwise, we rely on the persistent state.
6424 //
6425 if (bCreateCap)
6426 {
6427 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_ERROR,
6428 osRmCapRegisterSmcPartition(pGpu->pOsRmCaps, &pKernelMIGGpuInstance->pOsRmCaps,
6429 pKernelMIGGpuInstance->swizzId), invalidate);
6430 }
6431 }
6432 else
6433 {
6434 NV_PRINTF(LEVEL_INFO, "Invalidating swizzId - %d.\n", swizzId);
6435
6436 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
6437 kmigmgrInvalidateGPUInstance(pGpu, pKernelMIGManager, swizzId, NV_FALSE));
6438 }
6439
6440 return rmStatus;
6441
6442 invalidate:
6443 kmigmgrInvalidateGPUInstance(pGpu, pKernelMIGManager, swizzId, NV_FALSE);
6444
6445 return rmStatus;
6446 }
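//
// Editorial note: the NV_CHECK_OK_OR_GOTO()/NV_ASSERT_OK_OR_GOTO() calls above
// follow a single-exit error-handling idiom; each macro roughly amounts to
//
//     rmStatus = (expression);
//     if (rmStatus != NV_OK)
//         goto invalidate;   // after logging at the given level
//
// so any failure while building the gpu instance funnels into the shared
// "invalidate" label, which tears down the partially-constructed instance via
// kmigmgrInvalidateGPUInstance() before returning the captured status.
//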
6447
6448 /*
6449 * @brief Init per-gpu instance memory pool so that memory for client page tables
6450 * can be allocated from this memory pool
6451 */
6452 NV_STATUS
6453 kmigmgrInitGPUInstancePool_IMPL
6454 (
6455 OBJGPU *pGpu,
6456 KernelMIGManager *pKernelMIGManager,
6457 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
6458 )
6459 {
6460 KernelGmmu *pKernelGmmu = GPU_GET_KERNEL_GMMU(pGpu);
6461 const GMMU_FMT *pFmt = kgmmuFmtGet(pKernelGmmu, GMMU_FMT_VERSION_DEFAULT, 0);
6462 NvU32 version;
6463 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
6464
6465 NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance != NULL, NV_ERR_INVALID_ARGUMENT);
6466
6467 if (!memmgrIsPmaInitialized(pMemoryManager) ||
6468 !memmgrAreClientPageTablesPmaManaged(pMemoryManager))
6469 {
6470 return NV_OK;
6471 }
6472
6473 if (!kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, pKernelMIGGpuInstance->swizzId))
6474 return NV_OK;
6475
6476 NV_ASSERT_OR_RETURN(pFmt != NULL, NV_ERR_INVALID_ARGUMENT);
6477 NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance->pMemoryPartitionHeap != NULL, NV_ERR_INVALID_STATE);
6478
6479 version = ((pFmt->version == GMMU_FMT_VERSION_1) ? POOL_CONFIG_GMMU_FMT_1 : POOL_CONFIG_GMMU_FMT_2);
6480
6481 NV_ASSERT_OK_OR_RETURN(
6482 rmMemPoolSetup((void*)&pKernelMIGGpuInstance->pMemoryPartitionHeap->pmaObject,
6483 &pKernelMIGGpuInstance->pPageTableMemPool, version));
6484
6485 // Allocate the pool in CPR in case of Confidential Compute
6486 if (gpuIsCCFeatureEnabled(pGpu))
6487 {
6488 rmMemPoolAllocateProtectedMemory(pKernelMIGGpuInstance->pPageTableMemPool, NV_TRUE);
6489 }
6490
6491 return NV_OK;
6492 }
6493
6494 /*
6495 * @brief Initializes ctx buf pools for runlist buffer and GR global ctx buffers
6496 * for engines that belong to this gpu instance.
6497 */
6498 NV_STATUS
6499 kmigmgrInitGPUInstanceBufPools_IMPL
6500 (
6501 OBJGPU *pGpu,
6502 KernelMIGManager *pKernelMIGManager,
6503 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
6504 )
6505 {
6506 Heap *pHeap;
6507 MemoryManager *pMemoryManager = GPU_GET_MEMORY_MANAGER(pGpu);
6508 NvU32 pmaConfig = PMA_QUERY_NUMA_ONLINED;
6509 NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance != NULL, NV_ERR_INVALID_ARGUMENT);
6510 pHeap = pKernelMIGGpuInstance->pMemoryPartitionHeap;
6511 NV_ASSERT_OR_RETURN(pHeap != NULL, NV_ERR_INVALID_STATE);
6512
6513 if (!ctxBufPoolIsSupported(pGpu))
6514 return NV_OK;
6515
6516 //
6517 // We have to drop GPU lock before making allocations from PMA
6518 // as RM allocations can trigger UVM evictions.
6519 // However, in this case we can skip dropping GPU lock as gpu instance PMA
6520 // isn't visible to UVM yet.
6521 // This is just a sanity check to make sure this assumption is correct and
6522 // allocation from PMA cannot trigger UVM evictions.
6523 //
6524     // When FB memory is onlined as a NUMA node, the kernel can allocate FB memory
6525     // directly, and hence free memory cannot be expected to equal total memory.
6526 //
6527 if (memmgrIsPmaInitialized(pMemoryManager) &&
6528 (pmaQueryConfigs(&pHeap->pmaObject, &pmaConfig) == NV_OK) &&
6529 !(pmaConfig & PMA_QUERY_NUMA_ONLINED))
6530 {
6531 NvU64 freeSpace, totalSpace;
6532 pmaGetFreeMemory(&pHeap->pmaObject, &freeSpace);
6533 pmaGetTotalMemory(&pHeap->pmaObject, &totalSpace);
6534 if (freeSpace != totalSpace)
6535 {
6536 NV_PRINTF(LEVEL_ERROR, "Assumption that PMA is empty at this time is broken\n");
6537 NV_PRINTF(LEVEL_ERROR, "free space = 0x%llx bytes total space = 0x%llx bytes\n",
6538 freeSpace, totalSpace);
6539 NV_PRINTF(LEVEL_ERROR, "This means PMA allocations may trigger UVM evictions at this point causing deadlocks!\n");
6540 return NV_ERR_INVALID_STATE;
6541 }
6542 }
6543
6544 NV_ASSERT_OK_OR_RETURN(kmigmgrInitGPUInstanceRunlistBufPools(pGpu, pKernelMIGManager, pKernelMIGGpuInstance));
6545 NV_ASSERT_OK_OR_RETURN(kmigmgrInitGPUInstanceGrBufPools(pGpu, pKernelMIGManager, pKernelMIGGpuInstance));
6546 return NV_OK;
6547 }
6548
6549 /*
6550  * Initializes the runlist buffer pools for engines that belong to this gpu instance.
6551  * Also reserves memory for runlist buffers in these pools;
6552  * runlists will later be allocated from these pools.
6553 */
6554 NV_STATUS
6555 kmigmgrInitGPUInstanceRunlistBufPools_IMPL
6556 (
6557 OBJGPU *pGpu,
6558 KernelMIGManager *pKernelMIGManager,
6559 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
6560 )
6561 {
6562 RM_ENGINE_TYPE rmEngineType;
6563 KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu);
6564 CTX_BUF_INFO runlistBufInfo[NUM_BUFFERS_PER_RUNLIST] = {0};
6565 NvU64 rlSize;
6566 NvU64 rlAlign;
6567 NvU32 swizzId;
6568 NvU32 i;
6569 NvU32 runlistId;
6570 Heap *pHeap;
6571
6572 NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance != NULL, NV_ERR_INVALID_ARGUMENT);
6573 swizzId = pKernelMIGGpuInstance->swizzId;
6574 pHeap = pKernelMIGGpuInstance->pMemoryPartitionHeap;
6575 NV_ASSERT_OR_RETURN(pHeap != NULL, NV_ERR_INVALID_STATE);
6576
6577 if (!kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, swizzId))
6578 return NV_OK;
6579
6580 for (rmEngineType = 0; rmEngineType < RM_ENGINE_TYPE_LAST; rmEngineType++)
6581 {
6582 if (!RM_ENGINE_TYPE_IS_VALID(rmEngineType) ||
6583 !kmigmgrIsEnginePartitionable(pGpu, pKernelMIGManager, rmEngineType) ||
6584 !kmigmgrIsEngineInInstance(pGpu, pKernelMIGManager, rmEngineType, kmigmgrMakeGIReference(pKernelMIGGpuInstance)))
6585 {
6586 continue;
6587 }
6588
6589 // Get runlist ID for Engine type.
6590 NV_ASSERT_OK_OR_RETURN(kfifoEngineInfoXlate_HAL(pGpu, pKernelFifo,
6591 ENGINE_INFO_TYPE_RM_ENGINE_TYPE, (NvU32)rmEngineType,
6592 ENGINE_INFO_TYPE_RUNLIST, &runlistId));
6593
6594 //
6595         // ctx buf pools only support HW runlists today.
6596         // We assume TSGs are supported for all runlists, which is true for Ampere.
6597 //
6598 for (i = 0; i < NUM_BUFFERS_PER_RUNLIST; i++)
6599 {
6600 NV_ASSERT_OK_OR_RETURN(kfifoGetRunlistBufInfo(pGpu, pKernelFifo, runlistId, NV_TRUE,
6601 0, &rlSize, &rlAlign));
6602 runlistBufInfo[i].size = rlSize;
6603 runlistBufInfo[i].align = rlAlign;
6604 runlistBufInfo[i].attr = RM_ATTR_PAGE_SIZE_DEFAULT;
6605 runlistBufInfo[i].bContig = NV_TRUE;
6606 }
6607
6608 NV_ASSERT_OK_OR_RETURN(ctxBufPoolInit(pGpu, pHeap, &pKernelFifo->pRunlistBufPool[rmEngineType]));
6609 NV_ASSERT_OR_RETURN(pKernelFifo->pRunlistBufPool[rmEngineType] != NULL, NV_ERR_INVALID_STATE);
6610
6611 //
6612         // Skip scrubber for runlist buffer allocations since the gpu instance scrubber is not set up yet
6613 // and it will be destroyed before deleting the runlist buffer pool.
6614 //
6615 ctxBufPoolSetScrubSkip(pKernelFifo->pRunlistBufPool[rmEngineType], NV_TRUE);
6616 NV_ASSERT_OK_OR_RETURN(ctxBufPoolReserve(pGpu, pKernelFifo->pRunlistBufPool[rmEngineType], &runlistBufInfo[0], NUM_BUFFERS_PER_RUNLIST));
6617 }
6618
6619 return NV_OK;
6620 }
6621
6622 /*
6623  * @brief Initializes gr buffer pools for all GR engines that belong to this gpu instance.
6624  *        Also reserves memory for global GR buffers in these pools.
6625 */
6626 NV_STATUS
6627 kmigmgrInitGPUInstanceGrBufPools_IMPL
6628 (
6629 OBJGPU *pGpu,
6630 KernelMIGManager *pKernelMIGManager,
6631 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
6632 )
6633 {
6634 KernelGraphicsManager *pKernelGraphicsManager = GPU_GET_KERNEL_GRAPHICS_MANAGER(pGpu);
6635 GR_GLOBALCTX_BUFFER bufId;
6636 NvU32 bufCount;
6637 CTX_BUF_INFO globalCtxBufInfo[GR_GLOBALCTX_BUFFER_COUNT];
6638 Heap *pHeap = NULL;
6639 NV_STATUS rmStatus = NV_OK;
6640 RM_ENGINE_TYPE rmEngineType;
6641
6642 NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance != NULL, NV_ERR_INVALID_ARGUMENT);
6643 pHeap = pKernelMIGGpuInstance->pMemoryPartitionHeap;
6644 NV_ASSERT_OR_RETURN(pHeap != NULL, NV_ERR_INVALID_STATE);
6645
6646 bufCount = 0;
6647 FOR_EACH_IN_ENUM(GR_GLOBALCTX_BUFFER, bufId)
6648 {
6649 if (kgrmgrIsGlobalCtxBufSupported(bufId, NV_FALSE))
6650 {
6651 const CTX_BUF_INFO *pBufInfo = kgrmgrGetGlobalCtxBufInfo(pGpu, pKernelGraphicsManager, bufId);
6652 NV_ASSERT_OR_RETURN(pBufInfo != NULL, NV_ERR_INVALID_STATE);
6653
6654 globalCtxBufInfo[bufCount] = *pBufInfo;
6655
6656 if ((bufId == GR_GLOBALCTX_BUFFER_FECS_EVENT) || (bufId == GR_GLOBAL_BUFFER_GLOBAL_PRIV_ACCESS_MAP))
6657 {
6658 globalCtxBufInfo[bufCount].bContig = NV_TRUE;
6659 }
6660 else if ((bufId == GR_GLOBALCTX_BUFFER_PRIV_ACCESS_MAP) || (bufId == GR_GLOBALCTX_BUFFER_UNRESTRICTED_PRIV_ACCESS_MAP))
6661 {
6662 globalCtxBufInfo[bufCount].bContig = gpuIsClientRmAllocatedCtxBufferEnabled(pGpu);
6663 }
6664 kgrmgrSetGlobalCtxBufInfo(pGpu, pKernelGraphicsManager, bufId,
6665 globalCtxBufInfo[bufCount].size,
6666 globalCtxBufInfo[bufCount].align,
6667 globalCtxBufInfo[bufCount].attr,
6668 globalCtxBufInfo[bufCount].bContig);
6669 bufCount++;
6670 }
6671 }
6672 FOR_EACH_IN_ENUM_END;
6673
6674 FOR_EACH_IN_BITVECTOR(&pKernelMIGGpuInstance->resourceAllocation.engines, rmEngineType)
6675 {
6676 NvU32 engineIdx;
6677 KernelGraphics *pKernelGraphics;
6678 CTX_BUF_POOL_INFO *pGrCtxBufPool;
6679
6680 if (!RM_ENGINE_TYPE_IS_GR(rmEngineType))
6681 continue;
6682
6683 engineIdx = RM_ENGINE_TYPE_GR_IDX(rmEngineType);
6684 pKernelGraphics = GPU_GET_KERNEL_GRAPHICS(pGpu, engineIdx);
6685
6686 NV_ASSERT_OK_OR_GOTO(rmStatus,
6687 kgraphicsInitCtxBufPool(pGpu, pKernelGraphics, pHeap),
6688 failed);
6689
6690 pGrCtxBufPool = kgraphicsGetCtxBufPool(pGpu, pKernelGraphics);
6691
6692 if (pGrCtxBufPool == NULL)
6693 {
6694 rmStatus = NV_ERR_INVALID_STATE;
6695 goto failed;
6696 }
6697
6698 //
6699         // Skip scrubber for GR buffer allocations since the gpu instance scrubber is not set up yet
6700 // and it will be destroyed before deleting the GR buffer pool.
6701 //
6702 ctxBufPoolSetScrubSkip(pGrCtxBufPool, NV_TRUE);
6703 NV_ASSERT_OK_OR_GOTO(
6704 rmStatus,
6705 ctxBufPoolReserve(pGpu, pGrCtxBufPool, &globalCtxBufInfo[0], bufCount),
6706 failed);
6707 }
6708 FOR_EACH_IN_BITVECTOR_END();
6709
6710 return NV_OK;
6711
6712 failed:
6713 kmigmgrDestroyGPUInstanceGrBufPools(pGpu, pKernelMIGManager, pKernelMIGGpuInstance);
6714 return rmStatus;
6715 }
6716
6717 /*!
6718 * @brief Save MIG instance topology to persistence, if available.
6719 */
6720 NV_STATUS
6721 kmigmgrSaveToPersistence_IMPL
6722 (
6723 OBJGPU *pGpu,
6724 KernelMIGManager *pKernelMIGManager
6725 )
6726 {
6727 GPUMGR_SAVE_MIG_INSTANCE_TOPOLOGY *pTopologySave = NULL;
6728 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGPUInstance;
6729 NvU32 gpcIdx;
6730 NvU32 savedGIIdx;
6731
6732 NV_CHECK_OR_RETURN(LEVEL_SILENT,
6733 gpumgrGetSystemMIGInstanceTopo(gpuGetDBDF(pGpu), &pTopologySave),
6734 NV_OK);
6735
6736 // Clear existing topology, if any.
6737 portMemSet(pTopologySave->saveGI, 0, sizeof(pTopologySave->saveGI));
6738
6739 // Update the MIG enablement bit
6740 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_RESETLESS_MIG_SUPPORTED))
6741 {
6742 gpumgrSetSystemMIGEnabled(gpuGetDBDF(pGpu), pKernelMIGManager->bMIGEnabled);
6743 }
6744
6745 // If there are no instances then don't bother checking anything.
6746 NV_CHECK_OR_RETURN(LEVEL_SILENT, IS_MIG_IN_USE(pGpu), NV_OK);
6747
6748 savedGIIdx = 0;
6749 FOR_EACH_VALID_GPU_INSTANCE(pGpu, pKernelMIGManager, pKernelMIGGPUInstance)
6750 {
6751 GPUMGR_SAVE_GPU_INSTANCE *pGPUInstanceSave = &pTopologySave->saveGI[savedGIIdx];
6752
6753 pGPUInstanceSave->bValid = NV_TRUE;
6754 pGPUInstanceSave->swizzId = pKernelMIGGPUInstance->swizzId;
6755 pGPUInstanceSave->pOsRmCaps = pKernelMIGGPUInstance->pOsRmCaps;
6756 pGPUInstanceSave->giInfo.partitionFlags = pKernelMIGGPUInstance->partitionFlag;
6757 bitVectorToRaw(&pKernelMIGGPUInstance->resourceAllocation.engines,
6758 pGPUInstanceSave->giInfo.enginesMask, sizeof(pGPUInstanceSave->giInfo.enginesMask));
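        //
        // Collapse the per-instance physical GPC ID list into a bitmask;
        // e.g. GPC IDs {0, 1, 4} produce gpcMask 0x13.
        //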
6759 for (gpcIdx = 0; gpcIdx < pKernelMIGGPUInstance->resourceAllocation.gpcCount; ++gpcIdx)
6760 {
6761 pGPUInstanceSave->giInfo.gpcMask |= NVBIT32(pKernelMIGGPUInstance->resourceAllocation.gpcIds[gpcIdx]);
6762 }
6763 pGPUInstanceSave->giInfo.veidOffset = pKernelMIGGPUInstance->resourceAllocation.veidOffset;
6764 pGPUInstanceSave->giInfo.veidCount = pKernelMIGGPUInstance->resourceAllocation.veidCount;
6765 pGPUInstanceSave->giInfo.virtualGpcCount = pKernelMIGGPUInstance->resourceAllocation.virtualGpcCount;
6766
6767 NV_ASSERT_OK_OR_RETURN(kmigmgrSaveComputeInstances(pGpu, pKernelMIGManager, pKernelMIGGPUInstance,
6768 pGPUInstanceSave->saveCI));
6769
6770 ++savedGIIdx;
6771 }
6772 FOR_EACH_VALID_GPU_INSTANCE_END();
6773
6774 return NV_OK;
6775 }
6776
6777 /*!
6778  * @brief Update MIG CI config for CPU-RM if a compute instance is created
6779  *        by a guest and the RPC is handled directly by GSP-RM
6780 */
6781 NV_STATUS
6782 kmigmgrUpdateCiConfigForVgpu_IMPL
6783 (
6784 OBJGPU *pGpu,
6785 KernelMIGManager *pKernelMIGManager,
6786 NvU32 execPartCount,
6787 NvU32 *pExecPartId,
6788 NvU32 gfid,
6789 NvBool bDelete
6790 )
6791 {
6792 NvU32 i;
6793 NV_STATUS status = NV_OK;
6794 RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
6795 KERNEL_HOST_VGPU_DEVICE *pKernelHostVgpuDevice;
6796 RsClient *pRsClient;
6797 GPUInstanceSubscription *pGPUInstanceSubscription;
6798 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance;
6799 Subdevice *pSubdevice;
6800
6801 if (!(IS_GSP_CLIENT(pGpu) && IS_MIG_IN_USE(pGpu) &&
6802 IS_VGPU_GSP_PLUGIN_OFFLOAD_ENABLED(pGpu)))
6803 {
6804 return NV_ERR_NOT_SUPPORTED;
6805 }
6806
6807 NV_ASSERT_OR_RETURN(execPartCount <= NVC637_CTRL_MAX_EXEC_PARTITIONS,
6808 NV_ERR_INVALID_ARGUMENT);
6809
6810 // Get hostVgpuDevice from provided GFID and validate the subscription
6811 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kvgpumgrGetHostVgpuDeviceFromGfid(pGpu->gpuId, gfid,
6812 &pKernelHostVgpuDevice));
6813
6814 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, serverGetClientUnderLock(&g_resServ, pKernelHostVgpuDevice->hMigClient,
6815 &pRsClient));
6816 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, subdeviceGetByInstance(pRsClient, pKernelHostVgpuDevice->hMigDevice, 0,
6817 &pSubdevice));
6818
6819 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
6820 gisubscriptionGetGPUInstanceSubscription(pRsClient, RES_GET_HANDLE(pSubdevice),
6821 &pGPUInstanceSubscription));
6822
6823 pKernelMIGGpuInstance = gisubscriptionGetMIGGPUInstance(pGPUInstanceSubscription);
6824
6825 if (!bDelete)
6826 {
6827 NVC637_CTRL_EXEC_PARTITIONS_IMPORT_EXPORT_PARAMS execPartExportParams;
6828
6829 // Create the execution partition state on CPU
6830 for (i = 0; i < execPartCount; i++)
6831 {
6832 GPUMGR_SAVE_COMPUTE_INSTANCE save;
6833 KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS restore =
6834 {
6835 .type = KMIGMGR_CREATE_COMPUTE_INSTANCE_PARAMS_TYPE_RESTORE,
6836 .inst.restore.pComputeInstanceSave = &save,
6837 };
6838
6839 portMemSet(&execPartExportParams, 0, sizeof(execPartExportParams));
6840 execPartExportParams.id = pExecPartId[i];
6841
6842 // Retrieve the CI state created by GSP-RM, then restore it to CPU-RM
6843 NV_ASSERT_OK_OR_GOTO(status,
6844 pRmApi->Control(pRmApi,
6845 pKernelMIGGpuInstance->instanceHandles.hClient,
6846 pKernelMIGGpuInstance->instanceHandles.hSubscription,
6847 NVC637_CTRL_CMD_EXEC_PARTITIONS_EXPORT,
6848 &execPartExportParams,
6849 sizeof(execPartExportParams)),
6850 failed);
6851
6852 portMemSet(&save, 0, sizeof(save));
6853 save.bValid = NV_TRUE;
6854 save.id = pExecPartId[i];
6855 save.ciInfo = execPartExportParams.info;
6856
6857 NV_ASSERT_OK_OR_GOTO(status,
6858 kmigmgrCreateComputeInstances_HAL(pGpu, pKernelMIGManager, pKernelMIGGpuInstance,
6859 NV_FALSE, restore, &pExecPartId[i], NV_TRUE),
6860 failed);
6861 }
6862 }
6863 else
6864 {
6865 for (i = 0; i < execPartCount; i++)
6866 {
6867 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
6868 kmigmgrDeleteComputeInstance(pGpu, pKernelMIGManager, pKernelMIGGpuInstance,
6869 pExecPartId[i], NV_FALSE));
6870 }
6871 }
6872
6873 // Generate a CPU event to notify CPU clients of updated config
6874 gpuNotifySubDeviceEvent(pGpu, NV2080_NOTIFIERS_SMC_CONFIG_UPDATE, NULL,
6875 0, 0, 0);
6876 return NV_OK;
6877
6878 failed:
6879 // Send an RPC to GSP-RM to cleanup the state as we failed the call
6880 if (!bDelete)
6881 {
6882 NVC637_CTRL_EXEC_PARTITIONS_DELETE_PARAMS params;
6883         portMemSet(&params, 0, sizeof(params));
6884 params.execPartCount = 1;
6885 params.execPartId[0] = pExecPartId[i];
6886 NV_RM_RPC_CONTROL(pGpu, pKernelMIGGpuInstance->instanceHandles.hClient,
6887 pKernelMIGGpuInstance->instanceHandles.hSubscription,
6888                           NVC637_CTRL_CMD_EXEC_PARTITIONS_DELETE, &params,
6889 sizeof(params), status);
6890 }
6891
6892 return status;
6893 }
6894
6895 // Control call for getting active gpu instance Ids
6896 NV_STATUS
6897 subdeviceCtrlCmdGpuGetActivePartitionIds_IMPL
6898 (
6899 Subdevice *pSubdevice,
6900 NV2080_CTRL_GPU_GET_ACTIVE_PARTITION_IDS_PARAMS *pParams
6901 )
6902 {
6903 OBJGPU *pGpu = GPU_RES_GET_GPU(pSubdevice);
6904 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
6905 NvU64 validSwizzIdMask;
6906
6907 pParams->partitionCount = 0;
6908
6909 ct_assert(NV2080_CTRL_GPU_MAX_PARTITIONS == KMIGMGR_MAX_GPU_INSTANCES);
6910
6911 LOCK_ASSERT_AND_RETURN(rmapiLockIsOwner() && rmDeviceGpuLockIsOwner(pGpu->gpuInstance));
6912
6913 if ((pKernelMIGManager == NULL) || !pGpu->getProperty(pGpu, PDB_PROP_GPU_MIG_SUPPORTED))
6914 {
6915 NV_PRINTF(LEVEL_INFO, "MIG not supported on this GPU.\n");
6916 return NV_ERR_NOT_SUPPORTED;
6917 }
6918
6919 if (!IS_MIG_ENABLED(pGpu))
6920 {
6921 NV_PRINTF(LEVEL_INFO, "MIG Mode has not been turned on.\n");
6922 return NV_ERR_NOT_SUPPORTED;
6923 }
6924
6925 //
6926     // The device_monitoring swizzId is always available in the system, even
6927     // without the GPU being split into MIG instances
6928 //
6929 pParams->swizzId[pParams->partitionCount++] = NVC637_DEVICE_LEVEL_SWIZZID;
6930
6931 // Populate all active swizzIDs
6932 validSwizzIdMask = pKernelMIGManager->swizzIdInUseMask;
6933 while(validSwizzIdMask != 0x0)
6934 {
6935 pParams->swizzId[pParams->partitionCount] = portUtilCountTrailingZeros64(validSwizzIdMask);
6936 validSwizzIdMask &= ~NVBIT64(pParams->swizzId[pParams->partitionCount]);
6937 pParams->partitionCount++;
6938 }
6939
6940 return NV_OK;
6941 }
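//
// Illustrative sketch (editorial addition, compiled out): the swizzId
// enumeration above is a standard bitmask walk. Each pass extracts the index
// of the lowest set bit and clears it until the mask is empty; for example, a
// mask of 0x13 yields swizzIds 0, 1, 4 in that order.
//
#if 0
static void
_exampleWalkSwizzIdMask(NvU64 validSwizzIdMask)
{
    while (validSwizzIdMask != 0x0)
    {
        // Index of the lowest set bit is the next active swizzId
        NvU32 swizzId = portUtilCountTrailingZeros64(validSwizzIdMask);

        // ... consume swizzId here ...

        // Clear the bit just consumed
        validSwizzIdMask &= ~NVBIT64(swizzId);
    }
}
#endif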
6942
6943 //
6944 // Control call to determine the number of gpu instances of the given size which
6945 // can still be created, given the current configuration of the GPU.
6946 //
6947 NV_STATUS
6948 subdeviceCtrlCmdGpuGetPartitionCapacity_IMPL
6949 (
6950 Subdevice *pSubdevice,
6951 NV2080_CTRL_GPU_GET_PARTITION_CAPACITY_PARAMS *pParams
6952 )
6953 {
6955 OBJGPU *pGpu = GPU_RES_GET_GPU(pSubdevice);
6956 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
6957 NvHandle hClient = RES_GET_CLIENT_HANDLE(pSubdevice);
6958
6959 LOCK_ASSERT_AND_RETURN(rmapiLockIsOwner() && rmGpuLockIsOwner());
6960
6961 NV_CHECK_OR_RETURN(LEVEL_INFO, kmigmgrIsMIGSupported(pGpu, pKernelMIGManager), NV_ERR_NOT_SUPPORTED);
6962
6963 if (IS_VIRTUAL(pGpu))
6964 {
6965 // This is not supported in legacy MIG vGPU policy
6966 if (kmigmgrUseLegacyVgpuPolicy(pGpu, pKernelMIGManager))
6967 return NV_ERR_NOT_SUPPORTED;
6968
6969 if (!pParams->bStaticInfo)
6970 {
6971 CALL_CONTEXT *pCallContext = resservGetTlsCallContext();
6972
6973 NV_ASSERT_OR_RETURN(pCallContext != NULL, NV_ERR_INVALID_STATE);
6974
6975 // Only expose current capacity to admins or capable clients.
6976 if (!rmclientIsCapableOrAdminByHandle(hClient,
6977 NV_RM_CAP_SYS_SMC_CONFIG,
6978 pCallContext->secInfo.privLevel))
6979 {
6980 return NV_ERR_INSUFFICIENT_PERMISSIONS;
6981 }
6982
6983 if (!kmigmgrIsGPUInstanceCombinationValid_HAL(pGpu, pKernelMIGManager, pParams->partitionFlag) ||
6984 !FLD_TEST_DRF(2080_CTRL_GPU, _PARTITION_FLAG, _COMPUTE_SIZE, _FULL, pParams->partitionFlag))
6985 {
6986 pParams->partitionCount = 0;
6987 pParams->availableSpansCount = 0;
6988 }
6989 else
6990 {
6991 if (IS_MIG_IN_USE(pGpu))
6992 {
6993 pParams->partitionCount = 0;
6994 pParams->availableSpansCount = 0;
6995 }
6996 else
6997 {
6998 pParams->partitionCount = 1;
6999 pParams->availableSpansCount = 1;
7000 pParams->availableSpans[0].lo = NV_RANGE_EMPTY.lo;
7001 pParams->availableSpans[0].hi = NV_RANGE_EMPTY.hi;
7002 }
7003 }
7004 }
7005
7006 if (!kmigmgrIsGPUInstanceCombinationValid_HAL(pGpu, pKernelMIGManager, pParams->partitionFlag) ||
7007 !FLD_TEST_DRF(2080_CTRL_GPU, _PARTITION_FLAG, _COMPUTE_SIZE, _FULL, pParams->partitionFlag))
7008 {
7009 pParams->totalPartitionCount = 0;
7010 pParams->totalSpansCount = 0;
7011 }
7012 else
7013 {
7014 pParams->totalPartitionCount = 1;
7015 pParams->totalSpansCount = 1;
7016 pParams->totalSpans[0].lo = NV_RANGE_EMPTY.lo;
7017 pParams->totalSpans[0].hi = NV_RANGE_EMPTY.hi;
7018 }
7019
7020 return NV_OK;
7021 }
7022
7023     return NV_ERR_NOT_SUPPORTED;
7026 }
7027
7028 //
7029 // Control call to provide information about gpu instances which can be created on
7030 // this GPU.
7031 //
7032 NV_STATUS
7033 subdeviceCtrlCmdGpuDescribePartitions_IMPL
7034 (
7035 Subdevice *pSubdevice,
7036 NV2080_CTRL_GPU_DESCRIBE_PARTITIONS_PARAMS *pParams
7037 )
7038 {
7039 OBJGPU *pGpu = GPU_RES_GET_GPU(pSubdevice);
7040 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
7041
7042 LOCK_ASSERT_AND_RETURN(rmapiLockIsOwner() && rmGpuLockIsOwner());
7043
7044 if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_MIG_SUPPORTED))
7045 {
7046 NV_PRINTF(LEVEL_INFO, "MIG not supported on this GPU.\n");
7047 return NV_ERR_NOT_SUPPORTED;
7048 }
7049
7050 return kmigmgrDescribeGPUInstances(pGpu, pKernelMIGManager, pParams);
7051 }
7052
7053 //
7054 // Control call to set the global partitioning mode for this GPU. This call may
7055 // require a PF-FLR to be performed on the GPU before work may be submitted on
7056 // the GPU.
7057 //
7058 NV_STATUS
7059 subdeviceCtrlCmdGpuSetPartitioningMode_IMPL
7060 (
7061 Subdevice *pSubdevice,
7062 NV2080_CTRL_GPU_SET_PARTITIONING_MODE_PARAMS *pParams
7063 )
7064 {
7065 OBJGPU *pGpu = GPU_RES_GET_GPU(pSubdevice);
7066 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
7067 RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
7068
7069 LOCK_ASSERT_AND_RETURN(rmapiLockIsOwner() && rmGpuLockIsOwner());
7070
7071 if (IS_VIRTUAL(pGpu))
7072 {
7073 return NV_ERR_NOT_SUPPORTED;
7074 }
7075
7076 if ((pKernelMIGManager == NULL) || !kmigmgrIsMIGSupported(pGpu, pKernelMIGManager))
7077 {
7078 NV_PRINTF(LEVEL_INFO, "MIG not supported on this GPU.\n");
7079 return NV_ERR_NOT_SUPPORTED;
7080 }
7081
7082 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
7083 pRmApi->Control(pRmApi,
7084 pGpu->hInternalClient,
7085 pGpu->hInternalSubdevice,
7086 NV2080_CTRL_CMD_INTERNAL_MIGMGR_SET_PARTITIONING_MODE,
7087 pParams,
7088 sizeof(*pParams)));
7089
7090 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
7091 kmigmgrSetPartitioningMode(pGpu, pKernelMIGManager));
7092
7093 return NV_OK;
7094 }
7095
7096 /*!
7097 * @brief Process a single request to create / destroy a gpu instance.
7098 * Handles enabling / disabling MIG mode on entry/exit.
7099 */
7100 static NV_STATUS
7101 _kmigmgrProcessGPUInstanceEntry
7102 (
7103 OBJGPU *pGpu,
7104 KernelMIGManager *pKernelMIGManager,
7105 NV2080_CTRL_GPU_SET_PARTITION_INFO *pEntry
7106 )
7107 {
7108 NV_STATUS status = NV_OK;
7109 NV2080_CTRL_GPU_SET_PARTITIONS_PARAMS *pParams = portMemAllocNonPaged(sizeof(*pParams));
7110 RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
7111
7112 NV_CHECK_OR_RETURN(LEVEL_ERROR, pParams != NULL, NV_ERR_NO_MEMORY);
7113
7114 pParams->partitionCount = 1;
7115 pParams->partitionInfo[0] = *pEntry;
7116
7117 //
7118 // Mirrored GPU Instance Management:
7119 // 1: CPU enable MIG
7120 // 2: GSP enable MIG
7121 // 3: GSP create gpu instance
7122 // 4: CPU create gpu instance
7123 // 5: CPU delete gpu instance
7124 // 6: GSP delete gpu instance
7125 // 7: GSP disable MIG
7126 // 8: CPU disable MIG
7127 //
7128
7129 // Step 1, 2: If this is the first gpu instance, enable MIG
7130 if (pEntry->bValid && (pKernelMIGManager->swizzIdInUseMask == 0x0))
7131 {
7132 NvBool bMemoryPartitioningRequested = kmigmgrIsMemoryPartitioningRequested_HAL(pGpu, pKernelMIGManager, pEntry->partitionFlag);
7133
7134 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
7135 kmigmgrSetMIGState(pGpu, pKernelMIGManager, bMemoryPartitioningRequested, NV_TRUE, NV_FALSE),
7136 cleanup_params);
7137 }
7138
7139 if (pEntry->bValid)
7140 {
7141 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
7142 pRmApi->Control(pRmApi,
7143 pGpu->hInternalClient,
7144 pGpu->hInternalSubdevice,
7145 NV2080_CTRL_CMD_INTERNAL_MIGMGR_SET_GPU_INSTANCES,
7146 pParams,
7147 sizeof(*pParams)),
7148 cleanup_smc_state);
7149 pEntry->swizzId = pParams->partitionInfo[0].swizzId;
7150 portMemCopy(&pEntry->uuid, sizeof(pEntry->uuid),
7151 &pParams->partitionInfo[0].uuid, sizeof(pParams->partitionInfo[0].uuid));
7152 }
7153
7154 if (IS_GSP_CLIENT(pGpu))
7155 {
7156 KMIGMGR_CREATE_GPU_INSTANCE_PARAMS request =
7157 {
7158 .type = KMIGMGR_CREATE_GPU_INSTANCE_PARAMS_TYPE_REQUEST,
7159 .inst.request.partitionFlag = pEntry->partitionFlag,
7160 .inst.request.bUsePlacement =
7161 FLD_TEST_REF(NV2080_CTRL_GPU_PARTITION_FLAG_PLACE_AT_SPAN, _ENABLE,
7162 pEntry->partitionFlag),
7163 .inst.request.placement = rangeMake(pEntry->placement.lo, pEntry->placement.hi)
7164 };
7165 request.inst.request.partitionFlag = FLD_SET_DRF(2080_CTRL_GPU, _PARTITION_FLAG, _PLACE_AT_SPAN, _DISABLE,
7166 request.inst.request.partitionFlag);
7167
7168 // Step 3, 4, 5, 6: Create / delete gpu instance
7169 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
7170 kmigmgrCreateGPUInstance(pGpu, pKernelMIGManager, pEntry->swizzId, pEntry->uuid,
7171 request, pEntry->bValid, NV_TRUE /* create MIG capabilities */),
7172 cleanup_rpc);
7173 }
7174
7175 if (!pEntry->bValid)
7176 {
7177 NV_ASSERT_OK_OR_GOTO(status,
7178 pRmApi->Control(pRmApi,
7179 pGpu->hInternalClient,
7180 pGpu->hInternalSubdevice,
7181 NV2080_CTRL_CMD_INTERNAL_MIGMGR_SET_GPU_INSTANCES,
7182 pParams,
7183 sizeof(*pParams)),
7184 cleanup_params);
7185
7186 gpumgrCacheDestroyGpuInstance(pGpu, pEntry->swizzId);
7187 }
7188 else
7189 {
7190 gpumgrCacheCreateGpuInstance(pGpu, pEntry->swizzId);
7191 }
7192
7193 // Step 7, 8: If this is the last gpu instance to go, disable MIG
7194 if (pKernelMIGManager->swizzIdInUseMask == 0x0)
7195 {
7196 NvBool bMemoryPartitioningNeeded = kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, pParams->partitionInfo[0].swizzId);
7197
7198 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
7199 kmigmgrSetMIGState(pGpu, pKernelMIGManager, bMemoryPartitioningNeeded, NV_FALSE, NV_FALSE),
7200 cleanup_params);
7201 }
7202
7203 portMemFree(pParams);
7204 return status;
7205
7206 cleanup_rpc:
7207 if (pEntry->bValid)
7208 {
7209 // Reuse the same RPC information we prepared earlier, but flip the bValid bit
7210 pParams->partitionInfo[0].bValid = NV_FALSE;
7211 NV_ASSERT_OK(pRmApi->Control(pRmApi,
7212 pGpu->hInternalClient,
7213 pGpu->hInternalSubdevice,
7214 NV2080_CTRL_CMD_INTERNAL_MIGMGR_SET_GPU_INSTANCES,
7215 pParams,
7216 sizeof(*pParams)));
7217 }
7218
7219 cleanup_smc_state:
7220 if (pEntry->bValid && (pKernelMIGManager->swizzIdInUseMask == 0x0))
7221 {
7222 NvBool bMemoryPartitioningRequested = kmigmgrIsMemoryPartitioningRequested_HAL(pGpu, pKernelMIGManager, pEntry->partitionFlag);
7223
7224 NV_ASSERT_OK(
7225 kmigmgrSetMIGState(pGpu, pKernelMIGManager, bMemoryPartitioningRequested, NV_FALSE, NV_FALSE));
7226 }
7227
7228 cleanup_params:
7229 portMemFree(pParams);
7230 return status;
7231 }
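//
// Editorial note: the cleanup labels above unwind in reverse creation order.
// cleanup_rpc deletes the instance on GSP-RM by replaying the same control
// with bValid flipped (undoing steps 3/4), cleanup_smc_state disables MIG if
// this was the first instance (undoing steps 1/2), and cleanup_params frees
// the RPC parameter buffer.
//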
7232
7233 /*!
7234 * @brief Control call for dividing GPU into requested gpu instances
7235 *
7236 * @returns NV_OK if successful.
7237 * NV_ERR_INVALID_ARGUMENT if parameter is not found
7238 * NV_ERR_NOT_SUPPORTED if parameter is not supported
7239 *
7240 */
7241 NV_STATUS
7242 subdeviceCtrlCmdGpuSetPartitions_IMPL
7243 (
7244 Subdevice *pSubdevice,
7245 NV2080_CTRL_GPU_SET_PARTITIONS_PARAMS *pParams
7246 )
7247 {
7248 NV_STATUS rmStatus = NV_OK;
7249 NvU32 i;
7250 NvU32 j;
7251 OBJGPU *pGpu = GPU_RES_GET_GPU(pSubdevice);
7252 NvHandle hClient = RES_GET_CLIENT_HANDLE(pSubdevice);
7253 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
7254 CALL_CONTEXT *pCallContext = resservGetTlsCallContext();
7255
7256 LOCK_ASSERT_AND_RETURN(rmapiLockIsOwner() && rmGpuLockIsOwner());
7257
7258 NV_ASSERT_OR_RETURN(pCallContext != NULL, NV_ERR_INVALID_STATE);
7259
7260 if (!rmclientIsCapableOrAdminByHandle(hClient,
7261 NV_RM_CAP_SYS_SMC_CONFIG,
7262 pCallContext->secInfo.privLevel))
7263 {
7264 NV_PRINTF(LEVEL_ERROR, "Non-privileged context issued privileged cmd\n");
7265 return NV_ERR_INSUFFICIENT_PERMISSIONS;
7266 }
7267
7268 NV_CHECK_OR_RETURN(LEVEL_INFO, IS_MIG_ENABLED(pGpu), NV_ERR_NOT_SUPPORTED);
7269
7270 // Sanity checks
7271 if (pParams->partitionCount > KMIGMGR_MAX_GPU_INSTANCES)
7272 {
7273 return NV_ERR_INVALID_ARGUMENT;
7274 }
7275 else if (0 == pParams->partitionCount)
7276 {
7277 return NV_WARN_NOTHING_TO_DO;
7278 }
7279
7280 for (i = 0; i < pParams->partitionCount; i++)
7281 {
7282 if (pParams->partitionInfo[i].bValid)
7283 {
7284 NvU32 partitionFlag = FLD_SET_DRF(2080_CTRL_GPU, _PARTITION_FLAG, _PLACE_AT_SPAN, _DISABLE,
7285 pParams->partitionInfo[i].partitionFlag);
7286 NV_CHECK_OR_RETURN(LEVEL_ERROR,
7287 kmigmgrIsGPUInstanceCombinationValid_HAL(pGpu, pKernelMIGManager, partitionFlag),
7288 NV_ERR_NOT_SUPPORTED);
7289 }
7290 }
7291
7292 // This is not supported in vGPU
7293 if (IS_VIRTUAL(pGpu))
7294 {
7295 return NV_ERR_NOT_SUPPORTED;
7296 }
7297
7298 for (i = 0; i < pParams->partitionCount; i++)
7299 {
7300 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_INFO,
7301 _kmigmgrProcessGPUInstanceEntry(pGpu, pKernelMIGManager, &pParams->partitionInfo[i]),
7302 cleanup);
7303 }
7304
7305 //
7306     // Generate a subdevice event stating that something has changed in the GPU
7307     // instance config. Clients currently do not care about the scope of the change.
7308 //
7309 gpuNotifySubDeviceEvent(pGpu, NV2080_NOTIFIERS_SMC_CONFIG_UPDATE, NULL, 0, 0, 0);
7310
7311 return rmStatus;
7312
7313 cleanup:
7314     // Invalidate gpu instances that have been created
7315 for (j = 0; j < i; j++)
7316 {
7317 pParams->partitionInfo[j].bValid = !pParams->partitionInfo[j].bValid;
7318 NV_ASSERT_OK(
7319 _kmigmgrProcessGPUInstanceEntry(pGpu, pKernelMIGManager, &pParams->partitionInfo[j]));
7320 pParams->partitionInfo[j].bValid = !pParams->partitionInfo[j].bValid;
7321 }
7322
7323 return rmStatus;
7324 }
7325
7326 // Control call for getting specific gpu instance info
7327 NV_STATUS
7328 subdeviceCtrlCmdGpuGetPartitions_IMPL
7329 (
7330 Subdevice *pSubdevice,
7331 NV2080_CTRL_GPU_GET_PARTITIONS_PARAMS *pParams
7332 )
7333 {
7334 NV_STATUS rmStatus = NV_OK;
7335 NvU32 i;
7336 OBJGPU *pGpu = GPU_RES_GET_GPU(pSubdevice);
7337 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
7338 MIG_INSTANCE_REF ref;
7339 NvU64 validSwizzIdMask;
7340 NvHandle hClient = RES_GET_CLIENT_HANDLE(pSubdevice);
7341 NV2080_CTRL_GPU_GET_PARTITIONS_PARAMS *pRpcParams = NULL;
7342
7343 ct_assert(NV2080_CTRL_GPU_MAX_PARTITIONS == KMIGMGR_MAX_GPU_INSTANCES);
7344 ct_assert(NV2080_CTRL_GPU_MAX_GPC_PER_SMC == KGRMGR_MAX_GPC);
7345
7346 LOCK_ASSERT_AND_RETURN(rmapiLockIsOwner() && rmDeviceGpuLockIsOwner(pGpu->gpuInstance));
7347
7348 pRpcParams = portMemAllocNonPaged(sizeof(*pRpcParams));
7349 NV_CHECK_OR_RETURN(LEVEL_INFO, pRpcParams != NULL, NV_ERR_NO_MEMORY);
7350
7351 *pRpcParams = *pParams;
7352
7353 if (!IS_VIRTUAL(pGpu))
7354 {
7355 CALL_CONTEXT *pCallContext = resservGetTlsCallContext();
7356 RmCtrlParams *pRmCtrlParams = pCallContext->pControlParams;
7357 RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
7358
7360 NV_CHECK_OK_OR_GOTO(rmStatus, LEVEL_WARNING,
7361 pRmApi->Control(pRmApi,
7362 pRmCtrlParams->hClient,
7363 pRmCtrlParams->hObject,
7364 NV2080_CTRL_CMD_INTERNAL_MIGMGR_GET_GPU_INSTANCES,
7365 pRpcParams,
7366 sizeof(*pRpcParams)), done);
7367 }
7368
7369 if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_MIG_SUPPORTED))
7370 {
7371 NV_PRINTF(LEVEL_INFO, "MIG not supported on this GPU.\n");
7372 rmStatus = NV_ERR_NOT_SUPPORTED;
7373 goto done;
7374 }
7375
7376 if (!IS_MIG_ENABLED(pGpu))
7377 NV_PRINTF(LEVEL_INFO, "Entered MIG API with MIG disabled.\n");
7378
7379 if (!IS_MIG_IN_USE(pGpu))
7380 {
7381 // set the valid gpu instance count to "0" and return
7382 pParams->validPartitionCount = 0;
7383 rmStatus = NV_OK;
7384 goto done;
7385 }
7386
7387     // See if all gpu instances are requested and get info for all gpu instances
7388 if (pParams->bGetAllPartitionInfo)
7389 {
7390 CALL_CONTEXT *pCallContext = resservGetTlsCallContext();
7391
7392 NV_ASSERT_OR_ELSE(pCallContext != NULL,
7393 rmStatus = NV_ERR_INVALID_STATE; goto done);
7394
7395 if (!rmclientIsCapableOrAdminByHandle(hClient,
7396 NV_RM_CAP_SYS_SMC_CONFIG,
7397 pCallContext->secInfo.privLevel))
7398 {
7399 NV_PRINTF(LEVEL_ERROR,
7400 "Non privileged client requesting global gpu instance info\n");
7401 rmStatus = NV_ERR_INSUFFICIENT_PERMISSIONS;
7402 goto done;
7403 }
7404
7405         // Take all swizzIds into consideration
7406 validSwizzIdMask = pKernelMIGManager->swizzIdInUseMask;
7407 }
7408 else
7409 {
7410 rmStatus = kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager,
7411 GPU_RES_GET_DEVICE(pSubdevice), &ref);
7412 if (rmStatus != NV_OK)
7413 {
7414 // set the valid gpu instance count to "0" and return
7415 pParams->validPartitionCount = 0;
7416 rmStatus = NV_OK;
7417 goto done;
7418 }
7419
7420 validSwizzIdMask = NVBIT64(ref.pKernelMIGGpuInstance->swizzId);
7421 }
7422
7423 pParams->validPartitionCount = 0;
7424 for (i = 0; i < KMIGMGR_MAX_GPU_INSTANCES; i++)
7425 {
7426 MIG_RESOURCE_ALLOCATION *pResourceAllocation;
7427 NvU32 swizzId = portUtilCountTrailingZeros64(validSwizzIdMask);
7428 NvU32 j;
7429 RM_ENGINE_TYPE rmEngineType;
7430
7431 rmStatus = kmigmgrGetGPUInstanceInfo(pGpu, pKernelMIGManager, swizzId, &ref.pKernelMIGGpuInstance);
7432 if (rmStatus != NV_OK)
7433 {
7434 NV_PRINTF(LEVEL_ERROR,
7435 "Unable to get gpu instance info for swizzId - %d\n",
7436 swizzId);
7437 goto done;
7438 }
7439
7440 pResourceAllocation = &ref.pKernelMIGGpuInstance->resourceAllocation;
7441
7442 pParams->queryPartitionInfo[i].partitionFlag = ref.pKernelMIGGpuInstance->partitionFlag;
7443 pParams->queryPartitionInfo[i].swizzId = ref.pKernelMIGGpuInstance->swizzId;
7444 pParams->queryPartitionInfo[i].grEngCount =
7445 kmigmgrCountEnginesOfType(&pResourceAllocation->engines, RM_ENGINE_TYPE_GR(0));
7446 pParams->queryPartitionInfo[i].smCount = ref.pKernelMIGGpuInstance->pProfile->smCount;
7447 pParams->queryPartitionInfo[i].veidCount = pResourceAllocation->veidCount;
7448 pParams->queryPartitionInfo[i].ceCount =
7449 kmigmgrCountEnginesOfType(&pResourceAllocation->engines, RM_ENGINE_TYPE_COPY(0));
7450 pParams->queryPartitionInfo[i].gpcCount = pResourceAllocation->gpcCount;
7451 pParams->queryPartitionInfo[i].gfxGpcCount = pResourceAllocation->gfxGpcCount;
7452 pParams->queryPartitionInfo[i].virtualGpcCount = pResourceAllocation->virtualGpcCount;
7453 pParams->queryPartitionInfo[i].nvDecCount =
7454 kmigmgrCountEnginesOfType(&pResourceAllocation->engines, RM_ENGINE_TYPE_NVDEC(0));
7455 pParams->queryPartitionInfo[i].nvEncCount =
7456 kmigmgrCountEnginesOfType(&pResourceAllocation->engines, RM_ENGINE_TYPE_NVENC(0));
7457 pParams->queryPartitionInfo[i].nvJpgCount =
7458 kmigmgrCountEnginesOfType(&pResourceAllocation->engines, RM_ENGINE_TYPE_NVJPG);
7459 pParams->queryPartitionInfo[i].nvOfaCount =
7460 kmigmgrCountEnginesOfType(&pResourceAllocation->engines, RM_ENGINE_TYPE_OFA(0));
7461 pParams->queryPartitionInfo[i].memSize = rangeLength(ref.pKernelMIGGpuInstance->memRange);
7462 pParams->queryPartitionInfo[i].validCTSIdMask = ref.pKernelMIGGpuInstance->pProfile->validCTSIdMask;
7463 pParams->queryPartitionInfo[i].validGfxCTSIdMask = ref.pKernelMIGGpuInstance->pProfile->validGfxCTSIdMask;
7464 pParams->queryPartitionInfo[i].bValid = NV_TRUE;
7465
7466 if (IS_VIRTUAL(pGpu))
7467 {
7468 NV_RANGE span = NV_RANGE_EMPTY;
7469 VGPU_STATIC_INFO *pVSI = GPU_GET_STATIC_INFO(pGpu);
7470 NV_ASSERT_OR_ELSE(pVSI != NULL,
7471 rmStatus = NV_ERR_OBJECT_NOT_FOUND; goto done);
7472
7473 // VGPU doesn't support this
7474 pParams->queryPartitionInfo[i].bPartitionError = NV_FALSE;
7475 pParams->queryPartitionInfo[i].span.lo = span.lo;
7476 pParams->queryPartitionInfo[i].span.hi = span.hi;
7477
7478 // Fill GPCs associated with every GR
7479 j = 0;
7480 FOR_EACH_IN_BITVECTOR(&pResourceAllocation->engines, rmEngineType)
7481 {
7482 if (!RM_ENGINE_TYPE_IS_GR(rmEngineType))
7483 continue;
7484
7485 pParams->queryPartitionInfo[i].gpcsPerGr[j] = pVSI->gpuPartitionInfo.gpcsPerGr[j];
7486 pParams->queryPartitionInfo[i].veidsPerGr[j] = pVSI->gpuPartitionInfo.veidsPerGr[j];
7487 pParams->queryPartitionInfo[i].virtualGpcsPerGr[j] = pVSI->gpuPartitionInfo.virtualGpcsPerGr[j];
7488 pParams->queryPartitionInfo[i].gfxGpcPerGr[j] = pVSI->gpuPartitionInfo.gfxGpcPerGr[j];
7489
7490 j++;
7491 }
7492 FOR_EACH_IN_BITVECTOR_END();
7493 }
7494 else
7495 {
7496 NV_ASSERT_OR_ELSE(pRpcParams->queryPartitionInfo[i].bValid,
7497 rmStatus = NV_ERR_INVALID_STATE; goto done);
7498 NV_ASSERT_OR_ELSE(
7499 pParams->queryPartitionInfo[i].swizzId == pRpcParams->queryPartitionInfo[i].swizzId,
7500 rmStatus = NV_ERR_INVALID_STATE; goto done);
7501
7502 // Fill GPCs associated with every GR
7503 j = 0;
7504 FOR_EACH_IN_BITVECTOR(&pResourceAllocation->engines, rmEngineType)
7505 {
7506 if (!RM_ENGINE_TYPE_IS_GR(rmEngineType))
7507 continue;
7508
7509 pParams->queryPartitionInfo[i].gpcsPerGr[j] = pRpcParams->queryPartitionInfo[i].gpcsPerGr[j];
7510 pParams->queryPartitionInfo[i].gfxGpcPerGr[j] = pRpcParams->queryPartitionInfo[i].gfxGpcPerGr[j];
7511 pParams->queryPartitionInfo[i].veidsPerGr[j] = pRpcParams->queryPartitionInfo[i].veidsPerGr[j];
7512 pParams->queryPartitionInfo[i].virtualGpcsPerGr[j] = pRpcParams->queryPartitionInfo[i].virtualGpcsPerGr[j];
7513
7514 j++;
7515 }
7516 FOR_EACH_IN_BITVECTOR_END();
7517
7518 // Take the value provided by physical
7519 pParams->queryPartitionInfo[i].bPartitionError = pRpcParams->queryPartitionInfo[i].bPartitionError;
7520 pParams->queryPartitionInfo[i].span = pRpcParams->queryPartitionInfo[i].span;
7521 }
7522
7523 ++pParams->validPartitionCount;
7524
7525 validSwizzIdMask &= ~NVBIT64(swizzId);
7526 if (validSwizzIdMask == 0)
7527 {
7528 break;
7529 }
7530 }
7531
7532 done:
7533 portMemFree(pRpcParams);
7534
7535 return rmStatus;
7536 }
7537
7538 NV_STATUS
7539 subdeviceCtrlCmdInternalKMIGmgrExportGPUInstance_IMPL
7540 (
7541 Subdevice *pSubdevice,
7542 NV2080_CTRL_INTERNAL_KMIGMGR_IMPORT_EXPORT_GPU_INSTANCE_PARAMS *pParams
7543 )
7544 {
7545 OBJGPU *pGpu = GPU_RES_GET_GPU(pSubdevice);
7546 CALL_CONTEXT *pCallContext = resservGetTlsCallContext();
7547 RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
7548
7549 // No gpu instances to export
7550 if (!IS_MIG_IN_USE(pGpu))
7551 return NV_ERR_NOT_SUPPORTED;
7552
7553 // An unprivileged client has no use case for import/export
7554 if (!rmclientIsCapableOrAdminByHandle(RES_GET_CLIENT_HANDLE(pSubdevice),
7555 NV_RM_CAP_SYS_SMC_CONFIG,
7556 pCallContext->secInfo.privLevel))
7557 {
7558 return NV_ERR_INSUFFICIENT_PERMISSIONS;
7559 }
7560
7561 // Guest RM does not support import/export
7562 if (IS_VIRTUAL(pGpu))
7563 {
7564 return NV_ERR_NOT_SUPPORTED;
7565 }
7566
7567 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
7568 pRmApi->Control(pRmApi,
7569 pGpu->hInternalClient,
7570 pGpu->hInternalSubdevice,
7571 NV2080_CTRL_CMD_INTERNAL_MIGMGR_EXPORT_GPU_INSTANCE,
7572 pParams,
7573 sizeof(*pParams)));
7574
7575 return NV_OK;
7576 }
7577
7578 NV_STATUS
7579 subdeviceCtrlCmdInternalKMIGmgrImportGPUInstance_IMPL
7580 (
7581 Subdevice *pSubdevice,
7582 NV2080_CTRL_INTERNAL_KMIGMGR_IMPORT_EXPORT_GPU_INSTANCE_PARAMS *pParams
7583 )
7584 {
7585 OBJGPU *pGpu = GPU_RES_GET_GPU(pSubdevice);
7586 NV_STATUS status = NV_OK;
7587 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
7588 CALL_CONTEXT *pCallContext = resservGetTlsCallContext();
7589 RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
7590
7591 if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_MIG_SUPPORTED))
7592 return NV_ERR_NOT_SUPPORTED;
7593
7594 NV_ASSERT_OR_RETURN(pCallContext != NULL, NV_ERR_INVALID_STATE);
7595
7596 // An unprivileged client has no use case for import/export
7597 if (!rmclientIsCapableOrAdminByHandle(RES_GET_CLIENT_HANDLE(pSubdevice),
7598 NV_RM_CAP_SYS_SMC_CONFIG,
7599 pCallContext->secInfo.privLevel))
7600 {
7601 return NV_ERR_INSUFFICIENT_PERMISSIONS;
7602 }
7603
7604 // Guest RM does not support import/export
7605 if (IS_VIRTUAL(pGpu))
7606 {
7607 return NV_ERR_NOT_SUPPORTED;
7608 }
7609
7610 if (kmigmgrGetSwizzIdInUseMask(pGpu, pKernelMIGManager) == 0x0)
7611 {
7612 NvBool bMemoryPartitioningNeeded = kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, pParams->swizzId);
7613
7614 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
7615 kmigmgrSetMIGState(pGpu, GPU_GET_KERNEL_MIG_MANAGER(pGpu), bMemoryPartitioningNeeded, NV_TRUE, NV_FALSE));
7616 }
7617
7618 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
7619 pRmApi->Control(pRmApi,
7620 pGpu->hInternalClient,
7621 pGpu->hInternalSubdevice,
7622 NV2080_CTRL_CMD_INTERNAL_MIGMGR_IMPORT_GPU_INSTANCE,
7623 pParams,
7624 sizeof(*pParams)),
7625 cleanup_mig_state);
7626
7627 if (IS_GSP_CLIENT(pGpu))
7628 {
7629 GPUMGR_SAVE_GPU_INSTANCE *pSave = portMemAllocNonPaged(sizeof(*pSave));
7630 NV_CHECK_OR_ELSE(LEVEL_ERROR,
7631 pSave != NULL,
7632 status = NV_ERR_NO_MEMORY;
7633 goto cleanup_mig_state;);
7634
7635 KMIGMGR_CREATE_GPU_INSTANCE_PARAMS restore =
7636 {
7637 .type = KMIGMGR_CREATE_GPU_INSTANCE_PARAMS_TYPE_RESTORE,
7638 .inst.restore.pGPUInstanceSave = pSave,
7639 };
7640 pSave->bValid = NV_TRUE;
7641 pSave->swizzId = pParams->swizzId;
7642 pSave->pOsRmCaps = NULL;
7643 portMemCopy(&(pSave->giInfo), sizeof(pSave->giInfo), &pParams->info, sizeof(pParams->info));
7644
7645 status = kmigmgrCreateGPUInstance(pGpu, pKernelMIGManager, pParams->swizzId, pParams->uuid,
7646 restore, NV_TRUE, NV_FALSE);
7647
7648 portMemFree(pSave);
7649 NV_CHECK_OR_GOTO(LEVEL_ERROR, status == NV_OK, cleanup_rpc);
7650 }
7651
7652 return NV_OK;
7653
7654 cleanup_rpc:
7655 {
7656 NV2080_CTRL_GPU_SET_PARTITIONS_PARAMS params;
7657
7658         portMemSet(&params, 0, sizeof(params));
7659 params.partitionCount = 1;
7660 params.partitionInfo[0].bValid = NV_FALSE;
7661 params.partitionInfo[0].swizzId = pParams->swizzId;
7662
7663 NV_ASSERT_OK(
7664 pRmApi->Control(pRmApi,
7665 pGpu->hInternalClient,
7666 pGpu->hInternalSubdevice,
7667 NV2080_CTRL_CMD_INTERNAL_MIGMGR_SET_GPU_INSTANCES,
7668                             &params,
7669                             sizeof(params)));
7670 }
7671
7672 cleanup_mig_state:
7673 if (kmigmgrGetSwizzIdInUseMask(pGpu, pKernelMIGManager) == 0x0)
7674 {
7675 NvBool bMemoryPartitioningNeeded = kmigmgrIsMemoryPartitioningNeeded_HAL(pGpu, pKernelMIGManager, pParams->swizzId);
7676
7677 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
7678 kmigmgrSetMIGState(pGpu, GPU_GET_KERNEL_MIG_MANAGER(pGpu), bMemoryPartitioningNeeded, NV_FALSE, NV_FALSE));
7679 }
7680
7681 return status;
7682 }
7683
7684 NV_STATUS
7685 subdeviceCtrlCmdGpuGetComputeProfiles_IMPL
7686 (
7687 Subdevice *pSubdevice,
7688 NV2080_CTRL_GPU_GET_COMPUTE_PROFILES_PARAMS *pParams
7689 )
7690 {
7691 OBJGPU *pGpu = GPU_RES_GET_GPU(pSubdevice);
7692 KernelMIGManager *pKernelMIGManager = GPU_GET_KERNEL_MIG_MANAGER(pGpu);
7693 const KERNEL_MIG_MANAGER_STATIC_INFO *pStaticInfo = kmigmgrGetStaticInfo(pGpu, pKernelMIGManager);
7694 Device *pDevice = GPU_RES_GET_DEVICE(pSubdevice);
7695 NvU32 maxSmCount = NV_U32_MAX;
7696 MIG_INSTANCE_REF ref;
7697 NvU32 entryCount;
7698 NvU32 i;
7699
7700 if (!IS_MIG_ENABLED(pGpu))
7701 return NV_ERR_INVALID_STATE;
7702
7703 //
7704 // Grab MIG partition reference if available. The profile's SM count is used
7705 // to filter out compute profiles which wouldn't fit on the GI anyway. This
7706     // is not fatal, as we still want to allow compute profiles for the entire
7707     // GPU view to be queried without a specific GPU instance.
7708 //
7709 if (kmigmgrGetInstanceRefFromDevice(pGpu, pKernelMIGManager, pDevice, &ref) == NV_OK)
7710 {
7711 maxSmCount = ref.pKernelMIGGpuInstance->pProfile->smCount;
7712 }
7713
7714 NV_CHECK_OR_RETURN(LEVEL_ERROR, pStaticInfo != NULL, NV_ERR_INVALID_STATE);
7715 NV_CHECK_OR_RETURN(LEVEL_ERROR, pStaticInfo->pCIProfiles != NULL, NV_ERR_INVALID_STATE);
7716 NV_ASSERT(pStaticInfo->pCIProfiles->profileCount <= NV_ARRAY_ELEMENTS(pParams->profiles));
7717
7718 entryCount = 0;
7719 for (i = 0; i < pStaticInfo->pCIProfiles->profileCount; i++)
7720 {
7721 if (pStaticInfo->pCIProfiles->profiles[i].smCount > maxSmCount)
7722 continue;
7723
7724 // If there are any duplicate compute profiles (i.e. same GPC and SM counts), skip broadcasting the
7725 // profile out.
7726 if ((entryCount > 0) &&
7727 (pParams->profiles[entryCount - 1].gfxGpcCount == pStaticInfo->pCIProfiles->profiles[i].gfxGpcCount) &&
7728 (pParams->profiles[entryCount - 1].gpcCount == pStaticInfo->pCIProfiles->profiles[i].gpcCount) &&
7729 (pParams->profiles[entryCount - 1].smCount == pStaticInfo->pCIProfiles->profiles[i].smCount))
7730 {
7731 continue;
7732 }
7733
7734 pParams->profiles[entryCount].computeSize = pStaticInfo->pCIProfiles->profiles[i].computeSize;
7735 pParams->profiles[entryCount].gfxGpcCount = pStaticInfo->pCIProfiles->profiles[i].gfxGpcCount;
7736 pParams->profiles[entryCount].gpcCount = pStaticInfo->pCIProfiles->profiles[i].physicalSlots;
7737 pParams->profiles[entryCount].smCount = pStaticInfo->pCIProfiles->profiles[i].smCount;
7738 pParams->profiles[entryCount].veidCount = pStaticInfo->pCIProfiles->profiles[i].veidCount;
7739 entryCount++;
7740 }
7741 pParams->profileCount = entryCount;
7742 return NV_OK;
7743 }
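//
// Editorial note: the duplicate filter above compares a candidate only against
// the most recently emitted entry, so it relies on profiles with identical
// GPC/SM counts being adjacent in the static profile table.
//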
7744
7745 /*!
7746 * @brief Function to get the next computeSize flag either larger or smaller than
7747 * the passed in flag.
7748 *
7749 * @param[IN] bGetNextSmallest Flag controlling whether the next largest or smallest
7750 * compute size is returned
7751 * @param[IN] computeSize Base computeSize to lookup
7752 *
7753  * @return The next compute size flag relative to the input:
7754  *         a.) If the input compute size is KMIGMGR_COMPUTE_SIZE_INVALID, the output is:
7755  *             1.) NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_FULL if bGetNextSmallest
7756  *             2.) NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_EIGHTH if !bGetNextSmallest
7757  *         b.) Else the output is the next largest/smallest size based upon bGetNextSmallest
7758 */
7759 NvU32
7760 kmigmgrGetNextComputeSize_IMPL
7761 (
7762 NvBool bGetNextSmallest,
7763 NvU32 computeSize
7764 )
7765 {
7766 const NvU32 computeSizeFlags[] =
7767 {
7768 KMIGMGR_COMPUTE_SIZE_INVALID,
7769 NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_FULL,
7770 NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_HALF,
7771 NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_MINI_HALF,
7772 NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_QUARTER,
7773 NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_MINI_QUARTER,
7774 NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_EIGHTH,
7775 KMIGMGR_COMPUTE_SIZE_INVALID
7776 };
7777
7778 NV_ASSERT_OR_RETURN(computeSize <= KMIGMGR_COMPUTE_SIZE_INVALID, KMIGMGR_COMPUTE_SIZE_INVALID);
7779
7780 if (computeSize == KMIGMGR_COMPUTE_SIZE_INVALID)
7781 {
7782 return (bGetNextSmallest) ? computeSizeFlags[1] : computeSizeFlags[NV_ARRAY_ELEMENTS(computeSizeFlags) - 2];
7783 }
7784 else
7785 {
7786 NvU32 i;
7787
7788 for (i = 1; i < NV_ARRAY_ELEMENTS(computeSizeFlags) - 1; i++)
7789 if (computeSizeFlags[i] == computeSize)
7790 return (bGetNextSmallest) ? computeSizeFlags[i + 1] : computeSizeFlags[i - 1];
7791
7792 // Requested input flag was not found
7793 return KMIGMGR_COMPUTE_SIZE_INVALID;
7794 }
7795 }
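//
// Illustrative sketch (editorial addition, compiled out): a caller can
// enumerate every valid compute size from largest to smallest by seeding the
// walk with KMIGMGR_COMPUTE_SIZE_INVALID and iterating until the sentinel
// comes back.
//
#if 0
static void
_exampleEnumerateComputeSizes(void)
{
    // KMIGMGR_COMPUTE_SIZE_INVALID as input yields _COMPUTE_SIZE_FULL first
    NvU32 computeSize = kmigmgrGetNextComputeSize(NV_TRUE, KMIGMGR_COMPUTE_SIZE_INVALID);

    while (computeSize != KMIGMGR_COMPUTE_SIZE_INVALID)
    {
        // ... look up the skyline or compute profile for computeSize here ...

        computeSize = kmigmgrGetNextComputeSize(NV_TRUE, computeSize);
    }
}
#endif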
7796
7797 /*!
7798 * @brief Function to lookup a skyline for a given compute size
7799 *
7800 * @param[IN] pGpu
7801 * @param[IN] pKernelMIGManager
7802 * @param[IN] computeSize Compute size to find skyline for
7803  * @param[OUT] ppSkyline          Pointer filled with the address of the NV2080_CTRL_INTERNAL_GRMGR_SKYLINE_INFO
7804  *                               entry associated with the given compute size
7805 */
7806 NV_STATUS
7807 kmigmgrGetSkylineFromSize_IMPL
7808 (
7809 OBJGPU *pGpu,
7810 KernelMIGManager *pKernelMIGManager,
7811 NvU32 computeSize,
7812 const NV2080_CTRL_INTERNAL_GRMGR_SKYLINE_INFO **ppSkyline
7813 )
7814 {
7815 const KERNEL_MIG_MANAGER_STATIC_INFO *pStaticInfo = kmigmgrGetStaticInfo(pGpu, pKernelMIGManager);
7816 NvU32 i;
7817
7818 NV_ASSERT_OR_RETURN(ppSkyline != NULL, NV_ERR_INVALID_ARGUMENT);
7819 NV_CHECK_OR_RETURN(LEVEL_ERROR, pStaticInfo != NULL, NV_ERR_OBJECT_NOT_FOUND);
7820 NV_CHECK_OR_RETURN(LEVEL_WARNING, pStaticInfo->pSkylineInfo != NULL, NV_ERR_OBJECT_NOT_FOUND);
7821
7822 for (i = 0; i < pStaticInfo->pSkylineInfo->validEntries; i++)
7823 {
7824 if (pStaticInfo->pSkylineInfo->skylineTable[i].computeSizeFlag == computeSize)
7825 {
7826 *ppSkyline = &pStaticInfo->pSkylineInfo->skylineTable[i];
7827 return NV_OK;
7828 }
7829 }
7830     NV_PRINTF(LEVEL_INFO, "No skyline found for compute size %d\n", computeSize);
7831 return NV_ERR_OBJECT_NOT_FOUND;
7832 }
7833
7834 /*!
7835 * @brief Function to lookup a compute profile for a given compute size
7836 *
7837 * @param[IN] pGpu
7838 * @param[IN] pKernelMIGManager
7839  * @param[IN]   computeSize         Compute size to find the compute profile for
7840  * @param[OUT]  pProfile            Pointer to NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE struct filled with
7841  *                                  a copy of the compute profile info associated with the compute size
7842 */
7843 NV_STATUS
7844 kmigmgrGetComputeProfileFromSize_IMPL
7845 (
7846 OBJGPU *pGpu,
7847 KernelMIGManager *pKernelMIGManager,
7848 NvU32 computeSize,
7849 NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE *pProfile
7850 )
7851 {
7852 const KERNEL_MIG_MANAGER_STATIC_INFO *pStaticInfo = kmigmgrGetStaticInfo(pGpu, pKernelMIGManager);
7853 NvU32 i;
7854
7855 NV_ASSERT_OR_RETURN(pProfile != NULL, NV_ERR_INVALID_ARGUMENT);
7856 NV_CHECK_OR_RETURN(LEVEL_ERROR, pStaticInfo != NULL, NV_ERR_OBJECT_NOT_FOUND);
7857 NV_CHECK_OR_RETURN(LEVEL_WARNING, pStaticInfo->pCIProfiles != NULL, NV_ERR_OBJECT_NOT_FOUND);
7858
7859 for (i = 0; i < pStaticInfo->pCIProfiles->profileCount; i++)
7860 {
7861 if (pStaticInfo->pCIProfiles->profiles[i].computeSize == computeSize)
7862 {
7863 portMemCopy(pProfile, sizeof(*pProfile), &pStaticInfo->pCIProfiles->profiles[i], sizeof(pStaticInfo->pCIProfiles->profiles[i]));
7864 return NV_OK;
7865 }
7866 }
7867 NV_PRINTF(LEVEL_INFO, "Found no Compute Profile for computeSize=%d\n", computeSize);
7868 return NV_ERR_OBJECT_NOT_FOUND;
7869 }
7870
7871 /*!
7872 * @brief Function to lookup a compute profile for a given SM count
7873 *
7874 * @param[IN] pGpu
7875 * @param[IN] pKernelMIGManager
7876 * @param[IN] smCount SM Count to look up the associated compute profile
7877 * @param[OUT] pProfile Pointer to NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE struct filled with
7878 * a copy of the compute profile info associated with the SM count
7879 */
7880 NV_STATUS
7881 kmigmgrGetComputeProfileFromSmCount_IMPL
7882 (
7883 OBJGPU *pGpu,
7884 KernelMIGManager *pKernelMIGManager,
7885 NvU32 smCount,
7886 NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE *pProfile
7887 )
7888 {
7889 const KERNEL_MIG_MANAGER_STATIC_INFO *pStaticInfo = kmigmgrGetStaticInfo(pGpu, pKernelMIGManager);
7890 NvU32 i;
7891
7892 NV_ASSERT_OR_RETURN(pProfile != NULL, NV_ERR_INVALID_ARGUMENT);
7893 NV_CHECK_OR_RETURN(LEVEL_ERROR, pStaticInfo != NULL, NV_ERR_OBJECT_NOT_FOUND);
7894 NV_CHECK_OR_RETURN(LEVEL_WARNING, pStaticInfo->pCIProfiles != NULL, NV_ERR_OBJECT_NOT_FOUND);
7895
7896 for (i = 0; i < pStaticInfo->pCIProfiles->profileCount; i++)
7897 {
7898 if (pStaticInfo->pCIProfiles->profiles[i].smCount == smCount)
7899 {
7900 portMemCopy(pProfile, sizeof(*pProfile), &pStaticInfo->pCIProfiles->profiles[i], sizeof(pStaticInfo->pCIProfiles->profiles[i]));
7901 return NV_OK;
7902 }
7903 }
7904 NV_PRINTF(LEVEL_ERROR, "Found no Compute Profile for smCount=%d\n", smCount);
7905 return NV_ERR_OBJECT_NOT_FOUND;
7906 }
7907
7908
7909 /*!
7910 * @brief Function to lookup a compute profile for a given GPC count. This function converts
7911 * the provided gpcCount into a COMPUTE_SIZE partition flag which is then looked up
7912 * in the static info compute profile list.
7913 *
7914 * @param[IN] pGpu
7915 * @param[IN] pKernelMIGManager
7916 * @param[IN] gpcCount GPC Count to look up the associated compute profile
7917 * @param[OUT] pProfile Pointer to NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE struct filled with
7918 * a copy of the compute profile info associated with the GPC count
7919 */
7920 NV_STATUS
7921 kmigmgrGetComputeProfileFromGpcCount_IMPL
7922 (
7923 OBJGPU *pGpu,
7924 KernelMIGManager *pKernelMIGManager,
7925 NvU32 gpcCount,
7926 NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE *pProfile
7927 )
7928 {
7929 KernelGraphicsManager *pKernelGraphicsManager = GPU_GET_KERNEL_GRAPHICS_MANAGER(pGpu);
7930 const KERNEL_MIG_MANAGER_STATIC_INFO *pStaticInfo = kmigmgrGetStaticInfo(pGpu, pKernelMIGManager);
7931 NvBool bReducedConfig = kmigmgrIsA100ReducedConfig(pGpu, pKernelMIGManager);
7932 NvU32 compSize;
7933 NvU32 maxGpc;
7934 NvU32 maxMIG;
7935 NvU32 i;
7936
7937 NV_ASSERT_OR_RETURN(pProfile != NULL, NV_ERR_INVALID_ARGUMENT);
7938 NV_CHECK_OR_RETURN(LEVEL_ERROR, pStaticInfo != NULL, NV_ERR_OBJECT_NOT_FOUND);
7939 NV_CHECK_OR_RETURN(LEVEL_WARNING, pStaticInfo->pCIProfiles != NULL, NV_ERR_OBJECT_NOT_FOUND);
7940
7941 maxMIG = kgrmgrGetLegacyKGraphicsStaticInfo(pGpu, pKernelGraphicsManager)->pGrInfo->infoList[NV2080_CTRL_GR_INFO_INDEX_MAX_MIG_ENGINES].data;
7942 maxGpc = kgrmgrGetLegacyKGraphicsStaticInfo(pGpu, pKernelGraphicsManager)->pGrInfo->infoList[NV2080_CTRL_GR_INFO_INDEX_MAX_PARTITIONABLE_GPCS].data;
7943 if (bReducedConfig)
7944 maxGpc /= 2;
7945
7946 if ((gpcCount <= (maxGpc / 8)) && ((maxMIG / 8) > 0))
7947 compSize = NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_EIGHTH;
7948 else if ((gpcCount <= (maxGpc / 4)) && ((maxMIG / 4) > 0))
7949 compSize = NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_QUARTER;
7950 else if ((gpcCount <= ((maxGpc / 2) - 1)) && (((maxMIG / 2) - 1) > 0))
7951 compSize = NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_MINI_HALF;
7952 else if ((gpcCount <= (maxGpc / 2)) && ((maxMIG / 2) > 0))
7953 compSize = NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_HALF;
7954 else
7955 compSize = NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_FULL;
7956
7957 for (i = 0; i < pStaticInfo->pCIProfiles->profileCount; i++)
7958 {
7959 if (pStaticInfo->pCIProfiles->profiles[i].computeSize == compSize)
7960 {
7961 if (pStaticInfo->pCIProfiles->profiles[i].gpcCount != gpcCount)
7962 {
7963 NV_PRINTF(LEVEL_INFO, "GPC count %d doesn't match compute size %d\n", gpcCount, compSize);
7964 }
7965 portMemCopy(pProfile, sizeof(*pProfile), &pStaticInfo->pCIProfiles->profiles[i], sizeof(pStaticInfo->pCIProfiles->profiles[i]));
7966 return NV_OK;
7967 }
7968 }
7969
7970 NV_PRINTF(LEVEL_INFO, "Found no Compute Profile for gpcCount=%d\n", gpcCount);
7971 return NV_ERR_OBJECT_NOT_FOUND;
7972 }
7973
7974 /*!
7975 * @brief Function to lookup a compute profile for a given cts ID
7976 *
7977 * @param[IN] pGpu
7978 * @param[IN] pKernelMIGManager
7979 * @param[IN] ctsId CTS ID to find compute profile for
7980 * @param[OUT] pProfile Pointer to NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE struct filled with
7981 * a copy of the compute profile info associated with the CTS ID
7982 */
7983 NV_STATUS
7984 kmigmgrGetComputeProfileFromCTSId_IMPL
7985 (
7986 OBJGPU *pGpu,
7987 KernelMIGManager *pKernelMIGManager,
7988 NvU32 ctsId,
7989 NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE *pProfile
7990 )
7991 {
7992 const KERNEL_MIG_MANAGER_STATIC_INFO *pStaticInfo = kmigmgrGetStaticInfo(pGpu, pKernelMIGManager);
7993 NvU32 computeSize;
7994
7995 NV_ASSERT_OR_RETURN(pProfile != NULL, NV_ERR_INVALID_ARGUMENT);
7996 NV_CHECK_OR_RETURN(LEVEL_ERROR, pStaticInfo != NULL, NV_ERR_OBJECT_NOT_FOUND);
7997 NV_CHECK_OR_RETURN(LEVEL_WARNING, pStaticInfo->pCIProfiles != NULL, NV_ERR_OBJECT_NOT_FOUND);
7998
7999 computeSize = kmigmgrGetComputeSizeFromCTSId(ctsId);
8000 return kmigmgrGetComputeProfileFromSize(pGpu, pKernelMIGManager, computeSize, pProfile);
8001 }
8002
8003 /*!
8004 * @brief Function which returns a mask of CTS IDs which are not usable when the input CTS
8005 * ID is in-use.
8006 *
8007 * @param[IN] pGpu
8008 * @param[IN] pKernelMIGManager
8009 * @param[IN] ctsId Input CTS ID to look-up invalid mask for
8010 * @param[OUT] pInvalidCTSIdMask Output mask of CTS IDs not usable with the input ID
8011 */
8012 NV_STATUS
8013 kmigmgrGetInvalidCTSIdMask_IMPL
8014 (
8015 OBJGPU *pGpu,
8016 KernelMIGManager *pKernelMIGManager,
8017 NvU32 ctsId,
8018 NvU64 *pInvalidCTSIdMask
8019 )
8020 {
8021 //
8022 // +---------------------------------------+
8023 // | 0 |
8024 // +-------------------+-------------------+
8025 // | 1 | 2 |
8026 // +-------------------+-------------------+
8027 // | 3 | 4 |
8028 // +---------+---------+---------+---------+
8029 // | 5 | 6 | 7 | 8 |
8030 // +---------+---------+---------+---------+
8031 // | 9 | 10 | 11 | 12 |
8032 // +----+----+----+----+----+----+----+----+
8033 // | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
8034 // +----+----+----+----+----+----+----+----+
8035 //
8036 NvU64 gpcSlot[KGRMGR_MAX_GR] =
8037 {
8038 (NVBIT64(0) | NVBIT64(1) | NVBIT64(3) | NVBIT64(5) | NVBIT64(9) | NVBIT64(13)),
8039 (NVBIT64(0) | NVBIT64(1) | NVBIT64(3) | NVBIT64(5) | NVBIT64(9) | NVBIT64(14)),
8040 (NVBIT64(0) | NVBIT64(1) | NVBIT64(3) | NVBIT64(6) | NVBIT64(10) | NVBIT64(15)),
8041 (NVBIT64(0) | NVBIT64(1) | NVBIT64(3) | NVBIT64(6) | NVBIT64(10) | NVBIT64(16)),
8042 (NVBIT64(0) | NVBIT64(2) | NVBIT64(4) | NVBIT64(7) | NVBIT64(11) | NVBIT64(17)),
8043 (NVBIT64(0) | NVBIT64(2) | NVBIT64(4) | NVBIT64(7) | NVBIT64(11) | NVBIT64(18)),
8044 (NVBIT64(0) | NVBIT64(2) | NVBIT64(4) | NVBIT64(8) | NVBIT64(12) | NVBIT64(19)),
8045 (NVBIT64(0) | NVBIT64(2) | NVBIT64(4) | NVBIT64(8) | NVBIT64(12) | NVBIT64(20))
8046 };
8047 NvU64 i;
8048
8049 NV_ASSERT_OR_RETURN(NULL != pInvalidCTSIdMask, NV_ERR_INVALID_ARGUMENT);
8050
8051 // All bits corresponding to nonexistent CTS ids are invalid
8052 *pInvalidCTSIdMask = DRF_SHIFTMASK64(63:KMIGMGR_MAX_GPU_CTSID);
8053
8054 for (i = 0; i < KGRMGR_MAX_GR; ++i)
8055 {
8056 if (0 != (gpcSlot[i] & NVBIT64(ctsId)))
8057 {
8058 *pInvalidCTSIdMask |= gpcSlot[i];
8059 }
8060 }
8061
8062 return NV_OK;
8063 }
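
/*
 * Worked example, assuming the slot layout drawn above: if CTS ID 1 (the
 * left HALF slot) is in use, every gpcSlot[] row containing bit 1 -- rows 0
 * through 3 -- is folded into the invalid mask, so the FULL slot (0), the
 * left MINI_HALF (3), the left QUARTERs (5, 6), the left MINI_QUARTERs
 * (9, 10), and the left EIGHTHs (13..16) all become unusable:
 *
 *     NvU64 invalidMask;
 *     NV_ASSERT_OK(kmigmgrGetInvalidCTSIdMask(pGpu, pKernelMIGManager, 1, &invalidMask));
 *     // invalidMask holds bits {0, 1, 3, 5, 6, 9, 10, 13..16} plus all bits
 *     // at or above KMIGMGR_MAX_GPU_CTSID (nonexistent CTS IDs)
 */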
8064
8065 /*!
8066 * @brief Returns the range of possible CTS IDs for a given compute size flag
8067 */
8068 NV_RANGE
8069 kmigmgrComputeProfileSizeToCTSIdRange_IMPL
8070 (
8071 NvU32 computeSize
8072 )
8073 {
8074 switch (computeSize)
8075 {
8076 case NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_FULL:
8077 return rangeMake(0,0);
8078
8079 case NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_HALF:
8080 return rangeMake(1,2);
8081
8082 case NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_MINI_HALF:
8083 return rangeMake(3,4);
8084
8085 case NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_QUARTER:
8086 return rangeMake(5,8);
8087
8088 case NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_MINI_QUARTER:
8089 return rangeMake(9,12);
8090
8091 case NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_EIGHTH:
8092 return rangeMake(13,20);
8093
8094 default:
8095 return NV_RANGE_EMPTY;
8096 }
8097 }
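
/*
 * For example, HALF maps to CTS IDs {1, 2} and EIGHTH to {13..20}; the width
 * of each range is how many instances of that size can coexist. A minimal
 * sketch of the mapping:
 *
 *     NV_RANGE r = kmigmgrComputeProfileSizeToCTSIdRange(
 *                      NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_HALF);
 *     // r.lo == 1, r.hi == 2
 */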
8098
8099 /*!
8100 * @brief Returns the span covered by the CTS ID
8101 */
8102 NV_RANGE
8103 kmigmgrCtsIdToSpan_IMPL
8104 (
8105 OBJGPU *pGpu,
8106 KernelMIGManager *pKernelMIGManager,
8107 NvU32 ctsId
8108 )
8109 {
8110 KernelGraphicsManager *pKernelGraphicsManager = GPU_GET_KERNEL_GRAPHICS_MANAGER(pGpu);
8111 NvU32 spanLen;
8112 NV_RANGE ret;
8113
8114 NV_ASSERT_OR_RETURN(kgrmgrGetLegacyKGraphicsStaticInfo(pGpu, pKernelGraphicsManager)->bInitialized, NV_RANGE_EMPTY);
8115 NV_ASSERT_OR_RETURN(kgrmgrGetLegacyKGraphicsStaticInfo(pGpu, pKernelGraphicsManager)->pGrInfo != NULL, NV_RANGE_EMPTY);
8116
8117 spanLen = kgrmgrGetLegacyKGraphicsStaticInfo(pGpu, pKernelGraphicsManager)->pGrInfo->infoList[NV2080_CTRL_GR_INFO_INDEX_MAX_PARTITIONABLE_GPCS].data;
8118
8119 if (kmigmgrIsA100ReducedConfig(pGpu, pKernelMIGManager))
8120 spanLen /= 2;
8121
8122 switch (ctsId)
8123 {
8124 case 0:
8125 ret = rangeMake(0, spanLen - 1);
8126 break;
8127 case 1:
8128 case 3:
8129 ret = rangeMake(0, (spanLen/2) - 1);
8130 break;
8131 case 2:
8132 case 4:
8133 ret = rangeMake(spanLen/2, spanLen - 1);
8134 break;
8135 case 5:
8136 case 9:
8137 ret = rangeMake(0, (spanLen/4) - 1);
8138 break;
8139 case 6:
8140 case 10:
8141 ret = rangeMake((spanLen/4), (spanLen/2) - 1);
8142 break;
8143 case 7:
8144 case 11:
8145 ret = rangeMake((spanLen/2), (3*(spanLen/4)) - 1);
8146 break;
8147 case 8:
8148 case 12:
8149 ret = rangeMake((3*(spanLen/4)), spanLen - 1);
8150 break;
8151 case 13:
8152 ret = rangeMake(0, 0);
8153 break;
8154 case 14:
8155 ret = rangeMake(1, 1);
8156 break;
8157 case 15:
8158 ret = rangeMake(2, 2);
8159 break;
8160 case 16:
8161 ret = rangeMake(3, 3);
8162 break;
8163 case 17:
8164 ret = rangeMake(4, 4);
8165 break;
8166 case 18:
8167 ret = rangeMake(5, 5);
8168 break;
8169 case 19:
8170 ret = rangeMake(6, 6);
8171 break;
8172 case 20:
8173 ret = rangeMake(7, 7);
8174 break;
8175 default:
8176 NV_PRINTF(LEVEL_ERROR, "Unsupported CTS ID 0x%x\n", ctsId);
8177 DBG_BREAKPOINT();
8178 ret = NV_RANGE_EMPTY;
8179 break;
8180 }
8181
8182 return ret;
8183 }
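
/*
 * Worked example, assuming spanLen == 8 (an 8-GPC, non-reduced config):
 * CTS ID 6 is the second QUARTER slot, so its span is
 * [spanLen/4, spanLen/2 - 1]:
 *
 *     NV_RANGE span = kmigmgrCtsIdToSpan(pGpu, pKernelMIGManager, 6);
 *     // span.lo == 2, span.hi == 3
 */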
8184
8185 /*!
8186 * @brief Function to get next free CTS ID
8187 *
8188 * @param[IN] pGpu
8189 * @param[IN] pKernelMIGManager
8190 * @param[OUT] pCtsId CTS ID to be used if NV_OK returned
8191 * @param[IN] globalValidCtsMask Mask of CTS IDs which could possibly be allocated
8192 * @param[IN] globalValidGfxCtsMask Mask of Gfx capable CTS IDs which could possibly be allocated.
8193 * Unused if bRestrictWithGfx is NV_FALSE
8194 * @param[IN] ctsIdsInUseMask Mask of CTS IDs currently in use
8195 * @param[IN] profileSize Profile size to get a CTS ID for
8196 * @param[IN] bRestrictWithGfx Whether to restrict the CTS ID chosen with Gfx info
8197 * @param[IN] bGfxRequested Whether Gfx info is requested. Unused if bRestrictWithGfx is NV_FALSE
8198 *
8199 * @return Returns NV_STATUS
8200 * NV_OK
8201 * NV_ERR_INVALID_ARGUMENT If an unsupported partition size is
8202 * requested
8203 * NV_ERR_INSUFFICIENT_RESOURCES If a CTS ID cannot be assigned
8204 */
8205 NV_STATUS
8206 kmigmgrGetFreeCTSId_IMPL
8207 (
8208 OBJGPU *pGpu,
8209 KernelMIGManager *pKernelMIGManager,
8210 NvU32 *pCtsId,
8211 NvU64 globalValidCtsMask,
8212 NvU64 globalValidGfxCtsMask,
8213 NvU64 ctsIdsInUseMask,
8214 NvU32 profileSize,
8215 NvBool bRestrictWithGfx,
8216 NvBool bGfxRequested
8217 )
8218 {
8219 NV_RANGE ctsRange = kmigmgrComputeProfileSizeToCTSIdRange(profileSize);
8220 NvU64 validMask;
8221 NvU32 maxRemainingCapacity;
8222 NvU32 idealCTSId;
8223 NvU32 ctsId;
8224 NvU64 shadowValidCTSIdMask;
8225
8226 NV_CHECK_OR_RETURN(LEVEL_WARNING, !rangeIsEmpty(ctsRange), NV_ERR_INSUFFICIENT_RESOURCES);
8227 NV_ASSERT_OR_RETURN(pCtsId != NULL, NV_ERR_INVALID_ARGUMENT);
8228
8229 // construct a mask of all non-floorswept ctsIds
8230 validMask = globalValidCtsMask;
8231
8232 // Remove all ctsIds with slices currently in use
8233 FOR_EACH_INDEX_IN_MASK(64, ctsId, ctsIdsInUseMask)
8234 {
8235 NvU64 invalidMask;
8236
8237 NV_ASSERT_OK(kmigmgrGetInvalidCTSIdMask(pGpu, pKernelMIGManager, ctsId, &invalidMask));
8238
8239 validMask &= ~invalidMask;
8240 }
8241 FOR_EACH_INDEX_IN_MASK_END;
8242
8243 // compute valid ctsIds for this request that can still be assigned
8244 shadowValidCTSIdMask = validMask;
8245 validMask &= DRF_SHIFTMASK64(ctsRange.hi:ctsRange.lo);
8246
8247 // If there are no valid, open ctsIds, then bail here
8248 NV_CHECK_OR_RETURN(LEVEL_SILENT, validMask != 0x0, NV_ERR_INSUFFICIENT_RESOURCES);
8249
8250 // Determine which available CTS ids will reduce the remaining capacity the least
8251 maxRemainingCapacity = 0;
8252 idealCTSId = portUtilCountTrailingZeros64(validMask);
8253 FOR_EACH_INDEX_IN_MASK(64, ctsId, validMask)
8254 {
8255 NvU64 invalidMask;
8256 NV_ASSERT_OK(kmigmgrGetInvalidCTSIdMask(pGpu, pKernelMIGManager, ctsId, &invalidMask));
8257
8258 NvU32 remainingCapacity = nvPopCount64(shadowValidCTSIdMask & ~invalidMask);
8259
8260 if (remainingCapacity > maxRemainingCapacity)
8261 {
8262 maxRemainingCapacity = remainingCapacity;
8263 idealCTSId = ctsId;
8264 }
8265 }
8266 FOR_EACH_INDEX_IN_MASK_END;
8267
8268 *pCtsId = idealCTSId;
8269
8270 return NV_OK;
8271 }
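
/*
 * Usage sketch (the masks shown are hypothetical; a real caller would source
 * them from its GPU instance state): pick the least-fragmenting free CTS ID
 * for a QUARTER profile with no Gfx restriction applied.
 *
 *     NvU32 ctsId;
 *     if (kmigmgrGetFreeCTSId(pGpu, pKernelMIGManager, &ctsId,
 *                             validCtsMask,      // hypothetical valid-ID mask
 *                             0x0,               // Gfx mask unused here
 *                             inUseCtsMask,      // hypothetical in-use mask
 *                             NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_QUARTER,
 *                             NV_FALSE, NV_FALSE) == NV_OK)
 *     {
 *         // ctsId lies in the QUARTER range (5..8) and overlaps no in-use slice
 *     }
 */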
8272
8273 /*! @brief This function determines whether or not CTS alignment and slot requirements are needed.
8274 * For PF, this is determined by whether or not a MINI_QUARTER skyline exists.
8275 */
8276 NvBool
8277 kmigmgrIsCTSAlignmentRequired_PF
8278 (
8279 OBJGPU *pGpu,
8280 KernelMIGManager *pKernelMIGManager
8281 )
8282 {
8283 const NV2080_CTRL_INTERNAL_GRMGR_SKYLINE_INFO *pUnused;
8284
8285 // CTS alignment is always required when a unique MINI_QUARTER is present
8286 return (kmigmgrGetSkylineFromSize(pGpu, pKernelMIGManager,
8287 NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_MINI_QUARTER, &pUnused) == NV_OK);
8288 }
8289
8290 /*! @brief This function determines whether or not CTS alignment and slot requirements are needed.
8291 * For VF, this is determined by whether or not a MINI_QUARTER compute profile exists.
8292 */
8293 NvBool
8294 kmigmgrIsCTSAlignmentRequired_VF
8295 (
8296 OBJGPU *pGpu,
8297 KernelMIGManager *pKernelMIGManager
8298 )
8299 {
8300 NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE unused;
8301
8302 // CTS alignment is always required when a unique MINI_QUARTER is present
8303 return (kmigmgrGetComputeProfileFromSize(pGpu, pKernelMIGManager,
8304 NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_MINI_QUARTER, &unused) == NV_OK);
8305 }
8306
8307 /*!
8308 * @brief Returns the computeSize flag of a given CTS ID
8309 */
8310 NvU32
8311 kmigmgrGetComputeSizeFromCTSId_IMPL
8312 (
8313 NvU32 ctsId
8314 )
8315 {
8316 NvU32 computeSize = kmigmgrGetNextComputeSize(NV_TRUE, KMIGMGR_COMPUTE_SIZE_INVALID);
8317
8318 while (computeSize != KMIGMGR_COMPUTE_SIZE_INVALID)
8319 {
8320 NV_RANGE range = kmigmgrComputeProfileSizeToCTSIdRange(computeSize);
8321 if ((range.lo <= ctsId) && (ctsId <= range.hi))
8322 break;
8323 computeSize = kmigmgrGetNextComputeSize(NV_TRUE, computeSize);
8324 }
8325
8326 return computeSize;
8327 }
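
/*
 * For example, CTS ID 10 falls in the MINI_QUARTER range (9..12):
 *
 *     NvU32 size = kmigmgrGetComputeSizeFromCTSId(10);
 *     // size == NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_MINI_QUARTER;
 *     // an ID outside every range yields KMIGMGR_COMPUTE_SIZE_INVALID
 */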
8328
8329 /*!
8330 * @brief Returns Compute size of the smallest supported compute profile
8331 */
8332 NvU32
8333 kmigmgrSmallestComputeProfileSize_IMPL
8334 (
8335 OBJGPU *pGpu,
8336 KernelMIGManager *pKernelMIGManager
8337 )
8338 {
8339 NvU32 computeSize = kmigmgrGetNextComputeSize(NV_FALSE, KMIGMGR_COMPUTE_SIZE_INVALID);
8340
8341 while (computeSize != KMIGMGR_COMPUTE_SIZE_INVALID)
8342 {
8343 NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE unused;
8344 if (kmigmgrGetComputeProfileFromSize(pGpu, pKernelMIGManager, computeSize, &unused) == NV_OK)
8345 break;
8346 computeSize = kmigmgrGetNextComputeSize(NV_FALSE, computeSize);
8347 }
8348
8349 return computeSize;
8350 }
8351
8352 /*!
8353 * @brief Sets/resets various CTS tracking structures in a GPU instance
8354 * based upon whether bInUse is set
8355 *
8356 * @param[IN] pKernelMIGGpuInstance
8357 * @param[IN] ctsId CTS ID to be set/reset
8358 * @param[IN] grId Global GR engine targeted for CTS ID
8359 * @param[IN] bInUse Whether to set or reset the CTS tracking structures
8360 *
8361 */
8362 void
8363 kmigmgrSetCTSIdInUse_IMPL
8364 (
8365 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
8366 NvU32 ctsId,
8367 NvU32 grId,
8368 NvBool bInUse
8369 )
8370 {
8371 NV_ASSERT_OR_RETURN_VOID(pKernelMIGGpuInstance != NULL);
8372
8373 if (bInUse)
8374 {
8375 pKernelMIGGpuInstance->grCtsIdMap[grId] = ctsId;
8376
8377 // Nothing to set in ctsIdInUseMask if KMIGMGR_CTSID_INVALID passed in
8378 NV_ASSERT_OR_RETURN_VOID(ctsId != KMIGMGR_CTSID_INVALID);
8379
8380 pKernelMIGGpuInstance->ctsIdsInUseMask |= NVBIT64(ctsId);
8381 }
8382 else
8383 {
8384 //
8385 // Take CTS ID directly from gr mapping array to ensure both structures
8386 // remain in-sync.
8387 //
8388 ctsId = pKernelMIGGpuInstance->grCtsIdMap[grId];
8389
8390 // Nothing to do if nothing was set
8391 NV_CHECK_OR_RETURN_VOID(LEVEL_WARNING, ctsId != KMIGMGR_CTSID_INVALID);
8392
8393 pKernelMIGGpuInstance->ctsIdsInUseMask &= ~NVBIT64(ctsId);
8394 pKernelMIGGpuInstance->grCtsIdMap[grId] = KMIGMGR_CTSID_INVALID;
8395 }
8396 }
8397
8398 /*!
8399 * @brief Translates a spanStart and computeSize to the corresponding CTS ID.
8400 * When an invalid compute size is passed in, this function still
8401 * returns NV_OK, but populates *pCtsId with KMIGMGR_CTSID_INVALID.
8402 *
8403 * @param[IN] pGpu
8404 * @param[IN] pKernelMIGManager
8405 * @param[IN] computeSize Compute size of CTS to get span offset of
8406 * @param[IN] spanStart spanStart requested
8407 * @param[OUT] pCtsId Output CTS ID in computeSize's range
8408 *
8409 */
8410 NV_STATUS
8411 kmigmgrXlateSpanStartToCTSId_IMPL
8412 (
8413 OBJGPU *pGpu,
8414 KernelMIGManager *pKernelMIGManager,
8415 NvU32 computeSize,
8416 NvU32 spanStart,
8417 NvU32 *pCtsId
8418 )
8419 {
8420 NV_RANGE computeSizeIdRange;
8421 NvU64 computeSizeIdMask;
8422 NvU64 slotBasisMask;
8423 NvU32 slotsPerCTS;
8424
8425 NV_ASSERT_OR_RETURN(pCtsId != NULL, NV_ERR_INVALID_ARGUMENT);
8426
8427 //
8428 // Initialize output to invalid CTS ID, as KMIGMGR_COMPUTE_SIZE_INVALID may have been passed in
8429 // which is OK. It is the caller's responsibility to check the CTS ID's validity.
8430 //
8431 *pCtsId = KMIGMGR_CTSID_INVALID;
8432
8433 NV_CHECK_OR_RETURN(LEVEL_WARNING, computeSize != KMIGMGR_COMPUTE_SIZE_INVALID, NV_OK);
8434 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kmigmgrGetSlotBasisMask(pGpu, pKernelMIGManager, &slotBasisMask));
8435
8436 // Validate that the spanStart does not exceed the basis slot count (which constitutes the acceptable span range)
8437 NV_CHECK_OR_RETURN(LEVEL_ERROR, spanStart < nvPopCount64(slotBasisMask), NV_ERR_INVALID_ARGUMENT);
8438
8439 computeSizeIdRange = kmigmgrComputeProfileSizeToCTSIdRange(computeSize);
8440
8441 // Grab the first CTS ID for computeSize, as it doesn't really matter which one we choose here.
8442 NV_ASSERT_OK(kmigmgrGetInvalidCTSIdMask(pGpu, pKernelMIGManager, computeSizeIdRange.lo, &computeSizeIdMask));
8443
8444 // slots per CTSID is number of basis IDs marked in the invalid mask for this ID
8445 slotsPerCTS = nvPopCount64(computeSizeIdMask & slotBasisMask);
8446
8447 if ((spanStart % slotsPerCTS) != 0)
8448 {
8449 NV_PRINTF(LEVEL_ERROR, "Compute span start of %d is not aligned\n", spanStart);
8450 return NV_ERR_INVALID_ARGUMENT;
8451 }
8452
8453 *pCtsId = computeSizeIdRange.lo + (spanStart / slotsPerCTS);
8454
8455 // The ID returned should be within the computeSize's range at this point
8456 NV_ASSERT((computeSizeIdRange.lo <= *pCtsId) && (*pCtsId <= computeSizeIdRange.hi));
8457
8458 return NV_OK;
8459 }
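
/*
 * Worked example, assuming EIGHTH is the smallest supported profile (slot
 * basis = CTS IDs 13..20, i.e. 8 slots): for a QUARTER request, the invalid
 * mask of CTS ID 5 covers two basis slots (13 and 14), so slotsPerCTS == 2
 * and spanStart 4 translates to CTS ID 5 + (4 / 2) == 7:
 *
 *     NvU32 ctsId;
 *     NV_ASSERT_OK(kmigmgrXlateSpanStartToCTSId(pGpu, pKernelMIGManager,
 *                      NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_QUARTER,
 *                      4, &ctsId));
 *     // ctsId == 7
 */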
8460
8461 /*!
8462 * @brief Retrieves the mask of CTS IDs which are used to derive other properties
8463 * such as spans, offsets, and capacities.
8464 *
8465 * @param[IN] pGpu
8466 * @param[IN] pKernelMIGManager
8467 * @param[OUT] pMask Mask of all CTS IDs which form the profile slot basis
8468 */
8469 NV_STATUS
8470 kmigmgrGetSlotBasisMask_IMPL
8471 (
8472 OBJGPU *pGpu,
8473 KernelMIGManager *pKernelMIGManager,
8474 NvU64 *pMask
8475 )
8476 {
8477 NV_RANGE slotBasisIdRange;
8478 NvU32 slotBasisComputeSize;
8479
8480 NV_CHECK_OR_RETURN(LEVEL_ERROR, pMask != NULL, NV_ERR_INVALID_ARGUMENT);
8481
8482 slotBasisComputeSize = kmigmgrSmallestComputeProfileSize(pGpu, pKernelMIGManager);
8483 slotBasisIdRange = kmigmgrComputeProfileSizeToCTSIdRange(slotBasisComputeSize);
8484
8485 NV_ASSERT_OR_RETURN(!rangeIsEmpty(slotBasisIdRange), NV_ERR_INVALID_STATE);
8486
8487 *pMask = DRF_SHIFTMASK64(slotBasisIdRange.hi:slotBasisIdRange.lo);
8488
8489 return NV_OK;
8490 }
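
/*
 * A sketch, assuming EIGHTH is the smallest supported profile: the slot
 * basis is then CTS IDs 13..20, so the mask is DRF_SHIFTMASK64(20:13):
 *
 *     NvU64 basis;
 *     NV_ASSERT_OK(kmigmgrGetSlotBasisMask(pGpu, pKernelMIGManager, &basis));
 *     // basis == 0x1FE000 on such a configuration
 */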
8491
8492 /*!
8493 * @brief Translates a CTS ID to the corresponding spanStart of the CTS
8494 *
8495 * @param[IN] pGpu
8496 * @param[IN] pKernelMIGManager
8497 * @param[IN] ctsId
8498 *
8499 */
8500 NvU32
8501 kmigmgrGetSpanStartFromCTSId_IMPL
8502 (
8503 OBJGPU *pGpu,
8504 KernelMIGManager *pKernelMIGManager,
8505 NvU32 ctsId
8506 )
8507 {
8508 NvU32 computeSize = kmigmgrGetComputeSizeFromCTSId(ctsId);
8509 NV_RANGE computeSizeIdRange;
8510 NvU64 computeSizeIdMask;
8511 NvU64 slotBasisMask;
8512 NvU32 slotsPerCTS;
8513
8514 NV_CHECK_OR_RETURN(LEVEL_WARNING, computeSize != KMIGMGR_COMPUTE_SIZE_INVALID, 0);
8515
8516 computeSizeIdRange = kmigmgrComputeProfileSizeToCTSIdRange(computeSize);
8517
8518 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, kmigmgrGetSlotBasisMask(pGpu, pKernelMIGManager, &slotBasisMask));
8519
8520 // Grab the first CTS ID for computeSize, as it doesn't really matter which one we choose here.
8521 NV_ASSERT_OK(kmigmgrGetInvalidCTSIdMask(pGpu, pKernelMIGManager, computeSizeIdRange.lo, &computeSizeIdMask));
8522
8523 // slots per CTSID is number of basis IDs marked in the invalid mask for this ID
8524 slotsPerCTS = nvPopCount64(computeSizeIdMask & slotBasisMask);
8525
8526 return (ctsId - computeSizeIdRange.lo) * slotsPerCTS;
8527 }
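
/*
 * This is the inverse of kmigmgrXlateSpanStartToCTSId(). Continuing the
 * QUARTER example above: CTS ID 7 sits two IDs past the start of the
 * QUARTER range (5..8) with two basis slots per ID, so the span start is
 * (7 - 5) * 2:
 *
 *     NvU32 spanStart = kmigmgrGetSpanStartFromCTSId(pGpu, pKernelMIGManager, 7);
 *     // spanStart == 4
 */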
8528
8529 /*!
8530 * @brief Function checking whether the passed-in ctsId is available given the
8531 * current states of ctsIdValidMask and ctsIdInUseMask
8532 *
8533 * @param[IN] pGpu
8534 * @param[IN] pKernelMIGManager
8535 * @param[IN] ctsIdValidMask Valid CTS ID mask to compare against
8536 * @param[IN] ctsIdInUseMask Mask of CTS IDs which are marked as being used
8537 * @param[IN] ctsId CTS ID to check
8538 */
8539 NvBool
8540 kmigmgrIsCTSIdAvailable_IMPL
8541 (
8542 OBJGPU *pGpu,
8543 KernelMIGManager *pKernelMIGManager,
8544 NvU64 ctsIdValidMask,
8545 NvU64 ctsIdInUseMask,
8546 NvU32 ctsId
8547 )
8548 {
8549 NvU64 invalidMask = 0x0;
8550 NvU32 i;
8551
8552 FOR_EACH_INDEX_IN_MASK(64, i, ctsIdInUseMask)
8553 {
8554 NvU64 mask;
8555
8556 NV_ASSERT_OK(kmigmgrGetInvalidCTSIdMask(pGpu, pKernelMIGManager, i, &mask));
8557
8558 invalidMask |= mask;
8559 }
8560 FOR_EACH_INDEX_IN_MASK_END;
8561 return !!((ctsIdValidMask & ~invalidMask) & NVBIT64(ctsId));
8562 }
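
/*
 * Usage sketch (validMask is hypothetical and assumed to contain bit 1):
 * check whether the left HALF slot is still usable while the first EIGHTH
 * slot is occupied. CTS IDs 13 and 1 both cover GPC slot 0, so the check
 * fails:
 *
 *     NvBool bFree = kmigmgrIsCTSIdAvailable(pGpu, pKernelMIGManager,
 *                                            validMask, NVBIT64(13), 1);
 *     // bFree == NV_FALSE whenever CTS ID 13 is in use
 */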
8563
8564 #define _kmigmgrReadRegistryDword(pGpu, pKernelMIGManager, pRegParmStr, pData) \
8565 ((pKernelMIGManager)->bGlobalBootConfigUsed \
8566 ? osGetNvGlobalRegistryDword((pGpu), (pRegParmStr), (pData)) \
8567 : osReadRegistryDword((pGpu), (pRegParmStr), (pData)))
8568
8569 /*!
8570 * @brief Read MIG boot config from the registry
8571 *
8572 * @param[IN] pGpu
8573 * @param[IN] pKernelMIGManager
8574 */
8575 static NvBool
8576 _kmigmgrReadBootConfig
8577 (
8578 OBJGPU *pGpu,
8579 KernelMIGManager *pKernelMIGManager,
8580 MIG_BOOT_CONFIG *pBootConfig
8581 )
8582 {
8583 NvU32 data32;
8584 NvBool bCIAssignmentPresent = NV_FALSE;
8585
8586 ct_assert(NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_GI__SIZE == NV_ARRAY_ELEMENTS(pBootConfig->GIs));
8587 ct_assert(NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_CI__SIZE == NV_ARRAY_ELEMENTS(pBootConfig->CIs));
8588 ct_assert(NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_CI_ASSIGNMENT_GI__SIZE == NV_ARRAY_ELEMENTS(pBootConfig->CIs));
8589
8590 // Read GPU instance config regkeys
8591 for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(pBootConfig->GIs); i++)
8592 {
8593 char regStr[sizeof(NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_GI(0))];
8594
8595 nvDbgSnprintf(regStr, sizeof(regStr), NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_GI(%u), i);
8596 if (_kmigmgrReadRegistryDword(pGpu, pKernelMIGManager, regStr, &data32) != NV_OK)
8597 {
8598 // Do not break here, so we can later check whether there are any holes in the config
8599 continue;
8600 }
8601
8602 pBootConfig->GIs[i].flags = DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_GI, _FLAGS, data32);
8603 pBootConfig->GIs[i].placement.lo = DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_GI, _PLACEMENT_LO, data32);
8604 pBootConfig->GIs[i].placement.hi = DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_GI, _PLACEMENT_HI, data32);
8605
8606 if (DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_GI, _REQ_DEC_JPG_OFA, data32))
8607 {
8608 pBootConfig->GIs[i].flags |= DRF_DEF(2080, _CTRL_GPU_PARTITION_FLAG, _REQ_DEC_JPG_OFA, _ENABLE);
8609 }
8610
8611 if (!rangeIsEmpty(pBootConfig->GIs[i].placement))
8612 {
8613 pBootConfig->GIs[i].flags |= DRF_DEF(2080, _CTRL_GPU_PARTITION_FLAG, _PLACE_AT_SPAN, _ENABLE);
8614 }
8615
8616 NV_PRINTF(LEVEL_INFO, "Found a GI config regkey '%s': flags=0x%x, placementLo=%llu, placementHi=%llu\n",
8617 regStr, pBootConfig->GIs[i].flags, pBootConfig->GIs[i].placement.lo,
8618 pBootConfig->GIs[i].placement.hi);
8619
8620 // Ensure that the specified flags are valid
8621 if (!kmigmgrIsGPUInstanceCombinationValid_HAL(pGpu, pKernelMIGManager, pBootConfig->GIs[i].flags))
8622 {
8623 NV_PRINTF(LEVEL_ERROR, "Invalid partition flags 0x%x in %s\n", pBootConfig->GIs[i].flags, regStr);
8624 return NV_FALSE;
8625 }
8626
8627 pBootConfig->GIs[i].bValid = NV_TRUE;
8628 }
8629
8630 // Read compute instance assignment regkey
8631 if (_kmigmgrReadRegistryDword(pGpu, pKernelMIGManager, NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_CI_ASSIGNMENT, &data32) == NV_OK)
8632 {
8633 NV_PRINTF(LEVEL_INFO, "Found a CI assignment regkey '" NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_CI_ASSIGNMENT "': value=0x%x\n", data32);
8634
8635 for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(pBootConfig->CIs); i++)
8636 {
8637 pBootConfig->CIs[i].GIIdx = DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_CI_ASSIGNMENT, _GI(i), data32);
8638 }
8639
8640 bCIAssignmentPresent = NV_TRUE;
8641 }
8642
8643 // Read compute instance config regkeys
8644 for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(pBootConfig->CIs); i++)
8645 {
8646 char regStr[sizeof(NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_CI(0))];
8647
8648 nvDbgSnprintf(regStr, sizeof(regStr), NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_CI(%u), i);
8649 if (_kmigmgrReadRegistryDword(pGpu, pKernelMIGManager, regStr, &data32) != NV_OK)
8650 {
8651 // If the CI entry is not specified, its assignment should be 0
8652 if (pBootConfig->CIs[i].GIIdx != 0)
8653 {
8654 NV_PRINTF(LEVEL_ERROR, "CI assignment for GI #%u must be 0\n", i);
8655 return NV_FALSE;
8656 }
8657
8658 // Do not break here, so we can later check whether there are any holes in the config
8659 continue;
8660 }
8661
8662 // The CI assignment regkey must be present if there are any CIs being specified
8663 if (!bCIAssignmentPresent)
8664 {
8665 NV_PRINTF(LEVEL_ERROR, "Regkey '" NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_CI_ASSIGNMENT "' is missing\n");
8666 return NV_FALSE;
8667 }
8668
8669 pBootConfig->CIs[i].flags = DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_CI, _FLAGS, data32);
8670 pBootConfig->CIs[i].spanStart = DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_CI, _PLACEMENT_LO, data32);
8671 pBootConfig->CIs[i].ceCount = DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_CI, _CES, data32);
8672 pBootConfig->CIs[i].nvDecCount = DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_CI, _DECS, data32);
8673 pBootConfig->CIs[i].nvEncCount = DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_CI, _ENCS, data32);
8674 pBootConfig->CIs[i].nvJpgCount = DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_CI, _JPGS, data32);
8675 pBootConfig->CIs[i].ofaCount = DRF_VAL(_REG_STR_RM, _MIG_BOOT_CONFIGURATION_CI, _OFAS, data32);
8676 pBootConfig->CIs[i].bValid = NV_TRUE;
8677
8678 NV_PRINTF(LEVEL_INFO, "Found a CI config regkey '%s': flags=0x%x, placementLo=%u, CEs=%u, DECs=%u, ENCs=%u, JPGs=%u, OFAs=%u\n",
8679 regStr, pBootConfig->CIs[i].flags, pBootConfig->CIs[i].spanStart, pBootConfig->CIs[i].ceCount,
8680 pBootConfig->CIs[i].nvDecCount, pBootConfig->CIs[i].nvEncCount, pBootConfig->CIs[i].nvJpgCount,
8681 pBootConfig->CIs[i].ofaCount);
8682 }
8683
8684 // Check that the GPU instances config has no holes
8685 for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(pBootConfig->GIs) - 1; i++)
8686 {
8687 if (!pBootConfig->GIs[i].bValid && pBootConfig->GIs[i + 1].bValid)
8688 {
8689 NV_PRINTF(LEVEL_ERROR, "Regkey '" NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_GI(%u) "' is missing\n", i);
8690 return NV_FALSE;
8691 }
8692 }
8693
8694 // Check that the compute instances config has no holes
8695 for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(pBootConfig->CIs) - 1; i++)
8696 {
8697 if (!pBootConfig->CIs[i].bValid && pBootConfig->CIs[i + 1].bValid)
8698 {
8699 NV_PRINTF(LEVEL_ERROR, "Regkey '" NV_REG_STR_RM_MIG_BOOT_CONFIGURATION_CI(%u) "' is missing\n", i);
8700 return NV_FALSE;
8701 }
8702 }
8703
8704 return NV_TRUE;
8705 }
8706
8707 /*!
8708 * @brief Create GPU and compute instances based on the boot config
8709 *
8710 * @param[IN] pGpu
8711 * @param[IN] pKernelMIGManager
8712 */
8713 NV_STATUS
8714 kmigmgrRestoreFromBootConfig_PF
8715 (
8716 OBJGPU *pGpu,
8717 KernelMIGManager *pKernelMIGManager
8718 )
8719 {
8720 NV_STATUS status = NV_OK;
8721 NvU32 GIIdx;
8722 NvU32 CIIdx;
8723 RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
8724 NV2080_CTRL_GPU_SET_PARTITION_INFO partitionInfo[NV2080_CTRL_GPU_MAX_PARTITIONS] = {0};
8725 MIG_BOOT_CONFIG bootConfig = {0};
8726
8727 // Check that boot config is supported
8728 NV_CHECK_OR_RETURN(LEVEL_INFO, pKernelMIGManager->bBootConfigSupported, NV_OK);
8729
8730 // If MIG isn't enabled, nothing to do
8731 NV_CHECK_OR_RETURN(LEVEL_INFO, IS_MIG_ENABLED(pGpu), NV_OK);
8732
8733 // Read the boot config from the registry
8734 NV_CHECK_OR_RETURN(LEVEL_INFO,
8735 _kmigmgrReadBootConfig(pGpu, pKernelMIGManager, &bootConfig),
8736 NV_ERR_INVALID_PARAMETER);
8737
8738 // Create the GPU instances
8739 for (GIIdx = 0; GIIdx < NV_ARRAY_ELEMENTS(bootConfig.GIs); GIIdx++)
8740 {
8741 if (!bootConfig.GIs[GIIdx].bValid)
8742 {
8743 break;
8744 }
8745
8746 partitionInfo[GIIdx].bValid = NV_TRUE;
8747 partitionInfo[GIIdx].partitionFlag = bootConfig.GIs[GIIdx].flags;
8748 partitionInfo[GIIdx].placement.lo = bootConfig.GIs[GIIdx].placement.lo;
8749 partitionInfo[GIIdx].placement.hi = bootConfig.GIs[GIIdx].placement.hi;
8750
8751 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
8752 _kmigmgrProcessGPUInstanceEntry(pGpu, pKernelMIGManager, &partitionInfo[GIIdx]),
8753 cleanupGI);
8754 }
8755
8756 // Create the compute instances
8757 for (CIIdx = 0; CIIdx < NV2080_CTRL_GPU_MAX_PARTITIONS; CIIdx++)
8758 {
8759 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance = NULL;
8760 NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE computeProfileInfo = {0};
8761 NVC637_CTRL_EXEC_PARTITIONS_CREATE_PARAMS createParams = {0};
8762
8763 if (!bootConfig.CIs[CIIdx].bValid)
8764 {
8765 break;
8766 }
8767
8768 // Find what compute profile corresponds to the specified partition flags
8769 NV_CHECK_OK_OR_ELSE(status, LEVEL_ERROR,
8770 kmigmgrGetComputeProfileFromSize(pGpu, pKernelMIGManager, bootConfig.CIs[CIIdx].flags, &computeProfileInfo),
8771 NV_PRINTF(LEVEL_ERROR, "Invalid partition flags 0x%x for CI #%u\n", bootConfig.CIs[CIIdx].flags, CIIdx);
8772 goto cleanupCI);
8773
8774 NV_ASSERT_OK_OR_GOTO(status,
8775 kmigmgrGetGPUInstanceInfo(pGpu, pKernelMIGManager,
8776 partitionInfo[bootConfig.CIs[CIIdx].GIIdx].swizzId,
8777 &pKernelMIGGpuInstance),
8778 cleanupCI);
8779
8780 createParams.execPartCount = 1;
8781 createParams.flags = DRF_DEF(C637_CTRL, _DMA_EXEC_PARTITIONS_CREATE_REQUEST, _AT_SPAN, _TRUE);
8782 createParams.execPartInfo[0].gpcCount = computeProfileInfo.gpcCount;
8783 createParams.execPartInfo[0].smCount = computeProfileInfo.smCount;
8784 createParams.execPartInfo[0].computeSize = computeProfileInfo.computeSize;
8785 createParams.execPartInfo[0].ceCount = bootConfig.CIs[CIIdx].ceCount;
8786 createParams.execPartInfo[0].nvEncCount = bootConfig.CIs[CIIdx].nvEncCount;
8787 createParams.execPartInfo[0].nvDecCount = bootConfig.CIs[CIIdx].nvDecCount;
8788 createParams.execPartInfo[0].nvJpgCount = bootConfig.CIs[CIIdx].nvJpgCount;
8789 createParams.execPartInfo[0].ofaCount = bootConfig.CIs[CIIdx].ofaCount;
8790 createParams.execPartInfo[0].spanStart = bootConfig.CIs[CIIdx].spanStart;
8791 createParams.execPartInfo[0].sharedEngFlag = NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_CE |
8792 NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_NVDEC |
8793 NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_NVENC |
8794 NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_OFA |
8795 NVC637_CTRL_EXEC_PARTITIONS_SHARED_FLAG_NVJPG;
8796
8797 NV_ASSERT_OK_OR_GOTO(status,
8798 pRmApi->Control(pRmApi,
8799 pKernelMIGGpuInstance->instanceHandles.hClient,
8800 pKernelMIGGpuInstance->instanceHandles.hSubscription,
8801 NVC637_CTRL_CMD_EXEC_PARTITIONS_CREATE,
8802 &createParams,
8803 sizeof(createParams)),
8804 cleanupCI);
8805 }
8806
8807 return NV_OK;
8808
8809 cleanupCI:
8810 // Remove all compute instances on the created GPU instances
8811 for (NvU32 i = 0; i < CIIdx; i++)
8812 {
8813 NV_STATUS tmpStatus;
8814 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance = NULL;
8815 NVC637_CTRL_EXEC_PARTITIONS_GET_PARAMS getParams = {0};
8816 NVC637_CTRL_EXEC_PARTITIONS_DELETE_PARAMS deleteParams = {0};
8817
8818 NV_ASSERT_OK_OR_ELSE(tmpStatus,
8819 kmigmgrGetGPUInstanceInfo(pGpu, pKernelMIGManager, partitionInfo[i].swizzId, &pKernelMIGGpuInstance),
8820 continue);
8821
8822 NV_ASSERT_OK_OR_ELSE(tmpStatus,
8823 pRmApi->Control(pRmApi,
8824 pKernelMIGGpuInstance->instanceHandles.hClient,
8825 pKernelMIGGpuInstance->instanceHandles.hSubscription,
8826 NVC637_CTRL_CMD_EXEC_PARTITIONS_GET,
8827 &getParams,
8828 sizeof(getParams)),
8829 continue);
8830
8831 deleteParams.execPartCount = getParams.execPartCount;
8832 portMemCopy(deleteParams.execPartId, sizeof(deleteParams.execPartId),
8833 getParams.execPartId, sizeof(getParams.execPartId));
8834
8835 NV_ASSERT_OK(pRmApi->Control(pRmApi,
8836 pKernelMIGGpuInstance->instanceHandles.hClient,
8837 pKernelMIGGpuInstance->instanceHandles.hSubscription,
8838 NVC637_CTRL_CMD_EXEC_PARTITIONS_DELETE,
8839 &deleteParams,
8840 sizeof(deleteParams)));
8841 }
8842
8843 cleanupGI:
8844 // Invalidate the created GPU instances
8845 for (NvU32 i = 0; i < GIIdx; i++)
8846 {
8847 partitionInfo[i].bValid = NV_FALSE;
8848 NV_ASSERT_OK(_kmigmgrProcessGPUInstanceEntry(pGpu, pKernelMIGManager, &partitionInfo[i]));
8849 }
8850
8851 return status;
8852 }
8853
8854 /*!
8855 * Returns the smallest compute size (NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_*)
8856 * that is supported on this GPU.
8857 *
8858 * @param[in] pGpu - OBJGPU
8859 * @param[in] pKernelMIGManager - KernelMIGManager Object
8860 * @param[out] pSmallestComputeSize - NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_* flag for the smallest supported size.
8861 * This should be used only for MIG Supported GPUs.
8862 *
8863 * @return NV_OK on success,
8864 * NV_ERR_NOT_SUPPORTED when the MaxMIG count is unsupported
8865 * NV_ERR_INVALID_STATE When this is called before the internal data is initialized
8866 */
8867 NV_STATUS
8868 kmigmgrGetSmallestGpuInstanceSize_IMPL
8869 (
8870 OBJGPU *pGpu,
8871 KernelMIGManager *pKernelMIGManager,
8872 NvU32 *pSmallestComputeSize
8873 )
8874 {
8875 NvU32 maxMIG;
8876 KernelGraphicsManager *pKernelGraphicsManager = GPU_GET_KERNEL_GRAPHICS_MANAGER(pGpu);
8877
8878 NV_ASSERT_OR_RETURN(kgrmgrGetLegacyKGraphicsStaticInfo(pGpu, pKernelGraphicsManager)->bInitialized, NV_ERR_INVALID_STATE);
8879 NV_ASSERT_OR_RETURN(kgrmgrGetLegacyKGraphicsStaticInfo(pGpu, pKernelGraphicsManager)->pGrInfo != NULL, NV_ERR_INVALID_STATE);
8880
8881 maxMIG = kgrmgrGetLegacyKGraphicsStaticInfo(pGpu, pKernelGraphicsManager)->pGrInfo->infoList[NV2080_CTRL_GR_INFO_INDEX_MAX_MIG_ENGINES].data;
8882 switch (maxMIG)
8883 {
8884 case 8:
8885 *pSmallestComputeSize = NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_EIGHTH;
8886 break;
8887 case 1:
8888 *pSmallestComputeSize = NV2080_CTRL_GPU_PARTITION_FLAG_COMPUTE_SIZE_FULL;
8889 break;
8890 default:
8891 NV_PRINTF(LEVEL_ERROR, "maxMIG(%d) is unsupported\n", maxMIG);
8892 return NV_ERR_NOT_SUPPORTED;
8893 break;
8894 }
8895
8896 return NV_OK;
8897 }
8898 /*!
8899 * @brief Function to lookup a compute profile for a sm or gpc count. This function
8900 * has the caller provide a KERNEL_MIG_GPU_INSTANCE object, to catch requests
8901 * which lie outside the normal bounds of the GPU instance.
8902 * If the request is not at (or above) the GPU instances limit, then the CI
8903 * profile will be selected by using the smCountRequest first, and only use
8904 * gpcCount if the SM count look-up fails.
8905 *
8906 * @param[in] pGpu
8907 * @param[in] pKernelMIGManager
8908 * @param[in] pKernelMIGGpuInstance GPU instance for which the request was made
8909 * @param[in] smCountRequest SM Count to look up the associated compute profile
8910 * @param[in] gpcCountRequest GPC Count to look up the associated compute profile if SM lookup fails
8911 * @param[out] pProfile Pointer to NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE struct filled with
8912 * a copy of the compute profile info associated with the requested SM or GPC count.
8913 */
8914 NV_STATUS
8915 kmigmgrGetComputeProfileForRequest_IMPL
8916 (
8917 OBJGPU *pGpu,
8918 KernelMIGManager *pKernelMIGManager,
8919 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance,
8920 NvU32 smCountRequest,
8921 NvU32 gpcCountRequest,
8922 NV2080_CTRL_INTERNAL_MIGMGR_COMPUTE_PROFILE *pProfile
8923 )
8924 {
8925 NvU32 computeSize;
8926 NV_ASSERT_OR_RETURN(pProfile != NULL, NV_ERR_INVALID_ARGUMENT);
8927 NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance != NULL, NV_ERR_INVALID_ARGUMENT);
8928
8929 // If SM Count is >= the GI's total size, use GI's computeSize as CI profile
8930 computeSize = DRF_VAL(2080_CTRL_GPU, _PARTITION_FLAG, _COMPUTE_SIZE, pKernelMIGGpuInstance->partitionFlag);
8931 if (!IS_SILICON(pGpu) &&
8932 (pKernelMIGGpuInstance->resourceAllocation.smCount <= smCountRequest))
8933 {
8934 NV_PRINTF(LEVEL_INFO, "CI request is at GPU instance's limit. Using GPU instance's size: %d\n", computeSize);
8935 if (kmigmgrGetComputeProfileFromSize(pGpu, pKernelMIGManager, computeSize, pProfile) == NV_OK)
8936 return NV_OK;
8937 }
8938
8939 if (kmigmgrGetComputeProfileFromSmCount(pGpu, pKernelMIGManager, smCountRequest, pProfile) == NV_OK)
8940 {
8941 return NV_OK;
8942 }
8943
8944 if (!IS_SILICON(pGpu) && (pKernelMIGGpuInstance->resourceAllocation.gpcCount == gpcCountRequest))
8945 {
8946 NV_PRINTF(LEVEL_INFO, "CI request is at GPU instance's limit. Using GPU instance's size: %d\n", computeSize);
8947 if (kmigmgrGetComputeProfileFromSize(pGpu, pKernelMIGManager, computeSize, pProfile) == NV_OK)
8948 return NV_OK;
8949 }
8950
8951 // Do basic GPC look-up as last resort if all the above failed
8952 return kmigmgrGetComputeProfileFromGpcCount_HAL(pGpu, pKernelMIGManager, gpcCountRequest, pProfile);
8953 }
8954
8955 /*!
8956 * @brief Returns if all allocated VEIDs in a GPU instance are contiguous and
8957 * have no holes
8958 */
8959 NvBool
8960 kmigmgrIsPartitionVeidAllocationContiguous_IMPL
8961 (
8962 OBJGPU *pGpu,
8963 KernelMIGManager *pKernelMIGManager,
8964 KERNEL_MIG_GPU_INSTANCE *pKernelMIGGpuInstance
8965 )
8966 {
8967 NvU32 ciIdx;
8968 NvU64 instanceVeidMask = 0x0;
8969 NvU64 tempMask;
8970 NvU64 shift;
8971
8972 // Sanity checks
8973 NV_ASSERT_OR_RETURN(pKernelMIGGpuInstance != NULL, NV_FALSE);
8974
8975 for (ciIdx = 0; ciIdx < NV_ARRAY_ELEMENTS(pKernelMIGGpuInstance->MIGComputeInstance); ++ciIdx)
8976 {
8977 NvU32 veidStart;
8978 NvU32 veidEnd;
8979 MIG_COMPUTE_INSTANCE *pMIGComputeInstance = &pKernelMIGGpuInstance->MIGComputeInstance[ciIdx];
8980
8981 // Skip invalid compute instances
8982 if (!pMIGComputeInstance->bValid)
8983 continue;
8984
8985 veidStart = pMIGComputeInstance->resourceAllocation.veidOffset;
8986 veidEnd = veidStart + pMIGComputeInstance->resourceAllocation.veidCount - 1;
8987 instanceVeidMask |= DRF_SHIFTMASK64(veidEnd:veidStart);
8988 }
8989
8990 // If mask is fully populated or empty, no need to check
8991 if ((instanceVeidMask == 0) || (instanceVeidMask == NV_U64_MAX))
8992 return NV_TRUE;
8993
8994 // Count the zeros at the end to align mask to always start with "1"
8995 shift = portUtilCountTrailingZeros64(instanceVeidMask);
8996 tempMask = (instanceVeidMask >> shift);
8997
8998 //
8999 // If the above mask is contiguous "1s", an addition will result in next
9000 // pow-2, so simply check if we have only one-bit set or multiple.
9001 //
9002 return ONEBITSET(tempMask + 1);
9003 }
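
/*
 * Worked example of the check above: a packed allocation such as
 * instanceVeidMask == 0x38 (bits 3..5) shifts down to 0x7, and 0x7 + 1 == 0x8
 * has a single bit set, so it is contiguous. A holey allocation such as 0x28
 * (bits 3 and 5) shifts down to 0x5, and 0x5 + 1 == 0x6 has two bits set, so
 * it is not.
 */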
9004