1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "gpu_mgr/gpu_mgr.h"
25 #include "gpu/gpu.h"
26 #include "gpu/gpu_child_class_defs.h"
27 #include "gpu_mgr/gpu_mgr_sli.h"
28 #include "gpu/bif/kernel_bif.h"
29 #include "jt.h"
30 #include "published/maxwell/gm107/dev_bus.h"
31 #include "published/maxwell/gm107/dev_nv_xve.h"
32 #include "published/maxwell/gm107/dev_nv_xve1.h"
33 #include "published/maxwell/gm107/dev_fuse.h"
34 #include "published/maxwell/gm107/dev_pri_ringstation_sys.h"
35 
36 /*!
37  * @brief Returns SR-IOV capabilities
38  *
39  * @param[in]  pGpu           OBJGPU pointer
40  * @param[out] pParams        Pointer for get_sriov_caps params
41  *
42  * @returns NV_OK always
43  */
44 NV_STATUS
gpuGetSriovCaps_GM107(OBJGPU * pGpu,NV0080_CTRL_GPU_GET_SRIOV_CAPS_PARAMS * pParams)45 gpuGetSriovCaps_GM107
46 (
47     OBJGPU *pGpu,
48     NV0080_CTRL_GPU_GET_SRIOV_CAPS_PARAMS *pParams
49 )
50 {
51     pParams->bSriovEnabled = NV_FALSE;
52     return NV_OK;
53 }
54 
55 /*!
56  * @brief Read fuse for display supported status.
57  *        Some chips not marked displayless do not support display
58  */
59 NvBool
gpuFuseSupportsDisplay_GM107(OBJGPU * pGpu)60 gpuFuseSupportsDisplay_GM107
61 (
62     OBJGPU *pGpu
63 )
64 {
65     return GPU_FLD_TEST_DRF_DEF(pGpu, _FUSE, _STATUS_OPT_DISPLAY, _DATA, _ENABLE);
66 }
67 
68 /*!
69  * @brief gpuReadBusConfigRegEx_GM107
70  *
71  * param[in] pGpu         The GPU object pointer
72  * param[in] index        NvU32
73  * param[in] *data        NvU32 *
74  * param[in] pThreadState thread state node pointer
75  */
76 NV_STATUS
gpuReadBusConfigRegEx_GM107(OBJGPU * pGpu,NvU32 index,NvU32 * data,THREAD_STATE_NODE * pThreadState)77 gpuReadBusConfigRegEx_GM107
78 (
79     OBJGPU            *pGpu,
80     NvU32              index,
81     NvU32             *data,
82     THREAD_STATE_NODE *pThreadState
83 )
84 {
85 
86     if (index > (PCIE_CONFIG_SPACE_SIZE - sizeof(NvU32)))
87     {
88         NV_PRINTF(LEVEL_ERROR,
89                   "Offset 0x%08x exceeds range!\n",
90                   index);
91         NV_ASSERT(0);
92         return NV_ERR_GENERIC;
93     }
94 
95     *data = GPU_REG_RD32_EX(pGpu, DEVICE_BASE(NV_PCFG) + index, pThreadState);
96 
97     return NV_OK;
98 }
99 
100 /*!
101  * @brief gpuReadBusConfigReg_GM107()
102  *
103  * param[in] pGpu       The GPU object pointer
104  * param[in] index      NvU32
105  * param[in] *data      NvU32 *
106  */
107 NV_STATUS
gpuReadBusConfigReg_GM107(OBJGPU * pGpu,NvU32 index,NvU32 * data)108 gpuReadBusConfigReg_GM107
109 (
110     OBJGPU *pGpu,
111     NvU32   index,
112     NvU32  *data
113 )
114 {
115     return gpuReadBusConfigRegEx_HAL(pGpu, index, data, NULL);
116 }
117 
118 /*!
119  * @brief gpuWriteBusConfigReg_GM107
120  *
121  * param[in] pGpu       The GPU object pointer
122  * param[in] index      NvU32
123  * param[in] value      NvU32
124  */
125 NV_STATUS
gpuWriteBusConfigReg_GM107(OBJGPU * pGpu,NvU32 index,NvU32 value)126 gpuWriteBusConfigReg_GM107
127 (
128     OBJGPU *pGpu,
129     NvU32   index,
130     NvU32   value
131 )
132 {
133 
134     if (index > (PCIE_CONFIG_SPACE_SIZE - sizeof(NvU32)))
135     {
136         NV_PRINTF(LEVEL_ERROR,
137                   "Offset 0x%08x exceeds range!\n",
138                   index);
139         NV_ASSERT(0);
140         return NV_ERR_GENERIC;
141     }
142 
143     GPU_REG_WR32(pGpu, DEVICE_BASE(NV_PCFG) + index, value);
144 
145     return NV_OK;
146 }
147 
148 NV_STATUS
gpuReadFunctionConfigReg_GM107(OBJGPU * pGpu,NvU32 function,NvU32 index,NvU32 * data)149 gpuReadFunctionConfigReg_GM107
150 (
151     OBJGPU *pGpu,
152     NvU32   function,
153     NvU32   index,
154     NvU32  *data
155 )
156 {
157     NvU32  retval;
158 
159     if (index > (PCIE_CONFIG_SPACE_SIZE - sizeof(NvU32)))
160     {
161         NV_PRINTF(LEVEL_ERROR, "Offset 0x%08x exceeds range!\n", index);
162 
163         return NV_ERR_GENERIC;
164     }
165 
166     switch (function)
167     {
168         case 0:
169         {
170             retval = GPU_REG_RD32(pGpu, DEVICE_BASE(NV_PCFG) + index);
171             break;
172         }
173 
174         case 1:
175         {
176             if (IS_FMODEL(pGpu))
177             {
178                 //
179                 // Function 1 is not modeled on fmodel
180                 //
181                 *data = 0;
182                 return NV_OK;
183             }
184             else
185             {
186                 retval = GPU_REG_RD32(pGpu, DEVICE_BASE(NV_PCFG1) + index);
187             }
188             break;
189         }
190 
191         default:
192             NV_PRINTF(LEVEL_ERROR,
193                       "attempt to read cfg space of non-existant function %x\n",
194                       function);
195             return NV_ERR_GENERIC;
196     }
197 
198     *data = retval;
199     return NV_OK;
200 }
201 
202 
203 NV_STATUS
gpuWriteFunctionConfigReg_GM107(OBJGPU * pGpu,NvU32 function,NvU32 index,NvU32 data)204 gpuWriteFunctionConfigReg_GM107
205 (
206     OBJGPU *pGpu,
207     NvU32   function,
208     NvU32   index,
209     NvU32   data
210 )
211 {
212     return gpuWriteFunctionConfigRegEx_HAL(pGpu, function, index, data,
213                                            NULL /* threadstate */);
214 }
215 
216 NV_STATUS
gpuWriteFunctionConfigRegEx_GM107(OBJGPU * pGpu,NvU32 function,NvU32 index,NvU32 data,THREAD_STATE_NODE * pThreadState)217 gpuWriteFunctionConfigRegEx_GM107
218 (
219     OBJGPU            *pGpu,
220     NvU32              function,
221     NvU32              index,
222     NvU32              data,
223     THREAD_STATE_NODE *pThreadState
224 )
225 {
226     if (index > (PCIE_CONFIG_SPACE_SIZE - sizeof(NvU32)))
227     {
228         NV_PRINTF(LEVEL_ERROR, "Offset 0x%08x exceeds range!\n", index);
229 
230         return NV_ERR_INVALID_ARGUMENT;
231     }
232 
233     switch (function)
234     {
235         case 0:
236         {
237             GPU_REG_WR32_EX(pGpu, DEVICE_BASE(NV_PCFG) + index, data, pThreadState);
238             break;
239         }
240 
241         case 1:
242         {
243             //
244             // Function 1 is not modeled on fmodel
245             //
246             if (!IS_FMODEL(pGpu))
247             {
248                 GPU_REG_WR32_EX(pGpu, DEVICE_BASE(NV_PCFG1) + index, data, pThreadState);
249             }
250             break;
251         }
252 
253         default:
254             NV_PRINTF(LEVEL_ERROR,
255                       "attempt to read cfg space of non-existant function %x\n",
256                       function);
257             return NV_ERR_INVALID_ARGUMENT;
258     }
259 
260     return NV_OK;
261 }
262 
263 /*!
264  * @brief Perform gpu-dependent error handling for error during register read sanity check
265  *
266  * @param[in]       pGpu        GPU object pointer
267  * @param[in]       addr        Value address
268  * @param[in]       value       Value read during check
269  */
void
gpuHandleSanityCheckRegReadError_GM107
(
    OBJGPU *pGpu,
    NvU32 addr,
    NvU32 value
)
{
#if NV_PRINTF_ENABLED
    //
    // Read the interrupt status using the direct OS reg read call so we don't
    // recurse if we happen to see GPU_READ_PRI_ERROR_CODE there as well
    // (bug 799876)
    //
    NvU32 intr = osGpuReadReg032(pGpu, NV_PBUS_INTR_0);

    // To be sure, filter this down further by checking the related pri interrupts:
    // only report when a PRI squash/FECS-error/timeout interrupt is pending.
    if (FLD_TEST_DRF(_PBUS, _INTR_0, _PRI_SQUASH,  _PENDING, intr) ||
        FLD_TEST_DRF(_PBUS, _INTR_0, _PRI_FECSERR, _PENDING, intr) ||
        FLD_TEST_DRF(_PBUS, _INTR_0, _PRI_TIMEOUT, _PENDING, intr))
    {
#if NV_PRINTF_STRINGS_ALLOWED
        // Decode the bad read value into a SYS_PRI_ERROR_CODE name for the log.
        const char *errorString = "Unknown SYS_PRI_ERROR_CODE";

        gpuGetSanityCheckRegReadError_HAL(pGpu, value,
                                          &errorString);
        NV_PRINTF(LEVEL_ERROR,
                  "Possible bad register read: addr: 0x%x,  regvalue: 0x%x,  error code: %s\n",
                  addr, value, errorString);
#else // NV_PRINTF_STRINGS_ALLOWED
        NV_PRINTF(LEVEL_ERROR,
                  "Possible bad register read: addr: 0x%x,  regvalue: 0x%x\n",
                  addr, value);
#endif // NV_PRINTF_STRINGS_ALLOWED
    }
#endif // NV_PRINTF_ENABLED
}
306 
307 void
gpuGetIdInfo_GM107(OBJGPU * pGpu)308 gpuGetIdInfo_GM107(OBJGPU *pGpu)
309 {
310     NvU32 data;
311 
312     if (NV_OK != GPU_BUS_CFG_RD32(pGpu, NV_XVE_REV_ID, &data))
313     {
314         NV_PRINTF(LEVEL_ERROR, "unable to read NV_XVE_REV_ID\n");
315         return;
316     }
317 
318     // we only need the FIB and MASK values
319     pGpu->idInfo.PCIRevisionID = (data & ~GPU_DRF_SHIFTMASK(NV_XVE_REV_ID_CLASS_CODE));
320 
321     if (NV_OK != GPU_BUS_CFG_RD32(pGpu, NV_XVE_SUBSYSTEM, &data))
322     {
323         NV_PRINTF(LEVEL_ERROR, "unable to read NV_XVE_SUBSYSTEM\n");
324         return;
325     }
326     pGpu->idInfo.PCISubDeviceID = data;
327 
328     if (NV_OK != GPU_BUS_CFG_RD32(pGpu, NV_XVE_ID, &data))
329     {
330         NV_PRINTF(LEVEL_ERROR, "unable to read NV_XVE_ID\n");
331         return;
332     }
333 
334     pGpu->idInfo.PCIDeviceID = data;
335 
336 }
337 
338 // GM200 used on all later GPUs
339 
340 //
341 // Lists the order of GPU children for engine state transitions (StateInit, StateLoad,
342 // StateUnload and StateDestroy). This list controls only the engine order. Engine
343 // presence is defined by gpuGetChildrenPresent_HAL. Engines in this list that aren't in the
344 // gpuGetChildrenPresent_HAL list are ignored.
345 //
346 // List entries contain {CLASS-ID, flags} pairs.
347 //
348 // Valid flags are:
349 //   GCO_ALL - entry is used for all list types
350 //   GCO_LIST_INIT - entry is used for init ordering (DO NOT USE)
351 //   GCO_LIST_LOAD - entry is used for load and postload ordering (DO NOT USE)
352 //   GCO_LIST_UNLOAD - entry is used for unload and preunload ordering (DO NOT USE)
353 //   GCO_LIST_DESTROY - entry is used for destroy order (DO NOT USE)
354 //
355 // For UNLOAD and DESTROY the list is played back in reverse from LOAD and INIT.
356 //
357 // IMPORTANT:
//   <1> GCO_ALL is the recommended flag to use for all engine types. Engines should
//       always have a consistent order. If there are complicated dependencies that cannot
//       be resolved using this list, please use callbacks (such as fifoAddSchedulingHandler)
//   <2> DO NOT FORK THIS LIST. The goal is to have a single ordered list across all
//       chips. Inconsistent ordering makes it challenging to modify shared code to work
//       across all variations.
364 //
static const GPUCHILDORDER
gpuChildOrderList_GM200[] =
{
    {classId(OBJVBIOS),           GCO_ALL},
    {classId(ConfidentialCompute),      GCO_ALL},
    {classId(Pxuc),               GCO_ALL},
    {classId(OBJBIF),             GCO_ALL},
    {classId(KernelBif),          GCO_ALL},
    {classId(Nne),                GCO_ALL},
    {classId(NvDebugDump),        GCO_ALL},
    {classId(ClockManager),       GCO_ALL},
    {classId(Pmgr),               GCO_ALL},
    {classId(OBJVOLT),            GCO_ALL},
    {classId(OBJMC),              GCO_ALL},
    {classId(KernelMc),           GCO_ALL},
    {classId(PrivRing),           GCO_ALL},
    {classId(SwIntr),             GCO_ALL},
    {classId(Intr),               GCO_ALL},
    {classId(OBJTMR),             GCO_ALL},
    {classId(Therm),              GCO_ALL},
    {classId(OBJHSHUBMANAGER),    GCO_ALL},
    {classId(Hshub),              GCO_ALL},
    {classId(MemorySystem),       GCO_ALL},
    {classId(KernelMemorySystem), GCO_ALL},
    {classId(MemoryManager),      GCO_ALL},
    {classId(Nvlink),             GCO_ALL},
    {classId(KernelNvlink),       GCO_ALL},
    {classId(OBJHDACODEC),        GCO_ALL},
    {classId(OBJGMMU),            GCO_ALL},
    {classId(KernelGmmu),         GCO_ALL},
    {classId(OBJVMMU),            GCO_ALL},
    {classId(KernelSec2),         GCO_ALL},
    {classId(KernelGsp),          GCO_ALL},
    {classId(OBJBUS),             GCO_ALL},
    {classId(KernelBus),          GCO_ALL},
    {classId(OBJLSFM),            GCO_LIST_LOAD | GCO_LIST_UNLOAD | GCO_LIST_DESTROY}, // LOAD  LSFM must be before ACR and any managed Falcon.
    {classId(OBJACR),             GCO_LIST_LOAD | GCO_LIST_UNLOAD | GCO_LIST_DESTROY},
    {classId(Pmu),                GCO_LIST_LOAD | GCO_LIST_UNLOAD | GCO_LIST_DESTROY},
    {classId(KernelPmu),          GCO_LIST_LOAD | GCO_LIST_UNLOAD | GCO_LIST_DESTROY},
    {classId(Gsp),                GCO_ALL},
    {classId(OBJFSP),             GCO_ALL},
    {classId(KernelFsp),          GCO_ALL},
    {classId(OBJFBFLCN),          GCO_ALL},
    {classId(Lpwr   ),            GCO_LIST_LOAD | GCO_LIST_UNLOAD | GCO_LIST_DESTROY},
    {classId(Perf),               GCO_LIST_LOAD | GCO_LIST_UNLOAD | GCO_LIST_DESTROY}, // LOAD Perf is after PMU for perfmon_sampling to work
    {classId(KernelPerf),         GCO_LIST_LOAD | GCO_LIST_UNLOAD | GCO_LIST_DESTROY},
    //
    // Display-related engines (DISP/KernelDisplay/HDA/Fan) appear multiple
    // times with different list flags so their relative position differs per
    // list type: DESTROY here, INIT below, LOAD/UNLOAD further down.
    //
    {classId(OBJDISP),            GCO_LIST_DESTROY},
    {classId(KernelDisplay),      GCO_LIST_DESTROY},
    {classId(OBJHDA),             GCO_LIST_DESTROY},
    {classId(Fan),             GCO_LIST_DESTROY},
    {classId(VirtMemAllocator),   GCO_ALL},
    {classId(OBJDISP),            GCO_LIST_INIT},
    {classId(KernelDisplay),      GCO_LIST_INIT},
    {classId(OBJHDA),             GCO_LIST_INIT},
    {classId(Fan),             GCO_LIST_INIT},
    {classId(GraphicsManager),    GCO_ALL},
    {classId(MIGManager),         GCO_ALL},
    {classId(KernelMIGManager),   GCO_ALL},
    {classId(KernelGraphicsManager), GCO_ALL},
    {classId(Graphics),           GCO_ALL},                   // INIT GR has to be initialized before LSFM because
                                                            // the ucode image pointers needed by LSFM are only
                                                            // known after GR has loaded the netlist.

    {classId(KernelGraphics),     GCO_ALL},
    {classId(OBJLSFM),            GCO_LIST_INIT},
    {classId(OBJACR),             GCO_LIST_INIT},
    {classId(Pmu),                GCO_LIST_INIT},
    {classId(KernelPmu),          GCO_LIST_INIT},
    {classId(Lpwr   ),            GCO_LIST_INIT},
    {classId(Perf),               GCO_LIST_INIT},
    {classId(KernelPerf),         GCO_LIST_INIT},
    {classId(OBJBSP),             GCO_ALL},
    {classId(OBJCIPHER),          GCO_ALL},
    {classId(OBJDISP),            GCO_LIST_LOAD | GCO_LIST_UNLOAD},    // LOAD Display is *after* cipher so that hdcp keys can be loaded .
    {classId(KernelDisplay),      GCO_LIST_LOAD | GCO_LIST_UNLOAD},    // LOAD Display is *after* cipher so that hdcp keys can be loaded .
    {classId(OBJHDA),             GCO_LIST_LOAD | GCO_LIST_UNLOAD},
    {classId(Fan),             GCO_LIST_LOAD | GCO_LIST_UNLOAD},
    {classId(OBJCE),              GCO_ALL},
    {classId(KernelCE),           GCO_ALL},
    {classId(OBJMSENC),           GCO_ALL},
    {classId(OBJNVJPG),           GCO_ALL},
    {classId(OBJOFA),             GCO_ALL},
    {classId(OBJSEC2),            GCO_ALL},
    {classId(KernelFifo),              GCO_ALL},
    {classId(OBJFIFO),            GCO_ALL},
    {classId(OBJDPAUX),           GCO_ALL},
    {classId(OBJINFOROM),         GCO_ALL},
    {classId(OBJUVM),             GCO_ALL},
    // GPULOG is split: early for INIT/LOAD, late for UNLOAD/DESTROY, so the
    // log engine outlives GPUMON in the teardown direction.
    {classId(OBJGPULOG),          GCO_LIST_INIT | GCO_LIST_LOAD},
    {classId(OBJGPUMON),          GCO_ALL},
    {classId(OBJGPULOG),          GCO_LIST_UNLOAD | GCO_LIST_DESTROY},
    {classId(KernelHwpm),         GCO_ALL},
    {classId(OBJHWPM),            GCO_ALL},
    {classId(OBJSWENG),           GCO_ALL},
    {classId(OBJGRIDDISPLAYLESS), GCO_ALL},
    {classId(KernelCcu),      GCO_ALL},
};
462 
463 const GPUCHILDORDER *
gpuGetChildrenOrder_GM200(OBJGPU * pGpu,NvU32 * pNumEntries)464 gpuGetChildrenOrder_GM200(OBJGPU *pGpu, NvU32 *pNumEntries)
465 {
466     *pNumEntries = NV_ARRAY_ELEMENTS(gpuChildOrderList_GM200);
467     return gpuChildOrderList_GM200;
468 }
469 
470 //
471 // List of GPU children that present for the chip. List entries contain
472 // {CLASS-ID, # of instances} pairs, e.g.: {CE, 2} is 2 instance of OBJCE. This
473 // list controls only engine presence. Order is defined by
474 // gpuGetChildrenOrder_HAL.
475 //
476 // IMPORTANT: This function is to be deleted. Engine removal should instead be
477 // handled by <eng>ConstructEngine returning NV_ERR_NOT_SUPPORTED. PLEASE DO NOT
478 // FORK THIS LIST!
479 //
480 // List entries contain {CLASS-ID, # of instances} pairs.
481 //
// Each entry is {engine class, instance count}; every engine here is
// single-instance except KernelCE, which has 3 instances on GM200.
static const GPUCHILDPRESENT gpuChildrenPresent_GM200[] =
{
    GPU_CHILD_PRESENT(OBJTMR, 1),
    GPU_CHILD_PRESENT(KernelMIGManager, 1),
    GPU_CHILD_PRESENT(KernelGraphicsManager, 1),
    GPU_CHILD_PRESENT(KernelRc, 1),
    GPU_CHILD_PRESENT(Intr, 1),
    GPU_CHILD_PRESENT(NvDebugDump, 1),
    GPU_CHILD_PRESENT(OBJGPUMON, 1),
    GPU_CHILD_PRESENT(OBJSWENG, 1),
    GPU_CHILD_PRESENT(KernelBif, 1),
    GPU_CHILD_PRESENT(KernelBus, 1),
    GPU_CHILD_PRESENT(KernelCE, 3),
    GPU_CHILD_PRESENT(KernelDisplay, 1),
    GPU_CHILD_PRESENT(VirtMemAllocator, 1),
    GPU_CHILD_PRESENT(KernelMemorySystem, 1),
    GPU_CHILD_PRESENT(MemoryManager, 1),
    GPU_CHILD_PRESENT(KernelFifo, 1),
    GPU_CHILD_PRESENT(KernelGmmu, 1),
    GPU_CHILD_PRESENT(KernelGraphics, 1),
    GPU_CHILD_PRESENT(KernelHwpm, 1),
    GPU_CHILD_PRESENT(KernelMc, 1),
    GPU_CHILD_PRESENT(SwIntr, 1),
    GPU_CHILD_PRESENT(KernelPerf, 1),
    GPU_CHILD_PRESENT(KernelPmu, 1),
};
508 
509 const GPUCHILDPRESENT *
gpuGetChildrenPresent_GM200(OBJGPU * pGpu,NvU32 * pNumEntries)510 gpuGetChildrenPresent_GM200(OBJGPU *pGpu, NvU32 *pNumEntries)
511 {
512     *pNumEntries = NV_ARRAY_ELEMENTS(gpuChildrenPresent_GM200);
513     return gpuChildrenPresent_GM200;
514 }
515 
516 /*!
 * @brief   Checks for each type of bridge to determine what is available,
 *          then selects the SLI bridge to use.
 *
 * @param[In]   gpuCount    The number of GPUs to be checked for SLI links.
 * @param[In]   gpuMaskArg  A mask of the GPUs that are to be tested for SLI links.
 * @param[Out]  pSliLinkOutputMask  a mask of the GPUs that are attached to the type of
 *                  SLI link that is being used.
 * @param[Out]  pSliLinkCircular    a boolean indicating if the SLI link is circular.
 * @param[Out]  pSliLinkEndsMask    a mask indicating the endpoints of the SLI link,
 *                   if there are any.
 */
void
gpuDetectSliLinkFromGpus_GK104
(
    OBJGPU *pGpu,
    NvU32   gpuCount,
    NvU32   gpuMaskArg,
    NvU32  *pSliLinkOutputMask,
    NvBool *pSliLinkCircular,
    NvU32  *pSliLinkEndsMask,
    NvU32  *pVidLinkCount
)
{
    NvU32       i;
    // Per-bridge-type results, indexed by SLI_BT_* (two types on this HAL).
    NvU32       sliLinkOutputMask[SLI_MAX_BRIDGE_TYPES] = {0, 0};
    NvBool      bSliLinkCircular[SLI_MAX_BRIDGE_TYPES]  = {NV_FALSE, NV_FALSE};
    NvU32       sliLinkEndsMask[SLI_MAX_BRIDGE_TYPES]   = {0, 0};
    NvU32       vidLinkCount[SLI_MAX_BRIDGE_TYPES]      = {0, 0};
    OBJSYS     *pSys                                    = SYS_GET_INSTANCE();
    OBJGPUMGR  *pGpuMgr                                 = SYS_GET_GPUMGR(pSys);
    OBJGPU     *pGpuLoop;
    OBJGPU     *pGpuSaved;
    NvU32       gpuMask;
    // Array to store the link detection HAL flag of GpuDetectVidLinkFromGpus_HAL and GpuDetectNvlinkLinkFromGpus_HAL.
    NvU32       linkHalImpl[SLI_MAX_BRIDGE_TYPES];
    NvBool      bFoundBridge = NV_FALSE;

    // set the return values assuming we will not find an SLI link.
    *pSliLinkOutputMask = 0;
    *pSliLinkCircular   = NV_FALSE;
    *pSliLinkEndsMask   = 0;

    pGpuMgr->gpuBridgeType = SLI_BT_VIDLINK;

    //
    // Link detection HAL should have same HAL implementation as HAL flag.
    // This checks for mismatched HAL implementation flag.
    //
    NV_ASSERT_OR_RETURN_VOID(gpuGetSliLinkDetectionHalFlag_HAL(pGpu) == GPU_LINK_DETECTION_HAL_GK104);

    // Grab the first GPU in the mask; it anchors the NVLink HAL-flag comparison.
    i = 0;
    gpuMask = gpuMaskArg;
    pGpuLoop = gpumgrGetNextGpu(gpuMask, &i);
    if (pGpuLoop != NULL)
    {
        pGpuSaved = pGpuLoop;
        linkHalImpl[SLI_BT_NVLINK]  = gpuGetNvlinkLinkDetectionHalFlag_HAL(pGpuLoop);

    }
    else
    {
         // Empty GPU mask: nothing to detect.
         return;
    }

    // run thru the GPUs and see if they are all using the same HAL functions.
    // if they are different, we can't use the function to check for a bridge
    pGpuLoop = gpumgrGetNextGpu(gpuMask, &i);
    while (NULL != pGpuLoop)
    {
        if (linkHalImpl[SLI_BT_NVLINK] != gpuGetNvlinkLinkDetectionHalFlag_HAL(pGpuLoop))
        {
            // Mixed HAL implementations: stub out NVLink detection entirely.
            linkHalImpl[SLI_BT_NVLINK] = GPU_LINK_DETECTION_HAL_STUB;
        }
        pGpuLoop = gpumgrGetNextGpu(gpuMask, &i);
    }

    if (linkHalImpl[SLI_BT_NVLINK] != GPU_LINK_DETECTION_HAL_STUB)
    {
        gpuDetectNvlinkLinkFromGpus_HAL(pGpuSaved, gpuCount, gpuMaskArg,
                                        &sliLinkOutputMask[SLI_BT_NVLINK],
                                        &bSliLinkCircular[SLI_BT_NVLINK],
                                        &sliLinkEndsMask[SLI_BT_NVLINK],
                                        &vidLinkCount[SLI_BT_NVLINK]);
    }

    //
    // Determine which type of bridge we are going to support.
    // Currently we only support a single type of SLI bridge in the system.
    //
    for (i = 0; i < SLI_MAX_BRIDGE_TYPES; ++i)
    {
        if (sliLinkOutputMask[i] != 0)
        {
            if (bFoundBridge)
            {
                // A second bridge type with a non-empty mask is unsupported.
                NV_PRINTF(LEVEL_ERROR, "More than one type of SLI bridge detected!\n");
                NV_ASSERT(0);
                break;
            }
            else
            {
                // First bridge type found wins; publish its detection results.
                pGpuMgr->gpuBridgeType = (NvU8)i;
                *pSliLinkOutputMask = sliLinkOutputMask[i];
                *pSliLinkCircular = bSliLinkCircular[i];
                *pSliLinkEndsMask = sliLinkEndsMask[i];
                *pVidLinkCount = vidLinkCount[i];
                bFoundBridge = NV_TRUE;
            }
        }
    }
}
628 
629 /*!
630  * @brief Get error that arises during sanity check on a register read value
631  *
632  * @param[in]       pGpu             GPU object pointer
633  * @param[in]       value            Register value
634  * @param[out]      pErrorString     Error string pointer
635  *
636  * @return void
637  */
638 void
gpuGetSanityCheckRegReadError_GM107(OBJGPU * pGpu,NvU32 value,const char ** pErrorString)639 gpuGetSanityCheckRegReadError_GM107
640 (
641     OBJGPU *pGpu,
642     NvU32 value,
643     const char **pErrorString
644 )
645 {
646 #define PRI_ERROR(err)                                    \
647 if (DRF_VAL(_PPRIV, _SYS_PRI_ERROR, _CODE, value) == err) \
648 {                                                         \
649     *pErrorString = #err;                                 \
650 }
651     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_HOST_FECS_ERR);
652     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_HOST_PRI_TIMEOUT);
653     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_HOST_FB_ACK_TIMEOUT);
654     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_PRI_TIMEOUT);
655     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_PRI_DECODE);
656     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_PRI_RESET);
657     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_PRI_FLOORSWEEP);
658     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_PRI_STUCK_ACK);
659     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_PRI_0_EXPECTED_ACK);
660     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_PRI_FENCE_ERROR);
661     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_PRI_SUBID_ERROR);
662     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_PRI_ORPHAN);
663     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_DEAD_RING);
664     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_TRAP);
665     PRI_ERROR(NV_PPRIV_SYS_PRI_ERROR_CODE_FECS_PRI_CLIENT_ERR);
666 }
667