1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 /***************************** HW State Routines ***************************\
25 *                                                                           *
26 *         RM robust error journal (formerly RCDB)                           *
27 *                                                                           *
28 \***************************************************************************/
29 
30 #include "gpu_mgr/gpu_mgr.h"
31 #include "nvRmReg.h"
32 #include "nvBldVer.h"
33 #include "nvVer.h"
34 #include "os/os.h"
35 #include "core/system.h"
36 #include "gpu/gpu.h"
37 #include "diagnostics/journal.h"
38 #include "platform/chipset/chipset.h"
39 #include "diagnostics/nv_debug_dump.h"
40 #include "diagnostics/tracer.h"
41 #include "core/locks.h"
42 #include "rmapi/rs_utils.h"
43 #include "vgpu/rpc.h"
44 #include "gpu/bus/kern_bus.h"
45 #include "gpu/mem_mgr/mem_mgr.h"
46 #include "nvdevid.h"
47 #include "nvop.h"
48 #include "jt.h"
49 
50 
51 
52 #include "ctrl/ctrl0000/ctrl0000nvd.h"
53 
54 #include "nvlimits.h"
55 #include "Nvcm.h"
56 
57 #include "lib/protobuf/prb_util.h"
58 #include "g_all_dcl_pb.h"
59 #include "g_nvdebug_pb.h"
60 #include "nv_ref.h"
61 
62 #define NOCAT_UNKNOWN_STR                       "*** unknown ***"
63 #define NOCAT_NA_STR                            "N/A"
64 #define NOCAT_LEGACY_STR                        "LEGACY"
65 #define NOCAT_FULLCHIP_TDR_STR                  "FULL CHIP RESET"
66 #define NOCAT_BUS_RESET_TDR_STR                 "BUS RESET"
67 #define NOCAT_GC6_RESET_TDR_STR                 "GC6 RESET"
68 #define NOCAT_NORMAL_TDR_STR                    "NORMAL TDR"
69 #define NOCAT_UCODE_RESET_TDR_STR               "UCODE RESET"
70 #define NOCAT_SURPRISE_REMOVAL_TDR_STR          "SURPRISE REMOVAL"
71 #define NOCAT_DEFAULT_TAG_VALUE_STR             "prod"
72 #define NOCAT_DEFAULT_TDR_REASON_SRC_STR        "KMD"
73 #define NOCAT_FBSIZETESTED                      0x10
74 
75 // Reducing size to 4K for reducing non-paged allocations on win8
76 #define NVDUMP_DEBUGGER_BUFFER_SIZE (4 * 1024)
77 
78 #define JOURNAL_BUFFER_SIZE_DEFAULT    (4 * 1024)
79 
80 #define JOURNAL_ASSERT_RECORD_QUALIFYING_STACK_ENTRIES    5
81 
82 static void nvdDebuggerControlFunc(void);
83 
84 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
85 #if !defined(DEBUG) && !defined(QA_BUILD)
86 static NvBool rcdProbeGpuPresent(OBJGPU *pGpu, NvU64 ip);
87 static NvBool rcdProbeAllGpusPresent(NvU64 ip);
88 static volatile NvS32 probeGpuRecursion = 0;
89 #endif
90 #endif
91 static NvU32 _rcdbGetOcaRecordSize(Journal *pRcDB, RMCD_RECORD_TYPE type);
92 static volatile NvS32 concurrentRingBufferAccess = 0;
93 static volatile NvS32 assertListRecursion = 0;
94 static void rcdbFindRingBufferForType(Journal *pRcDB, RMCD_RECORD_TYPE recType, RING_BUFFER_LOG **ppRingBuffer);
95 static NV_STATUS _rcdbGetNocatJournalRecord(OBJRCDB* pRcdb,
96     NvU32 id, NvBool bExactMatch,
97     RmRCCommonJournal_RECORD** ppReturnedCommon, RM_NOCAT_JOURNAL_ENTRY** ppReturnedJournal);
98 static NV_STATUS _rcdbReleaseNocatJournalRecord(RM_NOCAT_JOURNAL_ENTRY* pReturnedJournal);
99 static NV_STATUS _rcdbNocatReportAssert(OBJGPU* pGpu, RmRCCommonAssert_RECORD* pAssert);
100 
101 // Global flag to make sure we never re-enter the nvLog code.
102 #if defined(DEBUG) || defined(ASSERT_BUILD) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS))
103 static volatile NvS32 nvLogRecursion = 0;
104 #endif
105 
106 // NvDump interface config - communicates with external kernel debuggers
107 NVDUMP_EXPORT volatile NV_DECLARE_ALIGNED(NVDUMP_CONFIG, 8) nvDumpConfig =
108 {
109     NVDUMP_CONFIG_SIGNATURE, // sigHead
110     NvP64_NULL, // debuggerControlFuncAddr
111     { NvP64_NULL, NVDUMP_DEBUGGER_BUFFER_SIZE, 0 }, // buffer
112     0, // gpuSelect
113     NVDUMP_COMPONENT_SYS_ALL, // component
114     NVDUMP_STATUS_IDLE, // dumpStatus
115     NV_OK, // rmStatus
116 
117     NVDUMP_CONFIG_SIGNATURE // sigTail
118 };
119 
120 void
rcdbDestruct_IMPL(Journal * pRcDB)121 rcdbDestruct_IMPL(Journal *pRcDB)
122 {
123     EVENT_JOURNAL *pJournal = &pRcDB->Journal;
124 
125     // Deallocate NvDebug debugger dump buffer.
126     if (nvDumpConfig.buffer.address != NvP64_NULL)
127     {
128         portMemFree(NvP64_VALUE(nvDumpConfig.buffer.address));
129         nvDumpConfig.buffer.address = NvP64_NULL;
130     }
131 
132     // Delete Journal and Btree
133     if (pJournal->pBuffer != NULL)
134     {
135         portMemFree(pJournal->pBuffer);
136         portMemFree(pJournal->AssertList.ppList);
137 
138         // clear journal of anything
139         portMemSet(pJournal, 0, sizeof(EVENT_JOURNAL));
140     }
141 
142     rcdbClearErrorHistory(pRcDB);
143 
144     rcdbDestroyRingBufferCollection(pRcDB);
145 
146     portMemFree(pRcDB->previousDriverVersion);
147     pRcDB->previousDriverVersion = NULL;
148 
149     portMemFree(pRcDB->previousDriverBranch);
150     pRcDB->previousDriverBranch = NULL;
151 }
152 
153 static void
_initJournal(EVENT_JOURNAL * pJournal,NvU32 size)154 _initJournal(EVENT_JOURNAL *pJournal, NvU32 size)
155 {
156     // verify we are not abandoning any memory allocations.
157     NV_ASSERT(NULL == pJournal->pBuffer);
158     NV_ASSERT(NULL == (NvU8*) pJournal->AssertList.ppList);
159 
160     // init the Journal to an empty buffer.
161     pJournal->pBuffer = NULL;
162     pJournal->BufferSize = 0;
163     pJournal->pFree = pJournal->pBuffer;
164     pJournal->BufferRemaining = pJournal->BufferSize;
165     pJournal->pCurrCollection = NULL;
166     pJournal->RecordCount = 0;
167 
168     // init the assert list to an empty buffer.
169     pJournal->AssertList.ppList = NULL;
170     pJournal->AssertList.Size = 0;
171     pJournal->AssertList.Count = 0;
172     pJournal->AssertList.QualifyingStackSize = JOURNAL_ASSERT_RECORD_QUALIFYING_STACK_ENTRIES;
173 
174     // allocate and initialize journal buffer memory
175     pJournal->pBuffer = portMemAllocNonPaged(size);
176     if (pJournal->pBuffer != NULL )
177     {
178         pJournal->BufferSize = size;
179         pJournal->pFree = pJournal->pBuffer;
180         pJournal->BufferRemaining = pJournal->BufferSize;
181 
182         // if the journal is large enough to hold at least one assert record,
183         // init the assert list as well.
184         if (sizeof(RmRCCommonAssert_RECORD) <= pJournal->BufferSize)
185         {
186             pJournal->AssertList.Size = pJournal->BufferSize / sizeof(RmRCCommonAssert_RECORD);
187             pJournal->AssertList.ppList = portMemAllocNonPaged(pJournal->AssertList.Size * sizeof(pJournal->AssertList.ppList[0]));
188             if (pJournal->AssertList.ppList == NULL )
189             {
190                 NV_PRINTF(LEVEL_ERROR,
191                           "Failure to allocate RC assert tracking buffer \n");
192                 pJournal->AssertList.Size = 0;
193             }
194         }
195     }
196     else
197     {
198         NV_PRINTF(LEVEL_ERROR, "Failure to allocate RC journal buffer \n");
199     }
200 }
201 
202 NV_STATUS
rcdbConstruct_IMPL(Journal * pRcDB)203 rcdbConstruct_IMPL(Journal *pRcDB)
204 {
205     EVENT_JOURNAL *pJournal = &pRcDB->Journal;
206     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
207     NvU32 i;
208     void *pBuffer;
209 
210     // Time parameters
211     NvU32 sec, usec;
212     NvU64 timeStamp;
213     NvU64 systemTime;
214     NvU64 timeStampFreq;
215 
216     _initJournal(pJournal, JOURNAL_BUFFER_SIZE_DEFAULT);
217 
218     portMemSet(pRingBufferColl, 0x00, sizeof(pRcDB->RingBufferColl));
219 
220     pRcDB->BugcheckCount = 0;
221 
222     // Allocate NvDebug debugger dump buffer.
223     pBuffer = portMemAllocNonPaged(nvDumpConfig.buffer.size);
224     if (pBuffer != NULL)
225     {
226         nvDumpConfig.buffer.address = NV_SIGN_EXT_PTR_TO_NvP64(pBuffer);
227     }
228     else
229     {
230         NV_PRINTF(LEVEL_ERROR,
231                   "failed to allocate NVD debugger dump buffer\n");
232     }
233 
234     // Initialize NvDebug debugger function address.
235     nvDumpConfig.debuggerControlFuncAddr = NV_SIGN_EXT_PTR_TO_NvP64(nvdDebuggerControlFunc);
236 
237     //
238     // Create RC Diagnostic report Wrap Buffer
239     //
240     if (NULL == rcdbCreateRingBuffer(pRcDB, RmRcDiagReport, MAX_RCDB_RCDIAG_WRAP_BUFF))
241     {
242         NV_PRINTF(LEVEL_ERROR, "failed to allocate RC Diagnostic Ring Buffer\n");
243     }
244     // init the RC error report data
245     pRcDB->RcErrRptNextIdx = 0;
246     pRcDB->RcErrRptRecordsDropped = NV_FALSE;
247 
248     // Initialize RC Error Counters.
249     for ( i = 0  ;  i < MAX_RC_ERROR_COUNTER  ;  i++)
250     {
251         pRcDB->rcErrorCounterArray[i].rcErrorType  = RC_ERROR_COUNTER_TYPE_INVALID;
252         pRcDB->rcErrorCounterArray[i].rcErrorCount = 0;
253         pRcDB->rcErrorCounterArray[i].rcLastCHID   = INVALID_CHID;
254         pRcDB->rcErrorCounterArray[i].rcLastTime   = 0;
255     }
256      pRcDB->rcErrorCounterArray[RC_ERROR_COUNTER_OTHER_INDEX].rcErrorType  = RC_ERROR_COUNTER_OTHER_TYPE;
257 
258      // clear the Nocat Queue descriptors & counters
259      portMemSet(&pRcDB->nocatJournalDescriptor, 0x00, sizeof(pRcDB->nocatJournalDescriptor));
260      portMemSet(pRcDB->nocatJournalDescriptor.lastRecordId, 0xff, sizeof(pRcDB->nocatJournalDescriptor.lastRecordId));
261      pRcDB->nocatJournalDescriptor.nocatLastRecordType = NV2080_NOCAT_JOURNAL_REC_TYPE_UNKNOWN;
262      pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks = NOCAT_CACHE_FRESHNESS_PERIOD_MS;
263      pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks *= osGetTimestampFreq();
264      pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks /= 1000ULL;
265 
266      //
267      // Create NOCAT report Wrap Buffer
268      //
269      if (NULL == rcdbCreateRingBuffer(pRcDB, RmNocatReport, MAX_RCDB_NOCAT_WRAP_BUFF))
270      {
271          NV_PRINTF(LEVEL_ERROR, "failed to allocate NOCAT Ring Buffer\n");
272      }
273 
274      // Save params for timestamp conversion
275      timeStampFreq = osGetTimestampFreq();
276      timeStamp = osGetTimestamp();
277      osGetCurrentTime(&sec, &usec);
278      systemTime = ((NvU64)sec * 1000000) + (NvU64)usec;
279 
280      pRcDB->systemTimeReference = systemTime - ((timeStamp * 1000000) / timeStampFreq);
281      pRcDB->timeStampFreq = timeStampFreq;
282 
283      return NV_OK;
284 }
285 
286 //
287 // Retrieve the previous driver version from volatile registry entires
288 // and then save the current driver version for next time.
289 //
rcdbSavePreviousDriverVersion_IMPL(OBJGPU * pGpu,Journal * pRcDB)290 NV_STATUS rcdbSavePreviousDriverVersion_IMPL
291 (
292     OBJGPU  *pGpu,
293     Journal *pRcDB
294 )
295 {
296     NV_STATUS nvStatus = NV_OK;
297 
298     NvU32     regEntrySize = 0;
299     NvU32     changeListNum = NV_LAST_OFFICIAL_CHANGELIST_NUM;
300 
301     // Only run this code only once each time the driver is loaded.
302     if (pRcDB->bPrevDriverCodeExecuted)
303         return NV_OK;
304 
305     pRcDB->bPrevDriverCodeExecuted = NV_TRUE;
306 
307     //
308     // Get the previous driver version information
309     // from volatile registry settings.
310     //
311     nvStatus = osReadRegistryVolatileSize(pGpu,
312         NV_REG_STR_RM_RC_PREV_DRIVER_VERSION, &regEntrySize);
313 
314     // Early exit if this platform does not support volatile registry.
315     if (nvStatus == NV_ERR_NOT_SUPPORTED)
316         return NV_OK;
317 
318     if ((NV_OK == nvStatus) && (0 != regEntrySize))
319     {
320         //
321         // Previous driver version is there, so assume all previous driver
322         // information is there as well.
323         //
324         pRcDB->previousDriverVersion = portMemAllocNonPaged(regEntrySize + 1);
325         if (pRcDB->previousDriverVersion == NULL)
326         {
327             nvStatus = NV_ERR_NO_MEMORY;
328             DBG_BREAKPOINT();
329             goto rcdbSavePreviousDriverVersion_writeRegistry;
330         }
331 
332         nvStatus = osReadRegistryVolatile(pGpu,
333                                      NV_REG_STR_RM_RC_PREV_DRIVER_VERSION,
334                                      (NvU8 *)pRcDB->previousDriverVersion,
335                                      regEntrySize);
336         if (nvStatus != NV_OK)
337         {
338             DBG_BREAKPOINT();
339             goto rcdbSavePreviousDriverVersion_writeRegistry;
340         }
341         pRcDB->previousDriverVersion[regEntrySize] = 0;
342 
343         nvStatus = osReadRegistryVolatileSize(pGpu,
344             NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH, &regEntrySize);
345         if ((nvStatus != NV_OK) || (0 == regEntrySize))
346         {
347             DBG_BREAKPOINT();
348             goto rcdbSavePreviousDriverVersion_writeRegistry;
349         }
350 
351         pRcDB->previousDriverBranch = portMemAllocNonPaged(regEntrySize + 1);
352         if (pRcDB->previousDriverBranch == NULL)
353         {
354             nvStatus = NV_ERR_NO_MEMORY;
355             DBG_BREAKPOINT();
356             goto rcdbSavePreviousDriverVersion_writeRegistry;
357         }
358 
359         nvStatus = osReadRegistryVolatile(pGpu,
360                                          NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH,
361                                          (NvU8 *)pRcDB->previousDriverBranch,
362                                          regEntrySize);
363         if (nvStatus != NV_OK)
364         {
365             DBG_BREAKPOINT();
366             goto rcdbSavePreviousDriverVersion_writeRegistry;
367         }
368         pRcDB->previousDriverBranch[regEntrySize] = 0;
369 
370         nvStatus = osReadRegistryVolatile(pGpu,
371                                      NV_REG_STR_RM_RC_PREV_DRIVER_CHANGELIST,
372                                      (NvU8 *)&pRcDB->prevDriverChangelist,
373                                      sizeof(pRcDB->prevDriverChangelist));
374         if (nvStatus != NV_OK)
375         {
376             DBG_BREAKPOINT();
377             goto rcdbSavePreviousDriverVersion_writeRegistry;
378         }
379 
380         nvStatus = osReadRegistryVolatile(pGpu,
381                                      NV_REG_STR_RM_RC_PREV_DRIVER_LOAD_COUNT,
382                                      (NvU8 *)&pRcDB->driverLoadCount,
383                                      sizeof(pRcDB->driverLoadCount));
384         if (nvStatus != NV_OK)
385         {
386             DBG_BREAKPOINT();
387             goto rcdbSavePreviousDriverVersion_writeRegistry;
388         }
389     }
390 
391     // Always write out the driver info, even if there was an error reading it.
392 rcdbSavePreviousDriverVersion_writeRegistry:
393     pRcDB->driverLoadCount++;
394 
395     osWriteRegistryVolatile(pGpu,
396                             NV_REG_STR_RM_RC_PREV_DRIVER_VERSION,
397                             (NvU8 *)NV_VERSION_STRING,
398                             sizeof(NV_VERSION_STRING));
399 
400     osWriteRegistryVolatile(pGpu,
401                             NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH,
402                             (NvU8 *)NV_BUILD_BRANCH_VERSION,
403                             sizeof(NV_BUILD_BRANCH_VERSION));
404 
405     osWriteRegistryVolatile(pGpu,
406                             NV_REG_STR_RM_RC_PREV_DRIVER_CHANGELIST,
407                             (NvU8 *)&changeListNum,
408                             sizeof(changeListNum));
409 
410     osWriteRegistryVolatile(pGpu,
411                             NV_REG_STR_RM_RC_PREV_DRIVER_LOAD_COUNT,
412                             (NvU8 *)&pRcDB->driverLoadCount,
413                             sizeof(pRcDB->driverLoadCount));
414 
415     return nvStatus;
416 }
417 
rcdbAddAssertJournalRecWithLine(void * pVoidGpu,NvU32 lineNum,void ** ppRec,NvU8 jGroup,NvU8 type,NvU16 size,NvU32 level,NvU64 key)418 NV_STATUS rcdbAddAssertJournalRecWithLine(void *pVoidGpu, NvU32 lineNum, void** ppRec, NvU8 jGroup, NvU8 type, NvU16 size, NvU32 level, NvU64 key)
419 {
420     OBJSYS                     *pSys;
421     Journal                    *pRcDB;
422     OBJGPU                     *pPossibleNULLGpu;
423     JOURNAL_ASSERT_LIST        *pAssertList;
424     RmRCCommonAssert_RECORD     newAssertRec;
425     RmRCCommonAssert_RECORD    *pAssertRec;
426     NV_STATUS                   rmStatus = NV_ERR_GENERIC;
427     NvU32                       i;
428 
429     //
430     // Note: we allow NULL pGpu here, as many clients (such as KMD)
431     // do not have access to pGpu.  And much of the RM does not provide this either.
432     //
433     pPossibleNULLGpu = reinterpretCast(pVoidGpu, OBJGPU *);
434 
435     pSys = SYS_GET_INSTANCE();
436     if (!pSys)
437     {
438         return NV_ERR_INVALID_STATE;
439     }
440 
441     pRcDB = SYS_GET_RCDB(pSys);
442     if (!pRcDB)
443     {
444         return NV_ERR_INVALID_STATE;
445     }
446 
447     pAssertList = &pRcDB->Journal.AssertList;
448 
449     *ppRec = NULL;
450 
451     RMTRACE_PROBE4_PRIMTYPE(rcjournal, assertlog, NvU32, (pPossibleNULLGpu ? pPossibleNULLGpu->gpuId : 0), NvU8, type, NvU32, level, NvU64, key);
452 
453     // create a local instance of the Assert record.
454     portMemSet(&newAssertRec, 0x00, sizeof(newAssertRec));
455     rcdbSetCommonJournalRecord(pPossibleNULLGpu, &newAssertRec.common);
456     newAssertRec.count = 1;
457     newAssertRec.breakpointAddrHint = key;
458     newAssertRec.lineNum = lineNum;
459 
460     if (pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_COMPRESS))
461     {
462         // search for a pre-existing assert record with the same stack
463         for (i = 0; i < pAssertList->Count; ++i)
464         {
465             pAssertRec = pAssertList->ppList[i];
466             if ((newAssertRec.breakpointAddrHint == pAssertRec->breakpointAddrHint) &&
467                 (0 == portMemCmp(newAssertRec.callStack, pAssertRec->callStack,
468                     sizeof(newAssertRec.callStack[0]) * pAssertList->QualifyingStackSize)))
469             {
470                 pAssertRec->count++;
471                 pAssertRec->lastTimeStamp = newAssertRec.common.timeStamp;
472 
473                 rmStatus = NV_OK;
474                 break;
475             }
476         }
477     }
478 
479     if (rmStatus != NV_OK)
480     {
481         // Discard to avoid reentry from messing up record array.
482         if (portAtomicIncrementS32(&assertListRecursion) == 1)
483         {
484             rmStatus = rcdbAllocNextJournalRec(pRcDB, (NVCD_RECORD **)&pAssertRec, jGroup, type, size);
485             if (NV_OK == rmStatus)
486             {
487                 // the Header is filled in when the record is allocated, so update the local instance header.
488                 newAssertRec.common.Header = pAssertRec->common.Header;
489                 *pAssertRec = newAssertRec;
490                 if (pAssertList->Count < pAssertList->Size)
491                 {
492                     pAssertList->ppList[pAssertList->Count] = pAssertRec;
493                     ++(pAssertList->Count);
494                 }
495                 else
496                 {
497                     // based on the way the assert list size is calculated this should never happen....
498                     NV_PRINTF(LEVEL_ERROR,
499                               "failed to insert tracking for assert record\n");
500                 }
501             }
502         }
503         portAtomicDecrementS32(&assertListRecursion);
504     }
505 
506     if (rmStatus == NV_OK)
507     {
508         RMTRACE_RMJOURNAL(_ASSERTLOG, (pPossibleNULLGpu ? pPossibleNULLGpu->gpuId : RMTRACE_UNKNOWN_GPUID),
509                                       type,
510                                       jGroup,
511                                       key,
512                                       pAssertRec->count,
513                                       pAssertRec->common.timeStamp,
514                                       pAssertRec->lastTimeStamp);
515         *ppRec = pAssertRec;
516 
517         _rcdbNocatReportAssert(pPossibleNULLGpu, pAssertRec);
518     }
519     else
520     {
521         _rcdbNocatReportAssert(pPossibleNULLGpu, &newAssertRec);
522     }
523 
524     return rmStatus;
525 }
526 
rcdbAddAssertJournalRec(void * pVoidGpu,void ** ppRec,NvU8 jGroup,NvU8 type,NvU16 size,NvU32 level,NvU64 key)527 NV_STATUS rcdbAddAssertJournalRec(void *pVoidGpu, void** ppRec, NvU8 jGroup, NvU8 type, NvU16 size, NvU32 level, NvU64 key)
528 {
529     return rcdbAddAssertJournalRecWithLine(pVoidGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, ppRec, jGroup, type, size, level, key);
530 }
531 // Populate stateMask with flags that represent the power state and other useful things.
_getCommonJournalStateMask(OBJGPU * pGpu)532 static NvU64 _getCommonJournalStateMask(OBJGPU *pGpu)
533 {
534     NvU64 stateMask = REF_NUM(NV_RM_JOURNAL_STATE_MASK_GC6_STATE,
535         pGpu->gc6State.currentState);
536 
537     if (!gpuIsGpuFullPower(pGpu))
538         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_NOT_FULL_POWER;
539 
540     if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_CONNECTED))
541         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_NOT_CONNECTED;
542 
543     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_STANDBY))
544         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_STANDBY;
545 
546     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_HIBERNATE))
547         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_HIBERNATE;
548 
549     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH))
550         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_PM_CODEPATH;
551 
552     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET))
553         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_GC6_RESET;
554 
555     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET))
556         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_FULLCHIP_RESET;
557 
558     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET))
559         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_SEC_BUS_RESET;
560 
561     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_TIMEOUT_RECOVERY))
562         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_TIMEOUT_RECOVERY;
563 
564     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
565         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_LOST;
566 
567     return stateMask;
568 }
569 
570 // Fill in the common portion of the journal structure.
571 void
rcdbSetCommonJournalRecord(OBJGPU * pGpu,RmRCCommonJournal_RECORD * pRec)572 rcdbSetCommonJournalRecord
573 (
574     OBJGPU *pGpu,
575     RmRCCommonJournal_RECORD *pRec
576 )
577 {
578     OS_THREAD_HANDLE threadId;
579 
580     pRec->timeStamp = osGetTimestamp();
581     pRec->GPUTag    = 0;
582     pRec->CPUTag    = 0;
583     pRec->stateMask = 0;
584 
585     if (pGpu)
586     {
587         pRec->GPUTag    = pGpu->gpuId;
588         pRec->stateMask = _getCommonJournalStateMask(pGpu);
589     }
590 
591     if (NV_OK == osGetCurrentThread(&threadId))
592     {
593         pRec->CPUTag = (NvU64)threadId;
594     }
595 }
596 
597 NV_STATUS
rcdbAddBugCheckRec_IMPL(OBJGPU * pGpu,Journal * pRcDB,NvU32 bugCheckCode)598 rcdbAddBugCheckRec_IMPL
599 (
600     OBJGPU  *pGpu,
601     Journal *pRcDB,
602     NvU32    bugCheckCode
603 )
604 {
605     RmJournalBugcheck_RECORD *pRec;
606     NV_STATUS                 rmStatus;
607 
608     rmStatus = rcdbAllocNextJournalRec(pRcDB,
609                                        (NVCD_RECORD **)&pRec,
610                                        RmGroup,
611                                        RmJournalBugCheck,
612                                        sizeof(*pRec));
613     if (NV_OK == rmStatus)
614     {
615         rcdbSetCommonJournalRecord(pGpu, &pRec->common);
616         pRec->bugCheckCode = bugCheckCode;
617     }
618 
619      pRcDB->BugcheckCount++;
620 
621     return rmStatus;
622 }
623 
624 NV_STATUS
rcdbAddPowerStateRec_IMPL(OBJGPU * pGpu,Journal * pRcDB,NvU32 powerEvent,NvU32 state,NvU32 fastBootPowerState)625 rcdbAddPowerStateRec_IMPL
626 (
627     OBJGPU  *pGpu,
628     Journal *pRcDB,
629     NvU32    powerEvent,
630     NvU32    state,
631     NvU32    fastBootPowerState
632 )
633 {
634     RmPowerState_RECORD       newRmDiagWrapBuffRec;
635 
636     // Create Records, then write it.
637     newRmDiagWrapBuffRec.powerState = state;
638     newRmDiagWrapBuffRec.powerEvent = powerEvent;
639     newRmDiagWrapBuffRec.fastBootPowerState = fastBootPowerState;
640     rcdbAddRecToRingBuffer(pGpu, pRcDB, RmPowerState,
641                               sizeof(RmPowerState_RECORD), (NvU8 *)&newRmDiagWrapBuffRec);
642     return NV_OK;
643 }
644 
645 NV_STATUS
rcdbGetRcDiagRecBoundaries_IMPL(Journal * pRcDB,NvU16 * pStart,NvU16 * pEnd,NvU32 owner,NvU32 processId)646 rcdbGetRcDiagRecBoundaries_IMPL
647 (
648     Journal  *pRcDB,
649     NvU16    *pStart,
650     NvU16    *pEnd,
651     NvU32     owner,
652     NvU32     processId
653 )
654 {
655     NV_STATUS                   status = NV_ERR_MISSING_TABLE_ENTRY;
656     RmRCCommonJournal_RECORD   *pCommon;
657     RmRcDiag_RECORD            *pRecord = NULL;
658     RING_BUFFER_LOG            *pRingBuffer = NULL;
659     NvU32                       i;
660     NvU16                       logicalStartIdx;
661     NvU16                       start = 0;
662     NvU16                       end = 0;
663     NvBool                      foundStart = NV_FALSE;
664     NvBool                      foundEnd = NV_FALSE;
665 
666     // scan the buffer to find all the qualified records & return the
667     // first & last indicies of the qualified records found.
668 
669     // Get the Diag Report Ring buffer.
670     rcdbFindRingBufferForType(pRcDB, RmRcDiagReport, &pRingBuffer);
671 
672     // attempt to claim ownership
673     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
674     {
675         // get the logical start of the buffer.
676         logicalStartIdx = pRingBuffer->headIndex;
677 
678         // run thru all the entries in the buffer, start to end, until we find the start & end of the range we are looking for.
679         for (i = 0; i < pRingBuffer->numEntries; ++i)
680         {
681             // get a pointer to the record from the buffer.
682             pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport) * ((logicalStartIdx + i) % pRingBuffer->maxEntries)));
683             pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);
684 
685             // check to see if the record qualifies
686             if (((RCDB_RCDIAG_DEFAULT_OWNER != owner) && (pRecord->owner != owner) && (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID != owner))
687                 || ((NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID != processId) && (pRecord->processId != processId)))
688             {
689                 continue;
690             }
691             switch (foundStart)
692             {
693             case NV_FALSE:
694                 // check if this is a start record.
695                 // we want the first record to be a start record to insure that all the reports that are in the range are complete
696                 // (I.E. we didn't wrap over the first record of a report)
697                 if (0 != (pRecord->flags & NV0000_CTRL_CMD_NVD_RCERR_RPT_FLAGS_POS_FIRST))
698                 {
699                     // yes save the idx as the first Idx, & note that we found the start of the range.
700                     start = pRecord->idx;
701                     foundStart = NV_TRUE;
702                 }
703                 // fall thru to check if the start of the report is also the end of the report.
704 
705             case NV_TRUE:
706                 // check if this is an end record.
707                 // we want the last record in the range to be an end record to insure that all the reports that are in the range are complete
708                 // (Note -- in the case of end records, this should only be an issue if we are interrupting the collection of a report)
709                 if (0 != (pRecord->flags & NV0000_CTRL_CMD_NVD_RCERR_RPT_FLAGS_POS_LAST))
710                 {
711                     // save the idx as the last idx & continue scanning until we have checked all the records.
712                     // the last idx saved will be the last idx.
713                     end = pRecord->idx;
714                     foundEnd = foundStart;
715                 }
716                 break;
717             }
718         }
719         // checking end is sufficient, because end can't be set w/o start being set first.
720         if (foundEnd)
721         {
722             // we found a complete range, mark us as succeeding.
723             status = NV_OK;
724 
725             // pass up the results.
726             if (NULL != pEnd)
727             {
728                 *pEnd = end;
729             }
730             if (NULL != pStart)
731             {
732                 *pStart = start;
733             }
734         }
735     }
736     else
737     {
738         // the buffer is currently busy.
739         status = NV_ERR_BUSY_RETRY;
740     }
741     portAtomicDecrementS32(&concurrentRingBufferAccess);
742     return status;
743 }
744 
745 RmRCCommonJournal_RECORD *
rcdbAddRcDiagRec_IMPL(OBJGPU * pGpu,Journal * pRcDB,RmRcDiag_RECORD * pRmDiagWrapBuffRec)746 rcdbAddRcDiagRec_IMPL
747 (
748     OBJGPU  *pGpu,
749     Journal *pRcDB,
750     RmRcDiag_RECORD       *pRmDiagWrapBuffRec
751 )
752 {
753     RmRCCommonJournal_RECORD *pCommon;
754     NvU32   usec;
755 
756     // Create Records, then write it.
757     pRmDiagWrapBuffRec->idx = (pRcDB->RcErrRptNextIdx)++;
758     if (MAX_RCDB_RCDIAG_ENTRIES < pRmDiagWrapBuffRec->count)
759     {
760         NV_ASSERT_FAILED("Diag report to large for buffer");
761         pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].offset = 0;
762         pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].tag = NV0000_CTRL_CMD_NVD_RCERR_RPT_REG_OVERFLOWED;
763         pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].value = pRmDiagWrapBuffRec->count - MAX_RCDB_RCDIAG_ENTRIES + 1;
764         pRmDiagWrapBuffRec->count = MAX_RCDB_RCDIAG_ENTRIES;
765     }
766     osGetCurrentTime(&(pRmDiagWrapBuffRec->timeStamp), &usec);
767 
768     pCommon = rcdbAddRecToRingBuffer(pGpu, pRcDB, RmRcDiagReport,
769                                      sizeof(RmRcDiag_RECORD), (NvU8 *)pRmDiagWrapBuffRec);
770 
771     pRcDB->RcErrRptRecordsDropped |= pRcDB->RcErrRptNextIdx >= MAX_RCDB_RCDIAG_WRAP_BUFF;
772     return pCommon;
773 }
774 
775 RmRCCommonJournal_RECORD *
rcdbAddRcDiagRecFromGsp_IMPL(OBJGPU * pGpu,Journal * pRcDB,RmRCCommonJournal_RECORD * pCommonGsp,RmRcDiag_RECORD * pRmDiagGsp)776 rcdbAddRcDiagRecFromGsp_IMPL
777 (
778     OBJGPU  *pGpu,
779     Journal *pRcDB,
780     RmRCCommonJournal_RECORD   *pCommonGsp,
781     RmRcDiag_RECORD            *pRmDiagGsp
782 )
783 {
784     RmRCCommonJournal_RECORD   *pCommonCpu;
785 
786     pCommonCpu = rcdbAddRcDiagRec(pGpu, pRcDB, pRmDiagGsp);
787     if (pCommonCpu)
788     {
789         NV_ASSERT(pCommonCpu->GPUTag == pCommonGsp->GPUTag);
790         pCommonCpu->stateMask |= pCommonGsp->stateMask;
791     }
792 
793     return pCommonCpu;
794 }
795 
796 NV_STATUS
_rcdbInternalGetRcDiagRec(Journal * pRcDB,NvU16 reqIdx,RmRCCommonJournal_RECORD ** ppRmDiagWrapBuffRec,NvU32 owner,NvU32 processId)797 _rcdbInternalGetRcDiagRec
798 (
799     Journal                    *pRcDB,
800     NvU16                       reqIdx,
801     RmRCCommonJournal_RECORD  **ppRmDiagWrapBuffRec,
802     NvU32                       owner,
803     NvU32                       processId
804 )
805 {
806     RmRCCommonJournal_RECORD   *pCommon;
807     RmRcDiag_RECORD*            pRecord = NULL;
808     NV_STATUS                   status = NV_ERR_INVALID_INDEX;
809     RING_BUFFER_LOG            *pRingBuffer = NULL;
810 
811     NvU32                       i;
812 
813     // assume we will fail.
814     *ppRmDiagWrapBuffRec = NULL;
815 
816     // Find the ring buffer for the diag reports
817     rcdbFindRingBufferForType(pRcDB, RmRcDiagReport, &pRingBuffer);
818 
819     // is the requested record in the buffer?
820     if ((NvU16)(pRcDB->RcErrRptNextIdx - reqIdx) <= pRingBuffer->numEntries)
821     {
822         // calculate the location of the record.
823         // find the record just past the last record in the buffer. to use as the initial offset.
824         i = pRingBuffer->headIndex + pRingBuffer->numEntries;
825 
826         // subtract off the diff between the next idx to be used & the requested idx.
827         i -= pRcDB->RcErrRptNextIdx - reqIdx;
828 
829         // wrap the offset to the size of the buffer.
830         i %= pRingBuffer->maxEntries;
831 
832         // get a pointer to the record from the buffer.
833         pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport) * i));
834         pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);
835 
836         // verify we have the record that was requested.
837         NV_ASSERT_OR_RETURN(pRecord->idx == reqIdx, NV_ERR_INVALID_INDEX);
838 
839         // we found the requested Index,
840         // check to see if the record qualifies
841         if (((RCDB_RCDIAG_DEFAULT_OWNER == owner) || (pRecord->owner == owner) || (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID == owner))
842             && ((NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID == processId) || (pRecord->processId == processId)))
843         {
844             // combination of ANY_OWNER_ID && ANY_PROCESS_ID is not valid
845             if (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID == owner && NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID == processId)
846             {
847                 status = NV_ERR_INSUFFICIENT_PERMISSIONS;
848                 goto exit;
849             }
850             // we found a record that fully qualifies
851             *ppRmDiagWrapBuffRec = pCommon;
852             status = NV_OK;
853         }
854         else
855         {
856             // we found the record, but it does not pass the filter.
857             status = NV_ERR_INSUFFICIENT_PERMISSIONS;
858         }
859     }
860 exit:
861     return status;
862 }
863 
864 NV_STATUS
rcdbGetRcDiagRec_IMPL(Journal * pRcDB,NvU16 reqIdx,RmRCCommonJournal_RECORD ** ppRmDiagWrapBuffRec,NvU32 owner,NvU32 processId)865 rcdbGetRcDiagRec_IMPL
866 (
867     Journal                    *pRcDB,
868     NvU16                       reqIdx,
869     RmRCCommonJournal_RECORD  **ppRmDiagWrapBuffRec,
870     NvU32                       owner,
871     NvU32                       processId
872 )
873 {
874     NV_STATUS                   status;
875 
876     if (ppRmDiagWrapBuffRec == NULL)
877     {
878         return NV_ERR_INVALID_ARGUMENT;
879     }
880 
881     *ppRmDiagWrapBuffRec = NULL;
882 
883     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
884     {
885         status = _rcdbInternalGetRcDiagRec(pRcDB, reqIdx, ppRmDiagWrapBuffRec, owner, processId);
886     }
887     else
888     {
889         status = NV_ERR_BUSY_RETRY;
890     }
891     portAtomicDecrementS32(&concurrentRingBufferAccess);
892     return status;
893 }
894 
895 //
896 //  The function to set context data for all the RmRcDiag_RECORDs in a specified range
897 //
898 NV_STATUS
rcdbUpdateRcDiagRecContext_IMPL(Journal * pRcDB,NvU16 rangeStartIdx,NvU16 rangeEndIdx,NvU32 processId,NvU32 owner)899 rcdbUpdateRcDiagRecContext_IMPL
900 (
901     Journal                    *pRcDB,
902     NvU16                       rangeStartIdx,
903     NvU16                       rangeEndIdx,
904     NvU32                       processId,
905     NvU32                       owner
906 )
907 {
908     RmRCCommonJournal_RECORD   *pCommon = NULL;
909     RmRcDiag_RECORD*            pRecord = NULL;
910     NV_STATUS                   status = NV_OK;
911     NV_STATUS                   recStatus = NV_ERR_OUT_OF_RANGE;
912 
913     NvU16                       i;
914 
915     // go from the start index thru the end index.
916     // note we use != because the indicies will wrap.
917     for (i = rangeStartIdx; i != (NvU16)(rangeEndIdx + 1U); i++)
918     {
919         recStatus = rcdbGetRcDiagRec(pRcDB, i, &pCommon, RCDB_RCDIAG_DEFAULT_OWNER, NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID);
920         if (NV_OK != recStatus)
921         {
922             // something went wrong,
923             // record the status & skip this record.
924             status = recStatus;
925             continue;
926         }
927         // get the pointer to the diag record.
928         pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);
929 
930         pRecord->owner = owner;
931         pRecord->processId = processId;
932     }
933     return status;
934 }
935 
936 //
937 // size must include NVCD_RECORD size too
938 //
rcdbAllocNextJournalRec_IMPL(Journal * pRcDB,NVCD_RECORD ** ppRec,NvU8 jGroup,NvU8 type,NvU16 size)939 NV_STATUS rcdbAllocNextJournalRec_IMPL(Journal *pRcDB, NVCD_RECORD** ppRec, NvU8 jGroup, NvU8 type, NvU16 size)
940 {
941     EVENT_JOURNAL *pJournal = &pRcDB->Journal;
942 
943     if ( ppRec == NULL )
944         return NV_ERR_GENERIC;
945 
946     if ( pJournal->pBuffer == NULL || pJournal->BufferSize == 0 )
947         return NV_ERR_GENERIC;
948 
949     if ( size == 0 || pJournal->BufferRemaining < size )
950     {
951         return NV_ERR_GENERIC;
952     }
953 
954     *ppRec = (NVCD_RECORD*)(pJournal->pFree);
955 
956     (*ppRec)->cRecordGroup = jGroup;
957     (*ppRec)->cRecordType = type;
958     (*ppRec)->wRecordSize = size;
959 
960     if ( pJournal->pCurrCollection )
961     {
962         pJournal->pCurrCollection->NumRecords++;
963         pJournal->pCurrCollection->Header.wRecordSize += size;
964     }
965     else
966     {
967         // standalone record (not part of collection) - increase total count
968         pJournal->RecordCount++;
969     }
970 
971     pJournal->pFree += size;
972     pJournal->BufferRemaining -= size;
973 
974     return NV_OK;
975 }
976 
rcdbClearErrorHistory_IMPL(Journal * pRcDB)977 NV_STATUS rcdbClearErrorHistory_IMPL(Journal *pRcDB)
978 {
979     SYS_ERROR_INFO         *pSysErrorInfo = &pRcDB->ErrorInfo;
980     RMFIFOERRORELEMENT_V3* pFifoErrorInfo;
981     RMFIFOERRORELEMENT_V3* pFreeErrorInfo;
982 
983     // Wait until any errors currently being reported are complete
984     while (!portAtomicCompareAndSwapU32(&pSysErrorInfo->InUse, 1, 0))
985     {
986         // We're not going to sleep, but safe to sleep also means safe to spin..
987         NV_ASSERT_OR_RETURN(portSyncExSafeToSleep(), NV_ERR_INVALID_STATE);
988         portUtilSpin();
989     }
990 
991     pFifoErrorInfo = (RMFIFOERRORELEMENT_V3*) pSysErrorInfo->pErrorList;
992     while (NULL != pFifoErrorInfo)
993     {
994         pFreeErrorInfo = pFifoErrorInfo;
995         pFifoErrorInfo = pFifoErrorInfo->ErrorHeader.pNextError;
996         rcdbDeleteErrorElement(pRcDB, pFreeErrorInfo);
997     }
998 
999     pSysErrorInfo->ErrorCount = 0x0;
1000     pSysErrorInfo->LogCount = 0x0;
1001     pSysErrorInfo->pErrorList = NULL;
1002 
1003     portAtomicSetU32(&pSysErrorInfo->InUse, 0);
1004     return NV_OK;
1005 }
1006 
1007 
rcdbDeleteErrorElement_IMPL(Journal * pRcDB,void * pDelete)1008 NV_STATUS rcdbDeleteErrorElement_IMPL(Journal *pRcDB, void *pDelete)
1009 {
1010     RMFIFOERRORELEMENT_V3* pFifoDelete = (RMFIFOERRORELEMENT_V3*)pDelete;
1011     RMCD_ERROR_BLOCK*              pErrorBlock;
1012     RMCD_ERROR_BLOCK*              pOldErrorBlock;
1013 
1014     // Free Additional Error Block
1015     for (pErrorBlock = pFifoDelete->ErrorHeader.pErrorBlock; pErrorBlock != NULL;)
1016     {
1017         pOldErrorBlock = pErrorBlock;
1018         pErrorBlock = pErrorBlock->pNext;
1019         portMemFree(pOldErrorBlock->pBlock);
1020         portMemFree(pOldErrorBlock);
1021     }
1022 
1023     // Free Error Collector
1024     portMemFree(pFifoDelete);
1025 
1026     return NV_OK;
1027 }
1028 
1029 // Frees up the all the ring buffers
rcdbDestroyRingBufferCollection_IMPL(Journal * pRcDB)1030 void rcdbDestroyRingBufferCollection_IMPL(Journal *pRcDB)
1031 {
1032     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
1033     NvU32 i;
1034     RING_BUFFER_LOG* pCurrentBuffer = pRingBufferColl->pFirstEntry;
1035 
1036     for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
1037     {
1038         RING_BUFFER_LOG* pTempCurrentBuffer = pCurrentBuffer;
1039 
1040         NV_ASSERT(pCurrentBuffer != NULL);
1041         NV_ASSERT(pCurrentBuffer->pBuffer != NULL);
1042 
1043         portMemFree(pCurrentBuffer->pBuffer);
1044 
1045         pCurrentBuffer = pCurrentBuffer->pNextRingBuffer;
1046 
1047         // Free the current ring buffer entry.
1048         portMemFree(pTempCurrentBuffer);
1049     }
1050 
1051     // pCurrentBuffer should be NULL if our accounting of NumEntries is correct
1052     NV_ASSERT(pCurrentBuffer == NULL);
1053 
1054     portMemSet(pRingBufferColl, 0x00, sizeof(*pRingBufferColl));
1055 }
1056 
1057 
1058 static NvU32 _rcdbInsertJournalRecordToList (RmRCCommonJournal_RECORD *pList, RmRCCommonJournal_RECORD *pRecord);
1059 static void _rcdbDumpCommonJournalRecord(PRB_ENCODER *pPrbEnc,const PRB_FIELD_DESC *pFieldDesc,PRmRCCommonJournal_RECORD pRec);
1060 
1061 /*!
1062  * @brief Initialize the GPU accessible flag
1063  *
1064  * @param[in] pGPU
1065  * @param[in] pRcDB
1066  *
1067  * @return NV_OK
1068  */
1069 NV_STATUS
rcdbDumpInitGpuAccessibleFlag_IMPL(OBJGPU * pGpu,Journal * pRcDB)1070 rcdbDumpInitGpuAccessibleFlag_IMPL
1071 (
1072     OBJGPU  *pGpu,
1073     Journal *pRcDB
1074 )
1075 {
1076     pRcDB->nvDumpState.bGpuAccessible =
1077         pRcDB->nvDumpState.bRMLock                                    &&
1078         !pGpu->bIsSOC                                                 &&
1079         !IS_VIRTUAL(pGpu)                                             &&
1080         gpuIsGpuFullPower(pGpu)                                       &&
1081         !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET)      &&
1082         !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET) &&
1083         !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET)           &&
1084         !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH)         &&
1085         !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST);
1086 
1087     // The GPU should be there... but make sure.
1088     if (pRcDB->nvDumpState.bGpuAccessible)
1089     {
1090         if (GPU_REG_RD32(pGpu, NV_PMC_BOOT_0) != pGpu->chipId0)
1091         {
1092             pRcDB->nvDumpState.bGpuAccessible = NV_FALSE;
1093         }
1094     }
1095 
1096     return NV_OK;
1097 }
1098 
1099 /*!
1100  * @brief Performs a dump of the specified system component into the given buffer.
1101  *
1102  * @param[in] pSys The system object
1103  * @param[in] component NVDUMP_IS_SYS_COMPONENT(component) must be true.
1104  * @param[in, out] pBuffer Buffer to populate with dump results
1105  * @param[in] policy Policy for buffer allocation: use this one, allocate one or count
1106  * @param[in, out] pBufferCallback Callback function for use with fixed-sized buffer encoding.
1107  *                                 If this is NULL then pBuffer->size is assumed to be large
1108  *                                 enough for the whole dump. Otherwise pBufferCallback is called
1109  *                                 when the buffer is full or when a message ends, allowing the
1110  *                                 the callback to construct the whole buffer piece by piece.
1111  *
1112  * @return NV_OK on success and specific error status on failure
1113  */
1114 NV_STATUS
rcdbDumpComponent_IMPL(OBJRCDB * pRcDB,NvU32 component,NVDUMP_BUFFER * pBuffer,NVDUMP_BUFFER_POLICY policy,PrbBufferCallback * pBufferCallback)1115 rcdbDumpComponent_IMPL
1116 (
1117     OBJRCDB *pRcDB,
1118     NvU32 component,
1119     NVDUMP_BUFFER *pBuffer,
1120     NVDUMP_BUFFER_POLICY policy,
1121     PrbBufferCallback *pBufferCallback
1122 )
1123 {
1124     NVD_STATE *pNvDumpState = &pRcDB->nvDumpState;
1125     void *pBuff;
1126     PRB_ENCODER encoder;
1127     NV_STATUS status = NV_OK;
1128     NvU8 startingDepth;
1129 
1130     // Validate arguments.
1131     NV_ASSERT_OR_RETURN(pBuffer != NULL, NV_ERR_INVALID_ARGUMENT);
1132 
1133     // Make sure we were not reentered.
1134     if (pNvDumpState->bDumpInProcess)
1135         return NV_ERR_STATE_IN_USE;
1136 
1137     // Initialize dump state.
1138     pNvDumpState->bDumpInProcess    = NV_TRUE;
1139     pNvDumpState->bugCheckCode      = 0;
1140     pNvDumpState->internalCode      = NVD_ERROR_CODE(NVD_EXTERNALLY_GENERATED, 0);
1141     pNvDumpState->bRMLock           = rmapiLockIsOwner();
1142     pNvDumpState->bGpuAccessible    = NV_FALSE;
1143     pNvDumpState->initialbufferSize = pBuffer->size;
1144     pNvDumpState->nvDumpType        = NVD_DUMP_TYPE_API;
1145 
1146     // Clear dump buffer.
1147     pBuffer->curNumBytes = 0;
1148 
1149     // Start encoding protobuf dump message.
1150     switch (policy)
1151     {
1152         case NVDUMP_BUFFER_PROVIDED:
1153             prbEncStart(&encoder, NVDEBUG_NVDUMP, NvP64_VALUE(pBuffer->address),
1154                         pBuffer->size, pBufferCallback);
1155             break;
1156         case NVDUMP_BUFFER_ALLOCATE:
1157             NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
1158                 prbEncStartAlloc(&encoder, NVDEBUG_NVDUMP,
1159                                  pBuffer->size, pBufferCallback));
1160             break;
1161         case NVDUMP_BUFFER_COUNT:
1162             prbEncStartCount(&encoder, NVDEBUG_NVDUMP, NVDUMP_MAX_DUMP_SIZE);
1163             break;
1164         default:
1165             return NV_ERR_INVALID_ARGUMENT;
1166     }
1167 
1168     startingDepth = prbEncNestingLevel(&encoder);
1169 
1170     switch (component)
1171     {
1172         case NVDUMP_COMPONENT_SYS_RCDB:
1173         {
1174             NV_CHECK_OK(status, LEVEL_ERROR,
1175                 rcdbDumpSystemFunc(pRcDB, &encoder, pNvDumpState));
1176             break;
1177         }
1178         case NVDUMP_COMPONENT_SYS_SYSINFO:
1179         {
1180             NV_CHECK_OK(status, LEVEL_ERROR,
1181                 rcdbDumpSystemInfo(pRcDB, &encoder, pNvDumpState));
1182             break;
1183         }
1184         case NVDUMP_COMPONENT_SYS_ALL:
1185         {
1186             NV_CHECK_OK(status, LEVEL_ERROR,
1187                 rcdbDumpSystemInfo(pRcDB, &encoder, pNvDumpState));
1188             NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
1189                 rcdbDumpSystemFunc(pRcDB, &encoder, pNvDumpState));
1190             break;
1191         }
1192         default:
1193         {
1194             NV_PRINTF(LEVEL_ERROR,
1195                       "called with invalid component %u selected.\n",
1196                       component);
1197             status = NV_ERR_INVALID_ARGUMENT;
1198             break;
1199         }
1200     }
1201 
1202     NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
1203         prbEncUnwindNesting(&encoder, startingDepth));
1204 
1205     {
1206         NvU32   gpu;
1207         OBJGPU *pGpu;
1208 
1209         for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++)
1210         {
1211             pGpu = gpumgrGetGpu(gpu);
1212 
1213             if ((pGpu != NULL) && IS_GSP_CLIENT(pGpu))
1214             {
1215                 NV_RM_RPC_DUMP_PROTOBUF_COMPONENT(pGpu, status, &encoder,
1216                     pNvDumpState, component);
1217 
1218                 NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
1219                     prbEncUnwindNesting(&encoder, startingDepth));
1220             }
1221         }
1222     }
1223 
1224     // Finish encoding protobuf dump message.
1225     pBuffer->curNumBytes = prbEncFinish(&encoder, &pBuff);
1226     pBuffer->address = NV_SIGN_EXT_PTR_TO_NvP64(pBuff);
1227     pNvDumpState->bDumpInProcess = NV_FALSE;
1228 
1229     return status;
1230 }
1231 
1232 static NV_STATUS
_rcdbGetTimeInfo(PRB_ENCODER * pPrbEnc,NVD_STATE * pNvDumpState,const PRB_FIELD_DESC * pFieldDesc)1233 _rcdbGetTimeInfo
1234 (
1235     PRB_ENCODER          *pPrbEnc,
1236     NVD_STATE            *pNvDumpState,
1237     const PRB_FIELD_DESC *pFieldDesc
1238 )
1239 {
1240     NvU64 timeSinceBoot;
1241     NvU32 sec;
1242     NvU32 usec;
1243     NV_STATUS nvStatus = NV_OK;
1244     NvU8 startingDepth = prbEncNestingLevel(pPrbEnc);
1245 
1246     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
1247         prbEncNestedStart(pPrbEnc, pFieldDesc));
1248 
1249     prbEncAddUInt64(pPrbEnc,
1250                     NVDEBUG_SYSTEMINFO_TIMEINFO_TIMESTAMP_FREQ,
1251                     osGetTimestampFreq());
1252 
1253     // Add Timestamp
1254     prbEncAddUInt64(pPrbEnc,
1255                     NVDEBUG_SYSTEMINFO_TIMEINFO_TIMESTAMP_DUMP,
1256                     osGetTimestamp());
1257     osGetCurrentTime(&sec, &usec);
1258     prbEncAddUInt64(pPrbEnc,
1259                     NVDEBUG_SYSTEMINFO_TIMEINFO_SYSTEM_TIME_DUMP,
1260                     (NvU64)sec * 1000000 + usec);
1261 
1262     // Add time since boot in seconds.
1263     osGetCurrentTick(&timeSinceBoot);
1264     prbEncAddUInt32(pPrbEnc,
1265                     NVDEBUG_SYSTEMINFO_TIMEINFO_TIME_SINCE_BOOT_SEC,
1266                     (NvU32)(timeSinceBoot / 1000000000ULL));
1267 
1268     // Unwind the protobuf to the correct depth.
1269     NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1270         prbEncUnwindNesting(pPrbEnc, startingDepth));
1271 
1272     return nvStatus;
1273 }
1274 
1275 static const char * GPU_NA_UUID = "N/A";
1276 
1277 NV_STATUS
rcdbDumpSystemInfo_IMPL(OBJRCDB * pRcDB,PRB_ENCODER * pPrbEnc,NVD_STATE * pNvDumpState)1278 rcdbDumpSystemInfo_IMPL
1279 (
1280     OBJRCDB *pRcDB,
1281     PRB_ENCODER *pPrbEnc,
1282     NVD_STATE   *pNvDumpState
1283 )
1284 {
1285     OBJGPU     *pGpu;
1286     NvU8       *pGidString;
1287     NvU32       gpu;
1288     NvU32       numGpus;
1289     NvU32       gidStrlen;
1290     NvU32       sizeStr;
1291     NV_STATUS   nvStatus = NV_OK;
1292     NvBool      bRelease;
1293     NvU8        startingDepth = prbEncNestingLevel(pPrbEnc);
1294 
1295     OBJSYS     *pSys = SYS_GET_INSTANCE();
1296     OBJCL      *pCl = SYS_GET_CL(pSys);
1297     OBJGPU     *pParent;
1298     NvU32       gpuIndex;
1299     NvU32       gpuMask;
1300     NvBool      bGpuDone[NV_MAX_DEVICES];
1301 
1302     // All of this stuff should run OK even without the RM lock.
1303     // No need to check pRcDB->nvDumpState.bNoRMLock;
1304 
1305     switch (DRF_VAL(_NVD, _ERROR_CODE, _MAJOR, pNvDumpState->internalCode))
1306     {
1307     case NVD_GPU_GENERATED:
1308     case NVD_SKIP_ZERO:
1309         // don't report on these internal codes.
1310         return NV_OK;
1311         break;
1312     }
1313 
1314     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
1315         prbEncNestedStart(pPrbEnc, NVDEBUG_NVDUMP_SYSTEM_INFO));
1316 
1317     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
1318         _rcdbGetTimeInfo(pPrbEnc, pNvDumpState, NVDEBUG_SYSTEMINFO_TIME_INFO),
1319         External_Cleanup);
1320 
1321     prbEncAddUInt32(pPrbEnc,
1322                     NVDEBUG_SYSTEMINFO_BUGCHECK_COUNT,
1323                     pRcDB->BugcheckCount);
1324 
1325     // Add NorthBridge Info
1326     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
1327         prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_NORTHBRIDGE_INFO),
1328         External_Cleanup);
1329 
1330     prbEncAddUInt32(pPrbEnc,
1331         NVDEBUG_SYSTEMINFO_NORTHBRIDGEINFO_ID,
1332         pCl->FHBBusInfo.vendorID |
1333         (pCl->FHBBusInfo.deviceID << 16));
1334 
1335     prbEncAddUInt32(pPrbEnc,
1336         NVDEBUG_SYSTEMINFO_NORTHBRIDGEINFO_SSID,
1337         pCl->FHBBusInfo.subvendorID |
1338         (pCl->FHBBusInfo.subdeviceID << 16));
1339 
1340     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_NORTHBRIDGE_INFO
1341         prbEncNestedEnd(pPrbEnc),
1342         External_Cleanup);
1343 
1344     //CPU Info
1345     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
1346         prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_CPU_INFO),
1347         External_Cleanup);
1348 
1349     prbEncAddUInt32(pPrbEnc,
1350         NVDEBUG_SYSTEMINFO_CPUINFO_CPU_TYPE,
1351         pSys->cpuInfo.type);
1352 
1353     prbEncAddUInt32(pPrbEnc,
1354         NVDEBUG_SYSTEMINFO_CPUINFO_CPU_CAPS,
1355         pSys->cpuInfo.caps);
1356 
1357     prbEncAddUInt32(pPrbEnc,
1358         NVDEBUG_SYSTEMINFO_CPUINFO_NUM_CPU_CORES,
1359         pSys->cpuInfo.numPhysicalCpus);
1360 
1361     prbEncAddUInt32(pPrbEnc,
1362         NVDEBUG_SYSTEMINFO_CPUINFO_NUM_LOGICAL_CPUS,
1363         pSys->cpuInfo.numLogicalCpus);
1364 
1365     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_CPU_INFO
1366         prbEncNestedEnd(pPrbEnc),
1367         External_Cleanup);
1368 
1369     //GPU Info
1370     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
1371         prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_GPU_INFO),
1372         External_Cleanup);
1373 
1374     // Count the number of GPUs and List the gpuIds
1375     numGpus = 0;
1376     for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++)
1377     {
1378         const NvU32 gidFlags =
1379             DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _FORMAT, _BINARY) |
1380             DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _TYPE, _SHA1);
1381 
1382         pGpu = gpumgrGetGpu(gpu);
1383 
1384         if (pGpu)
1385         {
1386             numGpus++;
1387 
1388             prbEncAddUInt32(pPrbEnc,
1389                 NVDEBUG_SYSTEMINFO_GPUINFO_GPU_ID,
1390                 pGpu->gpuId);
1391 
1392             nvStatus = gpuGetGidInfo(pGpu, &pGidString,
1393                 &gidStrlen, gidFlags);
1394             if (NV_OK == nvStatus)
1395             {
1396                 prbEncAddBytes(pPrbEnc,
1397                     NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
1398                     pGidString, gidStrlen);
1399                 portMemFree(pGidString);
1400             }
1401             else if (pGpu->gpuUuid.isInitialized)
1402             {
1403                 prbEncAddBytes(pPrbEnc,
1404                     NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
1405                     pGpu->gpuUuid.uuid, sizeof(pGpu->gpuUuid.uuid));
1406             }
1407             else
1408             {
1409                 prbEncAddString(pPrbEnc,
1410                     NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
1411                     GPU_NA_UUID);
1412             }
1413 
1414             prbEncAddUInt32(pPrbEnc,
1415                 NVDEBUG_SYSTEMINFO_GPUINFO_DEVICE_ID,
1416                 pGpu->idInfo.PCIDeviceID);
1417 
1418             prbEncAddUInt32(pPrbEnc,
1419                 NVDEBUG_SYSTEMINFO_GPUINFO_PMCBOOT0,
1420                 pGpu->chipId0);
1421 
1422             prbEncAddUInt32(pPrbEnc,
1423                 NVDEBUG_SYSTEMINFO_GPUINFO_SUBDEV_ID,
1424                 pGpu->idInfo.PCISubDeviceID);
1425         }
1426     }
1427 
1428     prbEncAddUInt32(pPrbEnc,
1429         NVDEBUG_SYSTEMINFO_GPUINFO_NUM_GPUS,
1430         numGpus);
1431 
1432     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_GPU_INFO
1433         prbEncNestedEnd(pPrbEnc),
1434         External_Cleanup);
1435 
1436     //OS Info
1437     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
1438         prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_OS_INFO),
1439         External_Cleanup);
1440 
1441     nvStatus = osGetVersionDump(pPrbEnc);
1442     if (nvStatus != NV_OK)
1443         goto External_Cleanup;
1444 
1445     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_OS_INFO
1446         prbEncNestedEnd(pPrbEnc),
1447         External_Cleanup);
1448 
1449     // Driver Info
1450     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
1451         prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_DRIVER_INFO),
1452         External_Cleanup);
1453 
1454     sizeStr = (sizeof("RELEASE") < sizeof(NV_DISPLAY_DRIVER_TITLE) ?
1455         sizeof("RELEASE") :
1456         sizeof(NV_DISPLAY_DRIVER_TITLE));
1457 
1458     if (portMemCmp(NV_DISPLAY_DRIVER_TITLE, "RELEASE", sizeStr) == 0)
1459         bRelease = NV_TRUE;
1460     else
1461         bRelease = NV_FALSE;
1462 
1463     prbEncAddBool(pPrbEnc,
1464         NVDEBUG_SYSTEMINFO_DRIVERINFO_IS_RELEASE,
1465         bRelease);
1466 
1467     prbEncAddString(pPrbEnc,
1468         NVDEBUG_SYSTEMINFO_DRIVERINFO_VERSION,
1469         NV_VERSION_STRING);
1470 
1471     prbEncAddString(pPrbEnc,
1472         NVDEBUG_SYSTEMINFO_DRIVERINFO_BRANCH,
1473         NV_BUILD_BRANCH_VERSION);
1474 
1475     prbEncAddUInt32(pPrbEnc,
1476         NVDEBUG_SYSTEMINFO_DRIVERINFO_CHANGELIST,
1477         NV_LAST_OFFICIAL_CHANGELIST_NUM);
1478 
1479     // Only write previous driver version if loaded more than once.
1480     if (pRcDB->driverLoadCount > 1)
1481     {
1482         if (pRcDB->previousDriverVersion != NULL)
1483         {
1484             prbEncAddString(pPrbEnc,
1485                 NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_VERSION,
1486                 pRcDB->previousDriverVersion);
1487         }
1488 
1489         if (pRcDB->previousDriverBranch != NULL)
1490         {
1491             prbEncAddString(pPrbEnc,
1492                 NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_BRANCH,
1493                 pRcDB->previousDriverBranch);
1494         }
1495 
1496         prbEncAddUInt32(pPrbEnc,
1497             NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_CHANGELIST,
1498             pRcDB->prevDriverChangelist);
1499     }
1500 
1501     prbEncAddUInt32(pPrbEnc,
1502         NVDEBUG_SYSTEMINFO_DRIVERINFO_LOAD_COUNT,
1503         pRcDB->driverLoadCount);
1504 
1505     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_DRIVER_INFO
1506         prbEncNestedEnd(pPrbEnc),
1507         External_Cleanup);
1508 
1509     // Dump an table of
1510     // Master GPU -- gpuId
1511     // List all gpus involved by gpuIds
1512     portMemSet(bGpuDone, NV_FALSE, sizeof(bGpuDone));
1513     for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++)
1514     {
1515         pGpu = gpumgrGetGpu(gpu);
1516 
1517         if ((pGpu) && (bGpuDone[gpu] == NV_FALSE))
1518         {
1519             pParent = gpumgrGetParentGPU(pGpu);
1520 
1521             NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
1522                 prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_GPU_CONFIG),
1523                 External_Cleanup);
1524 
1525             prbEncAddUInt32(pPrbEnc, NVDEBUG_SYSTEMINFO_CONFIG_MASTER_ID, pParent->gpuId);
1526             gpuMask = gpumgrGetGpuMask(pGpu);
1527             gpuIndex = 0;
1528             pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
1529             while (pGpu)
1530             {
1531                 prbEncAddUInt32(pPrbEnc, NVDEBUG_SYSTEMINFO_CONFIG_GPU_ID, pGpu->gpuId);
1532 
1533                 // gpuIndex is either the next or the MAX
1534                 bGpuDone[gpuIndex - 1] = NV_TRUE;
1535                 pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
1536             }
1537 
1538             NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_GPU_CONFIG
1539                 prbEncNestedEnd(pPrbEnc),
1540                 External_Cleanup);
1541         }
1542     }
1543 
1544     // Error state
1545     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
1546         prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_ERROR_STATE),
1547         External_Cleanup);
1548 
1549     prbEncAddUInt32(pPrbEnc,
1550         NVDEBUG_SYSTEMINFO_ERRORSTATE_BUGCHECK_CODE,
1551         pNvDumpState->bugCheckCode);
1552 
1553     prbEncAddBool(pPrbEnc,
1554         NVDEBUG_SYSTEMINFO_ERRORSTATE_GOT_RM_LOCK,
1555         pNvDumpState->bRMLock);
1556 
1557     prbEncAddUInt32(pPrbEnc,
1558         NVDEBUG_SYSTEMINFO_ERRORSTATE_DUMP_BUFFER_SIZE,
1559         pNvDumpState->initialbufferSize);
1560 
1561     //
1562     // prbEncNestedEnd for NVDEBUG_SYSTEMINFO_ERROR_STATE and
1563     // NVDEBUG_NVDUMP_SYSTEM_INFO are handled by prbEncUnwindNesting.
1564     //
1565 
1566 External_Cleanup:
1567     // Unwind the protobuf to the correct depth.
1568     NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(nvStatus, LEVEL_ERROR,
1569         prbEncUnwindNesting(pPrbEnc, startingDepth));
1570 
1571     return nvStatus;
1572 }
1573 
1574 //
1575 // Routine to dump RcDB Debug Info
1576 //
1577 NV_STATUS
rcdbDumpSystemFunc_IMPL(OBJRCDB * pRcDB,PRB_ENCODER * pPrbEnc,NVD_STATE * pNvDumpState)1578 rcdbDumpSystemFunc_IMPL
1579 (
1580     OBJRCDB *pRcDB,
1581     PRB_ENCODER *pPrbEnc,
1582     NVD_STATE *pNvDumpState
1583 )
1584 {
1585     OBJGPU  *pGpu = gpumgrGetSomeGpu();
1586 
1587     switch (DRF_VAL(_NVD, _ERROR_CODE, _MAJOR, pNvDumpState->internalCode))
1588     {
1589     case NVD_GPU_GENERATED:
1590     case NVD_SKIP_ZERO:
1591         // don't report on these internal codes.
1592         return NV_OK;
1593         break;
1594     }
1595 
1596     rcdbDumpJournal(pRcDB, pGpu, pPrbEnc, pNvDumpState, NVDEBUG_NVDUMP_DCL_MSG);
1597     if (pGpu != NULL)
1598     {
1599         rcdbDumpErrorCounters(pRcDB, pGpu, pPrbEnc);
1600     }
1601     else
1602     {
1603         NV_PRINTF(LEVEL_WARNING,
1604                   "no GPU - won't dump ring buffers or journal\n");
1605     }
1606 
1607     return NV_OK;
1608 }
1609 
1610 static NvU32
_rcdbInsertErrorHistoryToList(RmRCCommonJournal_RECORD * pList,NVD_STATE * pNvDumpState)1611 _rcdbInsertErrorHistoryToList(RmRCCommonJournal_RECORD   *pList, NVD_STATE *pNvDumpState)
1612 {
1613     OBJSYS                *pSys          = SYS_GET_INSTANCE();
1614     Journal               *pRcDB         = SYS_GET_RCDB(pSys);
1615     SYS_ERROR_INFO        *pSysErrorInfo = &pRcDB->ErrorInfo;
1616     RMPRBERRORELEMENT_V2*  pPrbErrorElement;
1617     RMCD_ERROR_BLOCK*      pErrorBlock;
1618     NV_STATUS              status = NV_OK;
1619 
1620     //
1621     // If we are called from the OCA dump, make sure we have the rm lock.
1622     // TO DO:  Try to dump as much as possible without the lock.
1623     //
1624     if (!pNvDumpState->bRMLock)
1625         return NV_OK;
1626 
1627     // Get Past Exceptions
1628     pPrbErrorElement = (RMPRBERRORELEMENT_V2*)pSysErrorInfo->pErrorList;
1629     while (NULL != pPrbErrorElement)
1630     {
1631         pErrorBlock = pPrbErrorElement->ErrorHeader.pErrorBlock;
1632         switch (pPrbErrorElement->RmPrbErrorData.common.Header.cRecordType)
1633         {
1634             case RmPrbErrorInfo_V2:
1635                 _rcdbInsertJournalRecordToList (pList, &(pPrbErrorElement->RmPrbErrorData.common));
1636                 break;
1637 
1638             case RmPrbFullDump_V2:
1639                 //
1640                 // Full crash dumps are a single NvDebug.NvDump message, and
1641                 // should be contained in a single block.
1642                 //
1643                 if (pErrorBlock != NULL)
1644                 {
1645                     if (pErrorBlock->pNext != NULL)
1646                     {
1647                         NV_PRINTF(LEVEL_WARNING,
1648                                   "only one error block expected!\n");
1649                     }
1650                     _rcdbInsertJournalRecordToList (pList, &(pPrbErrorElement->RmPrbErrorData.common));
1651                 }
1652                 break;
1653             default:
1654                 // Can only handle protobuf formatted messages
1655                 NV_PRINTF(LEVEL_ERROR, "unknown error element type: %d\n",
1656                           pPrbErrorElement->RmPrbErrorData.common.Header.cRecordType);
1657                 break;
1658         }
1659         pPrbErrorElement = (RMPRBERRORELEMENT_V2*)pPrbErrorElement->ErrorHeader.pNextError;
1660     }
1661     return status;
1662 }
1663 
1664 static void
_rcdbDumpCommonJournalRecord(PRB_ENCODER * pPrbEnc,const PRB_FIELD_DESC * pFieldDesc,RmRCCommonJournal_RECORD * pRec)1665 _rcdbDumpCommonJournalRecord
1666 (
1667     PRB_ENCODER               *pPrbEnc,
1668     const PRB_FIELD_DESC      *pFieldDesc,
1669     RmRCCommonJournal_RECORD  *pRec
1670 )
1671 {
1672     NV_STATUS nvStatus = NV_OK;
1673 
1674     NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1675         prbEncNestedStart(pPrbEnc, pFieldDesc));
1676 
1677     if (nvStatus == NV_OK)
1678     {
1679         if (pRec->timeStamp != 0)
1680             prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_TIME_STAMP, pRec->timeStamp);
1681         if (pRec->GPUTag != 0)
1682             prbEncAddUInt32(pPrbEnc, JOURNAL_COMMON_GPU_TAG,    pRec->GPUTag);
1683         if (pRec->CPUTag != 0)
1684             prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_CPU_TAG,    pRec->CPUTag);
1685         if (pRec->stateMask != 0)
1686             prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_STATE_MASK, pRec->stateMask);
1687         NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1688     }
1689 }
1690 
1691 static void
rcdbDumpCommonAssertRecord(PRB_ENCODER * pPrbEnc,NVD_STATE * pNvDumpState,RmRCCommonAssert_RECORD * pRec,NvU32 type)1692 rcdbDumpCommonAssertRecord
1693 (
1694     PRB_ENCODER              *pPrbEnc,
1695     NVD_STATE                *pNvDumpState,
1696     RmRCCommonAssert_RECORD  *pRec,
1697     NvU32                     type
1698 )
1699 {
1700     NvU32 i;
1701 
1702     prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_TYPE,                 type);
1703 
1704     if (pRec->lastTimeStamp != 0)
1705         prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_LAST_TIME_STAMP,  pRec->lastTimeStamp);
1706 
1707     prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_BREAKPOINT_ADDR_HINT, pRec->breakpointAddrHint);
1708 
1709     // if there is a line number, add it to the message.
1710     if (pRec->lineNum != NV_RM_ASSERT_UNKNOWN_LINE_NUM)
1711         prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_SOURCE_LINE, pRec->lineNum);
1712 
1713     if (pRec->count != 1)
1714         prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_COUNT,            pRec->count);
1715 
1716     for (i = 0; i < NV_ARRAY_ELEMENTS(pRec->callStack); i++)
1717     {
1718         if (pRec->callStack[i] == 0)
1719             break;
1720 
1721         prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_CALL_STACK, pRec->callStack[i]);
1722     }
1723 }
1724 
1725 static NV_STATUS
_rcdbDumpDclMsgRecord(PRB_ENCODER * pPrbEnc,NVD_STATE * pNvDumpState,const PRB_FIELD_DESC * pFieldDesc,RmRCCommonJournal_RECORD * pDclRecord)1726 _rcdbDumpDclMsgRecord(
1727     PRB_ENCODER *pPrbEnc,
1728     NVD_STATE *pNvDumpState,
1729     const PRB_FIELD_DESC *pFieldDesc,
1730     RmRCCommonJournal_RECORD *pDclRecord
1731     )
1732 {
1733     NV_STATUS nvStatus = NV_OK;
1734 
1735     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
1736         prbEncNestedStart(pPrbEnc, pFieldDesc));
1737 
1738     _rcdbDumpCommonJournalRecord(pPrbEnc, DCL_DCLMSG_COMMON, pDclRecord);
1739 
1740     switch (pDclRecord->Header.cRecordType)
1741     {
1742         case RmRC2SwDbgBreakpoint_V3:
1743         case RmRC2SwRmAssert_V3:
1744         {
1745             RmRC2SwRmAssert3_RECORD* pRecord = (RmRC2SwRmAssert3_RECORD*)pDclRecord;
1746 
1747             NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1748                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_ASSERT));
1749             if (nvStatus == NV_OK)
1750             {
1751                 rcdbDumpCommonAssertRecord(pPrbEnc, pNvDumpState,
1752                     &pRecord->commonAssert, pDclRecord->Header.cRecordType);
1753 
1754                 prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_LEVEL, pRecord->level);
1755                 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1756             }
1757             break;
1758         }
1759         case RmRC2GpuTimeout_V3:
1760         {
1761             RmRC2GpuTimeout3_RECORD* pRecord = (RmRC2GpuTimeout3_RECORD*)pDclRecord;
1762 
1763             NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1764                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_ASSERT));
1765             if (nvStatus == NV_OK)
1766             {
1767                 rcdbDumpCommonAssertRecord(pPrbEnc, pNvDumpState, pRecord, pDclRecord->Header.cRecordType);
1768                 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1769             }
1770             break;
1771         }
1772         case RmBadRead_V2:
1773         {
1774             RmRC2BadRead2_RECORD* pRecord = (RmRC2BadRead2_RECORD*)pDclRecord;
1775 
1776             NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1777                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_BADREAD));
1778             if (nvStatus == NV_OK)
1779             {
1780                 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_MEMORY_SPACE, pRecord->MemorySpace);
1781                 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_OFFSET, pRecord->Offset);
1782                 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_MASK, pRecord->Mask);
1783                 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_VALUE, pRecord->Value);
1784                 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_REASON, pRecord->Reason);
1785                 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1786             }
1787             break;
1788         }
1789         case RmDclMsg:
1790         {
1791             RM_DATA_COLLECTION_RECORD *pRecord = (RM_DATA_COLLECTION_RECORD*) pDclRecord;
1792             // Add the bytes after RM_DATA_COLLECTION_RECORD
1793             prbEncAddBytes(pPrbEnc, pRecord->fieldDesc, (void *) (pRecord + 1),
1794                 pRecord->common.Header.wRecordSize - sizeof(*pRecord));
1795             break;
1796         }
1797         case RmJournalEngDump:
1798         {
1799             RM_DATA_COLLECTION_RECORD *pRecord = (RM_DATA_COLLECTION_RECORD*) pDclRecord;
1800             // Add the bytes after RM_DATA_COLLECTION_RECORD
1801             prbEncCatMsg(pPrbEnc, (void *)(pRecord + 1),
1802                     pRecord->common.Header.wRecordSize - sizeof(*pRecord));
1803             break;
1804         }
1805         case RmJournalBugCheck:
1806         {
1807             RmJournalBugcheck_RECORD* pRecord = (RmJournalBugcheck_RECORD*)pDclRecord;
1808             NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1809                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_BUGCHECK));
1810             if (nvStatus == NV_OK)
1811             {
1812                 prbEncAddUInt32(pPrbEnc, JOURNAL_BUGCHECK_CODE, pRecord->bugCheckCode);
1813                 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1814             }
1815             break;
1816         }
1817         case RmPrbErrorInfo_V2:
1818         case RmPrbFullDump_V2:
1819         {
1820             RMPRBERRORELEMENT_V2*   pRecord = (RMPRBERRORELEMENT_V2*)((NvU8 *)pDclRecord
1821                                                 - NV_OFFSETOF(RMPRBERRORELEMENT_V2, RmPrbErrorData));
1822             RMCD_ERROR_BLOCK*       pErrorBlock;
1823 
1824             for (pErrorBlock = pRecord->ErrorHeader.pErrorBlock;
1825                 (pErrorBlock != NULL); pErrorBlock = pErrorBlock->pNext)
1826             {
1827                     prbEncCatMsg(pPrbEnc, (void *)pErrorBlock->pBlock,
1828                                     pErrorBlock->blockSize);
1829             }
1830             break;
1831         }
1832         case RmNocatReport:
1833         {
1834             // currently not added to the OCA dump
1835             break;
1836         }
1837 
1838         default:
1839             // These are the only ones we know about
1840             NV_PRINTF(LEVEL_ERROR,
1841                         "unknown Dcl Record entry type: %d\n",
1842                         pDclRecord->Header.cRecordType);
1843             break;
1844     }
1845 
1846     NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1847     return 0;
1848 }
1849 
1850 static NvU32
_rcdbInsertJournalRecordToList(RmRCCommonJournal_RECORD * pList,RmRCCommonJournal_RECORD * pRecord)1851 _rcdbInsertJournalRecordToList (RmRCCommonJournal_RECORD *pList, RmRCCommonJournal_RECORD *pRecord)
1852 {
1853     RmRCCommonJournal_RECORD *pCurrentRecord = pList;
1854     RmRCCommonJournal_RECORD *pNextRecord;
1855 
1856     if ((NULL != pList) && (NULL != pRecord))
1857     {
1858         for (pNextRecord = (RmRCCommonJournal_RECORD *)pList->pNext; pNextRecord != pList; pNextRecord = (RmRCCommonJournal_RECORD *)pNextRecord->pNext)
1859         {
1860             if (pRecord->timeStamp  < pNextRecord->timeStamp)
1861             {
1862                 break;
1863             }
1864             pCurrentRecord = pNextRecord;
1865         }
1866         pRecord->pNext = pCurrentRecord->pNext;
1867         pCurrentRecord->pNext = (NvU8 *)pRecord;
1868     }
1869     return 0;
1870 }
1871 
1872 // Todo: format the records into a protobuf DCL record at the source
1873 static NvU32
rcdbInsertRingBufferToList(Journal * pRcDB,RmRCCommonJournal_RECORD * pList,RING_BUFFER_LOG * pRingBuffer)1874 rcdbInsertRingBufferToList(
1875     Journal                    *pRcDB,
1876     RmRCCommonJournal_RECORD   *pList,
1877     RING_BUFFER_LOG            *pRingBuffer
1878 )
1879 {
1880     RmRCCommonJournal_RECORD *pCommon;
1881     NvU32 recordSize;
1882     NvU32 i;
1883 
1884     recordSize = rcdbGetOcaRecordSizeWithHeader(pRcDB, pRingBuffer->entryType);
1885 
1886     //
1887     // Order does not matter here because the record will be inserted into the
1888     // list based on the time of the record, not its postion in the buffer.
1889     //
1890     for (i = 0; i < pRingBuffer->numEntries; i++)
1891     {
1892         pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (recordSize * i));
1893 
1894         _rcdbInsertJournalRecordToList (pList, pCommon);
1895     }
1896 
1897     return 0; // return value should be discarded
1898 }
1899 
1900 static NvU32
rcdbInsertRingBufferCollectionToList(Journal * pRcDB,RmRCCommonJournal_RECORD * pList)1901 rcdbInsertRingBufferCollectionToList(
1902     Journal                    *pRcDB,
1903     RmRCCommonJournal_RECORD   *pList)
1904 {
1905     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
1906     RING_BUFFER_LOG *pCurrentBuffer;
1907     NvU32 i;
1908 
1909 
1910     pCurrentBuffer = pRingBufferColl->pFirstEntry;
1911     for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
1912     {
1913         NvU32 recSize = pCurrentBuffer->bufferSize;
1914 
1915         NV_ASSERT(pCurrentBuffer->maxEntries *
1916                   rcdbGetOcaRecordSizeWithHeader(pRcDB, pCurrentBuffer->entryType) ==
1917                   pCurrentBuffer->bufferSize);
1918 
1919         if (recSize > 0)
1920         {
1921             rcdbInsertRingBufferToList (pRcDB, pList, pCurrentBuffer);
1922         }
1923         pCurrentBuffer = pCurrentBuffer->pNextRingBuffer;
1924     }
1925 
1926     // Assert that we traversed through the entire list.
1927     NV_ASSERT(pCurrentBuffer == NULL);
1928 
1929     // return value should be ignored
1930     return 0;
1931 }
1932 
1933 NvU32
rcdbDumpJournal_IMPL(OBJRCDB * pRcDB,OBJGPU * pGpu,PRB_ENCODER * pPrbEnc,NVD_STATE * pNvDumpState,const PRB_FIELD_DESC * pFieldDesc)1934 rcdbDumpJournal_IMPL
1935 (
1936     OBJRCDB *pRcDB,
1937     OBJGPU *pGpu,
1938     PRB_ENCODER *pPrbEnc,
1939     NVD_STATE *pNvDumpState,
1940     const PRB_FIELD_DESC *pFieldDesc
1941 )
1942 {
1943     OS_DRIVER_BLOCK DriverBlock;
1944     EVENT_JOURNAL *pJournal = &pRcDB->Journal;
1945     NvU8 *pJournalBuff      = pJournal->pBuffer;
1946     RmRCCommonJournal_RECORD *pRecord;
1947     NvU32 recSize;
1948     NV_STATUS nvStatus = NV_OK;
1949     RmRCCommonJournal_RECORD List;
1950 
1951     // It is OK to dump the journal entries without the RM lock.
1952     // No need to check pRcDB->nvDumpState.bNoRMLock;
1953 
1954     recSize = pJournal->BufferSize - pJournal->BufferRemaining;
1955 
1956     if (NULL != pGpu)
1957     {
1958         //
1959         // Add RVA Header, even when there are no journal records.
1960         // This header is required to resolve code addresses using the PDB file.
1961         // We can log code addresses outside of the journal entries.
1962         //
1963         NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedStart(pPrbEnc, pFieldDesc));
1964         if (nvStatus == NV_OK)
1965         {
1966             NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1967                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_RVAHEADER));
1968             if (nvStatus == NV_OK)
1969             {
1970                 portMemSet(&DriverBlock, 0x00, sizeof(DriverBlock));
1971                 osGetDriverBlock(pGpu->pOsGpuInfo, &DriverBlock);
1972                 prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_DRIVER_START, (NvU64)DriverBlock.driverStart);
1973                 prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_OFFSET, DriverBlock.offset);
1974                 prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_POINTER_SIZE, sizeof(pJournal));
1975                 prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_UNIQUE_ID_HIGH, *((NvU64*) DriverBlock.unique_id));
1976                 prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_UNIQUE_ID_LOW, *((NvU64*) (DriverBlock.unique_id + 8)));
1977                 prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_AGE, DriverBlock.age);
1978                 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1979             }
1980             NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1981         }
1982     }
1983 
1984     // init the list to an empty state
1985     portMemSet(&List, 0x00, sizeof(List));
1986     List.pNext = (NvU8 *)&List;
1987 
1988     //
1989     // Don't dump the ring buffers if something is adding to them.
1990     // If we can dump the ring buffers, hold the lock for them until the
1991     // dump is complete to insure that a record is not changed mid-dump.
1992     //
1993     if (portAtomicIncrementS32(&concurrentRingBufferAccess) != 1)
1994     {
1995         //
1996         // If IRQL is low, spin until it gets available
1997         //
1998         if (!osIsRaisedIRQL() && (NULL != pGpu))
1999         {
2000             RMTIMEOUT         timeout;
2001             NV_STATUS         status = NV_OK;
2002             gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
2003             do {
2004                 portAtomicDecrementS32(&concurrentRingBufferAccess);
2005 
2006                 if (NV_ERR_TIMEOUT == status)
2007                 {
2008                     NV_PRINTF(LEVEL_ERROR,
2009                               "timed out waiting for Rm journal ring buffer to be available\n");
2010                     DBG_BREAKPOINT();
2011                     return 0;
2012                 }
2013                 status = gpuCheckTimeout(pGpu, &timeout);
2014                 osSpinLoop();
2015             } while (portAtomicIncrementS32(&concurrentRingBufferAccess) != 1);
2016         }
2017         else
2018         {
2019             NV_ASSERT_FAILED("Ring Buffer unavailable for dump at high irql.");
2020         }
2021     }
2022 
2023     rcdbInsertRingBufferCollectionToList (pRcDB, &List);
2024 
2025     _rcdbInsertErrorHistoryToList(&List, pNvDumpState);
2026 
2027     // Skip if size is smaller than a header
2028     while (recSize > sizeof(RmRCCommonJournal_RECORD))
2029     {
2030         pRecord = (RmRCCommonJournal_RECORD *)pJournalBuff;
2031 
2032         if (pRecord->Header.cRecordGroup != RmGroup)
2033         {
2034             // We only log RM related data
2035             NV_ASSERT(pRecord->Header.cRecordGroup == RmGroup);
2036             break;
2037         }
2038 
2039         // Just a safety net...
2040         if (pRecord->Header.wRecordSize > recSize)
2041         {
2042             break;
2043         }
2044         _rcdbInsertJournalRecordToList (&List, pRecord);
2045 
2046         recSize -= pRecord->Header.wRecordSize;
2047         pJournalBuff += pRecord->Header.wRecordSize;
2048     }
2049 
2050 
2051     // dump out the records that have been added to the list.
2052     for (pRecord = (RmRCCommonJournal_RECORD *)List.pNext; pRecord != &List; pRecord = (RmRCCommonJournal_RECORD *)pRecord->pNext)
2053     {
2054         _rcdbDumpDclMsgRecord(pPrbEnc, pNvDumpState, pFieldDesc, pRecord);
2055     }
2056     portAtomicDecrementS32(&concurrentRingBufferAccess);
2057 
2058     // return value should be ignored
2059     return 0;
2060 }
2061 
2062 NvU32
rcdbDumpErrorCounters_IMPL(Journal * pRcDB,OBJGPU * pGpu,PRB_ENCODER * pPrbEnc)2063 rcdbDumpErrorCounters_IMPL(Journal *pRcDB, OBJGPU *pGpu, PRB_ENCODER *pPrbEnc)
2064 {
2065     NvU32                   i;
2066     NvU32                   rcErrTyp = RC_ERROR_COUNTER_TYPE_INVALID;
2067     NV_STATUS               nvStatus = NV_OK;
2068     NvU8                    startingDepth = prbEncNestingLevel(pPrbEnc);
2069 
2070     // Opens NVDEBUG_NVDUMP_DCL_MSG
2071     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
2072         prbEncNestedStart(pPrbEnc, NVDEBUG_NVDUMP_DCL_MSG),
2073         cleanupAndExit);
2074 
2075     for (i = 0; i <= RC_ERROR_COUNTER_OTHER_INDEX; i++)
2076     {
2077         // For Counters
2078         rcErrTyp = pRcDB->rcErrorCounterArray[i].rcErrorType;
2079         if (rcErrTyp != RC_ERROR_COUNTER_TYPE_INVALID)
2080         {
2081             NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
2082                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_RCCOUNTER),
2083                 cleanupAndExit);
2084 
2085             // Write Power Event
2086             prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_RCERRORTYPE, rcErrTyp);
2087 
2088             // Write Power State
2089             prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_COUNT, pRcDB->rcErrorCounterArray[i].rcErrorCount);
2090 
2091             // Dump the channel ID and the last time when this error occurred on this channel ID
2092             prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_RCLASTCHID, pRcDB->rcErrorCounterArray[i].rcLastCHID);
2093             prbEncAddUInt64(pPrbEnc, RC_RCCOUNTER_RCLASTTIME, pRcDB->rcErrorCounterArray[i].rcLastTime);
2094 
2095             NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
2096                 prbEncNestedEnd(pPrbEnc),
2097                 cleanupAndExit);
2098         }
2099     } // For Counters
2100 
2101     // Close NVDEBUG_NVDUMP_DCL_MSG handled by prbEncUnwindNesting.
2102 
2103 cleanupAndExit:
2104     // Unwind the protobuff to inital depth
2105     NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(nvStatus, LEVEL_ERROR,
2106         prbEncUnwindNesting(pPrbEnc, startingDepth));
2107 
2108     return 0;
2109 }
2110 
2111 static void
_rcdbAddRmGpuDumpCallback(void * pData)2112 _rcdbAddRmGpuDumpCallback
2113 (
2114     void *pData
2115 )
2116 {
2117     OBJSYS *pSys = SYS_GET_INSTANCE();
2118     NV_STATUS status;
2119 
2120     NvU32 gpuInstance = *((NvU32 *)pData);
2121     status = osAcquireRmSema(pSys->pSema);
2122     if (status == NV_OK)
2123     {
2124         // LOCK: acquire API lock
2125         status = rmapiLockAcquire(API_LOCK_FLAGS_NONE, RM_LOCK_MODULES_DIAG);
2126         if (status == NV_OK)
2127         {
2128             // LOCK: acquire GPUs lock
2129             status = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE,
2130                                        RM_LOCK_MODULES_DIAG);
2131             if (status == NV_OK)
2132             {
2133                 Journal *pRcDB = SYS_GET_RCDB(pSys);
2134                 OBJGPU  *pGpu = gpumgrGetGpu(gpuInstance);
2135 
2136                 //
2137                 // Mark the Journal object as in the deferred dump path so we won't
2138                 // re-attempt again.
2139                 //
2140                 pRcDB->setProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH, NV_TRUE);
2141 
2142                 status = rcdbAddRmGpuDump(pGpu);
2143                 NV_ASSERT(status == NV_OK);
2144 
2145                 pRcDB->setProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH, NV_FALSE);
2146 
2147                 // UNLOCK: release GPUs lock
2148                 rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
2149             }
2150             else
2151             {
2152                 NV_PRINTF(LEVEL_ERROR, "failed to acquire the GPU locks!\n");
2153             }
2154             // UNLOCK: release API lock
2155             rmapiLockRelease();
2156         }
2157         else
2158         {
2159             NV_PRINTF(LEVEL_ERROR, "failed to acquire the API lock!\n");
2160         }
2161         osReleaseRmSema(pSys->pSema, NULL);
2162     }
2163     else
2164     {
2165         NV_PRINTF(LEVEL_ERROR, "failed to acquire the OS semaphore!\n");
2166     }
2167 }
2168 
2169 static NV_STATUS
nvdDebuggerBufferCallback(void * pEncoder,NvBool bBufferFull)2170 nvdDebuggerBufferCallback(void *pEncoder, NvBool bBufferFull)
2171 {
2172     if (bBufferFull)
2173     {
2174         nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_BUFFER_FULL;
2175     }
2176     else
2177     {
2178         nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_END_OF_MSG;
2179     }
2180 
2181     return NV_OK;
2182 }
2183 
2184 /*!
2185  * @brief NvDebug kernel debugger dump control
2186  *
2187  * Allows external kernel debuggers to control the RM's dump interface
2188  * without assuming anything about the current system state.
2189  *
2190  * WARNING! This function should never be called directly!
2191  *
2192  * If correctly setup, a kernel debugger will place a processor
2193  * hardware watchpoint on the nvDumpConfig.handshake variable.
2194  * Each time this is written to, the debugger will break and get a chance
2195  * to examine the rest of the nvDumpConfig state.
2196  *
2197  * @return This function should never return! External debugger should abort it!
2198  */
2199 static void
nvdDebuggerControlFunc(void)2200 nvdDebuggerControlFunc(void)
2201 {
2202     OBJSYS        *pSys = SYS_GET_INSTANCE();
2203     Journal       *pRcDB = SYS_GET_RCDB(pSys);
2204     OBJGPU        *pGpu = NULL;
2205     NvDebugDump   *pNvd = NULL;
2206     NVDUMP_BUFFER *pBuffer = (NVDUMP_BUFFER *)&nvDumpConfig.buffer; // discard volatile
2207 
2208     // Process actions while debugger provides work to do.
2209     while (nvDumpConfig.dumpStatus != NVDUMP_STATUS_IDLE)
2210     {
2211         nvDumpConfig.rmStatus = NV_OK;
2212 
2213         NV_PRINTF(LEVEL_INFO,
2214                   "Dump triggered: gpuSelect=%u, component=%u,  dumpStatus=%u\n",
2215                   nvDumpConfig.gpuSelect, nvDumpConfig.component,
2216                   nvDumpConfig.dumpStatus);
2217 
2218         if (NVDUMP_IS_GPU_COMPONENT(nvDumpConfig.component))
2219         {
2220             pGpu = gpumgrGetGpu(nvDumpConfig.gpuSelect);
2221             pNvd = GPU_GET_NVD(pGpu);
2222 
2223             switch (nvDumpConfig.dumpStatus)
2224             {
2225                 case NVDUMP_STATUS_COUNT_REQUESTED:
2226                     nvDumpConfig.rmStatus = nvdDumpComponent(
2227                         pGpu, pNvd, nvDumpConfig.component, pBuffer,
2228                         NVDUMP_BUFFER_COUNT, NULL);
2229                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_COUNT_COMPLETE;
2230                     break;
2231                 case NVDUMP_STATUS_DUMP_REQUESTED:
2232                     nvDumpConfig.rmStatus = nvdDumpComponent(
2233                         pGpu, pNvd, nvDumpConfig.component, pBuffer,
2234                         NVDUMP_BUFFER_PROVIDED, &nvdDebuggerBufferCallback);
2235                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_COMPLETE;
2236                     break;
2237                 default:
2238                     NV_PRINTF(LEVEL_ERROR, "Invalid dumpStatus %u\n",
2239                               nvDumpConfig.dumpStatus);
2240                     nvDumpConfig.rmStatus = NV_ERR_INVALID_STATE;
2241                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR;
2242                     break;
2243             }
2244         }
2245         else if (NVDUMP_IS_SYS_COMPONENT(nvDumpConfig.component))
2246         {
2247             switch (nvDumpConfig.dumpStatus)
2248             {
2249                 case NVDUMP_STATUS_COUNT_REQUESTED:
2250                     nvDumpConfig.rmStatus = rcdbDumpComponent(pRcDB,
2251                         nvDumpConfig.component, pBuffer,
2252                         NVDUMP_BUFFER_COUNT, NULL);
2253                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_COUNT_COMPLETE;
2254                     break;
2255                 case NVDUMP_STATUS_DUMP_REQUESTED:
2256                     nvDumpConfig.rmStatus = rcdbDumpComponent(pRcDB,
2257                         nvDumpConfig.component, pBuffer,
2258                         NVDUMP_BUFFER_PROVIDED, &nvdDebuggerBufferCallback);
2259                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_COMPLETE;
2260                     break;
2261                 default:
2262                     NV_PRINTF(LEVEL_ERROR, "Invalid dumpStatus %u\n",
2263                               nvDumpConfig.dumpStatus);
2264                     nvDumpConfig.rmStatus = NV_ERR_INVALID_STATE;
2265                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR;
2266 
2267                     break;
2268             }
2269         }
2270         else
2271         {
2272              NV_PRINTF(LEVEL_ERROR, "Invalid component %u\n",
2273                        nvDumpConfig.component);
2274              nvDumpConfig.rmStatus = NV_ERR_INVALID_PARAM_STRUCT;
2275              nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR;
2276         }
2277     }
2278 
2279     // Ensure we really don't exit this function without debugger.
2280     while (1)
2281     {
2282         NV_PRINTF(LEVEL_ERROR, "Should never reach this point!\n");
2283         DBG_BREAKPOINT();
2284     }
2285 }
2286 
2287 /*!
2288  * @brief   Release Build NV_ASSERT function
2289  *
2290  * @details Called by NV_ASSERT when the assertion fails.
2291  *          By putting this logic in its own function, we save on binary size.
2292  */
2293 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX) || RMCFG_FEATURE_PLATFORM_GSP) && !defined(NV_MODS)
static void _rcdbRmAssert(NvU32 level, NvU32 lineNum, NvU64 ip)
{
    RmRC2SwRmAssert3_RECORD *pAssertRec = NULL;

    // Journal the assert; the level is stored only if a record was obtained.
    if (NV_OK == rcdbAddAssertJournalRecWithLine(NULL, lineNum, (void **)&pAssertRec,
                                                 RmGroup, RmRC2SwRmAssert_V3,
                                                 sizeof(RmRC2SwRmAssert3_RECORD),
                                                 level, ip))
    {
        pAssertRec->level = level;
    }

#if !defined(DEBUG) && !defined(QA_BUILD)
    {
        OBJSYS *pSys = SYS_GET_INSTANCE();
        NvBool  bOutermost;

        //
        // nvLogRecursion guards against re-entry: the probe below only runs
        // for the outermost caller, which avoids a stack overflow when the
        // guarded path asserts itself.
        //
        bOutermost = (portAtomicIncrementS32(&nvLogRecursion) == 1);
        if (bOutermost)
        {
            // check for GPU lost.
            rcdProbeAllGpusPresent(ip);
        }
        portAtomicDecrementS32(&nvLogRecursion);

        // Break into the debugger when the debug flags request it for asserts.
        if (pSys != NULL)
        {
            if ((NV_DEBUG_BREAK_ATTRIBUTES_ASSERT) &
                DRF_VAL(_DEBUG, _BREAK, _ATTRIBUTES, pSys->debugFlags))
            {
                REL_DBG_BREAKPOINT_MSG("NVRM-RC: Nvidia Release NV_ASSERT Break\n");
            }
        }
    }

    // If enabled bugcheck on assert
    osDbgBugCheckOnAssert();

#endif
}
2328 
2329 //
2330 // Some param-less wrappers for rcdbXxxEx() functions.
2331 // If the params are not needed, calling these functions saves on binary size
2332 //
// Assert entry point without a status: records level 0.
void rcdbRmAssert(NvU32 LineNum, NvU64 ip)
{
    _rcdbRmAssert(0, LineNum, ip);
}
// Assert entry point with a status: the status is recorded as the level.
void rcdbRmAssertStatus(NvU32 status, NvU32 LineNum, NvU64 ip)
{
    _rcdbRmAssert(status, LineNum, ip);
}
2335 
2336 #endif // (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX) || RMCFG_FEATURE_PLATFORM_GSP) && !defined(NV_MODS)
2337 
2338 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
2339 
2340 /*!
2341  * @brief   Release Build DBGBREAKPOINT() function
2342  *
2343  * @details Called by DBGBREAKPOINT when the assertion fails.
2344  *          By putting this logic in its own function, we save on binary size.
2345  */
static void _rcdbDbgBreakEx(void *pGpu, NvU32 lineNum, NvU32 level, NvU64 ip)
{
    RmRC2SwRmAssert3_RECORD *pBreakRec = NULL;

    // Journal the breakpoint; the level is stored only if a record was obtained.
    if (NV_OK == rcdbAddAssertJournalRecWithLine(pGpu, lineNum, (void **)&pBreakRec,
                                                 RmGroup, RmRC2SwDbgBreakpoint_V3,
                                                 sizeof(RmRC2SwRmAssert3_RECORD),
                                                 level, ip))
    {
        pBreakRec->level = level;
    }

#if !defined(DEBUG) && !defined(QA_BUILD)
    {
        OBJSYS *pSys = SYS_GET_INSTANCE();
        NvBool  bOutermost;

        //
        // nvLogRecursion guards against re-entry: the message is only
        // printed for the outermost caller, which avoids a stack overflow
        // when the logging path itself hits a breakpoint.
        //
        bOutermost = (portAtomicIncrementS32(&nvLogRecursion) == 1);
        if (bOutermost)
        {
            NV_PRINTF(LEVEL_NOTICE, "Breakpoint at 0x%llx.\n", ip);
        }
        portAtomicDecrementS32(&nvLogRecursion);

        // Break into the debugger when the debug flags request it for breakpoints.
        if (pSys != NULL)
        {
            if ((NV_DEBUG_BREAK_ATTRIBUTES_DBG_BREAK) &
                DRF_VAL(_DEBUG, _BREAK, _ATTRIBUTES, pSys->debugFlags))
            {
                REL_DBG_BREAKPOINT_MSG("NVRM-RC: Nvidia Release Debug Break\n");
            }
        }
    }
#endif

    // If enabled bugcheck on assert
    osDbgBugCheckOnAssert();
}
2377 
// Breakpoint without GPU or status: NULL GPU, unknown line, level 0.
void rcdbDbgBreak(NvU64 ip)
{
    _rcdbDbgBreakEx(NULL, NV_RM_ASSERT_UNKNOWN_LINE_NUM, 0, ip);
}
// Breakpoint for a specific GPU: unknown line, level 0.
void rcdbDbgBreakGpu(void *pGpu, NvU64 ip)
{
    _rcdbDbgBreakEx(pGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, 0, ip);
}
// Breakpoint with a status but no GPU: the status is recorded as the level.
void rcdbDbgBreakStatus(NvU32 status, NvU64 ip)
{
    _rcdbDbgBreakEx(NULL, NV_RM_ASSERT_UNKNOWN_LINE_NUM, status, ip);
}
// Breakpoint with both a GPU and a status: the status is recorded as the level.
void rcdbDbgBreakEx(void *pGpu, NvU32 status, NvU64 ip)
{
    _rcdbDbgBreakEx(pGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, status, ip);
}
2382 
2383 #endif
2384 
2385 NV_STATUS
rcdbAddRmEngDump(OBJGPU * pGpu,NvU32 component)2386 rcdbAddRmEngDump
2387 (
2388     OBJGPU  *pGpu,
2389     NvU32 component
2390 )
2391 {
2392     OBJSYS          *pSys     = SYS_GET_INSTANCE();
2393     Journal         *pRcDB    = SYS_GET_RCDB(pSys);
2394     NvDebugDump     *pNvd     = GPU_GET_NVD(pGpu);
2395     NVDUMP_BUFFER    nvDumpBuffer = {0};
2396     RM_DATA_COLLECTION_RECORD *pRec;
2397     NV_STATUS        rmStatus;
2398     NvU16            totalSize;
2399 
2400     nvDumpBuffer.size = NVDUMP_MAX_DUMP_SIZE;
2401 
2402     rmStatus = nvdDumpComponent(pGpu, pNvd, component, &nvDumpBuffer,
2403                NVDUMP_BUFFER_ALLOCATE, NULL);
2404     if (rmStatus != NV_OK)
2405     {
2406         goto rcdbAddRmEngDump_error_handle;
2407     }
2408 
2409     totalSize = (NvU16)(nvDumpBuffer.curNumBytes + sizeof(*pRec));
2410     //align to 8 bytes to keep the readability of RM journal
2411     totalSize = (totalSize + 0x7) & ~0x7;
2412     // check for overflow
2413     if (((NvU32)totalSize) < nvDumpBuffer.curNumBytes + sizeof(*pRec))
2414     {
2415         goto rcdbAddRmEngDump_error_handle;
2416     }
2417 
2418     rmStatus = rcdbAllocNextJournalRec(pRcDB, (NVCD_RECORD **)&pRec, RmGroup,
2419                                        RmJournalEngDump, totalSize);
2420     if (rmStatus != NV_OK)
2421     {
2422         goto rcdbAddRmEngDump_error_handle;
2423     }
2424     rcdbSetCommonJournalRecord(pGpu, &pRec->common);
2425 
2426     // copy the dump buffer right after the RM_DATA_COLLECTION_RECORD struct
2427     portMemCopy((void *)(pRec + 1), nvDumpBuffer.curNumBytes, NvP64_VALUE(nvDumpBuffer.address), nvDumpBuffer.curNumBytes);
2428 
2429     pRec->fieldDesc = NVDEBUG_NVDUMP_GPU_INFO;
2430 
2431 rcdbAddRmEngDump_error_handle:
2432     if (nvDumpBuffer.address != NvP64_NULL)
2433     {
2434         portMemFree(NvP64_VALUE(nvDumpBuffer.address));
2435     }
2436 
2437     return rmStatus;
2438 }
2439 
2440 
2441 // Finds the ring buffer for a corresponding type. Returns error if not allocated.
2442 static void
rcdbFindRingBufferForType(Journal * pRcDB,RMCD_RECORD_TYPE recType,RING_BUFFER_LOG ** ppRingBuffer)2443 rcdbFindRingBufferForType
2444 (
2445     Journal *pRcDB,
2446     RMCD_RECORD_TYPE recType,
2447     RING_BUFFER_LOG **ppRingBuffer
2448 )
2449 {
2450     NvU32 i;
2451     RING_BUFFER_LOG *pCurrentRingBuffer = NULL;
2452     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
2453 
2454     NV_ASSERT(ppRingBuffer != NULL);
2455     *ppRingBuffer = NULL;
2456 
2457     //
2458     // Loop through our ring buffer collection, and find the
2459     // ring buffer corresponding to our type.
2460     //
2461     pCurrentRingBuffer = pRingBufferColl->pFirstEntry;
2462     for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
2463     {
2464         NV_ASSERT(pCurrentRingBuffer != NULL);
2465         if (pCurrentRingBuffer->entryType == recType)
2466         {
2467             *ppRingBuffer = pCurrentRingBuffer;
2468             return;
2469         }
2470         pCurrentRingBuffer = pCurrentRingBuffer->pNextRingBuffer;
2471     }
2472 
2473     NV_PRINTF(LEVEL_INFO, "Ring Buffer not found for type %d\n", recType);
2474     return;
2475 }
2476 
2477 //
2478 // Creates a ring buffer capable of holding "maxEntries" number of entries, and
2479 // adds it to the ring buffer collection.
2480 // Returns a pointer to the created ring buffer so that individual modules can
2481 // examine the data on-demand easily.
2482 //
2483 //PRINT_BUFFER_LOG
2484 NvU8 *
rcdbCreateRingBuffer_IMPL(Journal * pRcDB,RMCD_RECORD_TYPE type,NvU32 maxEntries)2485 rcdbCreateRingBuffer_IMPL
2486 (
2487     Journal *pRcDB,
2488     RMCD_RECORD_TYPE type,
2489     NvU32   maxEntries
2490 )
2491 {
2492     NV_STATUS status;
2493     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
2494     RING_BUFFER_LOG *pRingBuffer;
2495     NvU8*           pBuffer = NULL;
2496     NvU32 bufferSize, entrySize;
2497 
2498     rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer);
2499 
2500     entrySize = rcdbGetOcaRecordSizeWithHeader(pRcDB, type);
2501     if (entrySize == 0)
2502     {
2503         NV_ASSERT(entrySize != 0);
2504         return NULL;
2505     }
2506 
2507     // We need to store maxEntries number of entries. Check for overflow too
2508     if (portSafeMulU32(maxEntries, entrySize, &bufferSize) == NV_FALSE)
2509     {
2510         return NULL;
2511     }
2512 
2513     if (pRingBuffer != NULL)
2514     {
2515         NvU32 totalSize;
2516 
2517         if (portSafeAddU32(bufferSize, pRingBuffer->bufferSize, &totalSize) == NV_FALSE)
2518         {
2519             return NULL;
2520         }
2521 
2522         bufferSize = totalSize;
2523         pRingBuffer->refCount++;
2524 
2525         //
2526         // XXX The collect-all design of the ring buffers allows for
2527         // interleaved entries for different GPUs. This makes it
2528         // hard to dynamically shrink any given ring buffer as GPUs are
2529         // torn down, and requires that an upper bound be placed on
2530         // the buffer's size.
2531         //
2532         // The upper bound, as chosen, is somewhat arbitrary, but at
2533         // the time of this writing, consistent with the use of
2534         // this interface (i.e. the number of entries for each type is
2535         // the same for each GPU).
2536         //
2537         if (bufferSize > pRingBuffer->maxBufferSize)
2538              return NULL;
2539     }
2540     else
2541     {
2542         pRingBuffer = portMemAllocNonPaged(sizeof(RING_BUFFER_LOG));
2543         if (pRingBuffer == NULL)
2544         {
2545             status = NV_ERR_NO_MEMORY;
2546             NV_ASSERT(status == NV_OK);
2547             return NULL;
2548         }
2549 
2550         portMemSet(pRingBuffer, 0x00, sizeof(*pRingBuffer));
2551         pRingBuffer->refCount = 1;
2552     }
2553 
2554     pBuffer = portMemAllocNonPaged(bufferSize);
2555     if (pBuffer == NULL)
2556     {
2557         status = NV_ERR_NO_MEMORY;
2558         NV_ASSERT(status == NV_OK);
2559         pRingBuffer->refCount--;
2560         if (pRingBuffer->pBuffer == NULL)
2561         {
2562             portMemFree(pRingBuffer);
2563         }
2564         return NULL;
2565     }
2566 
2567     // Now, initialize the entries the RING_BUFFER structure.
2568     pRingBuffer->maxEntries += maxEntries;
2569 
2570     // Add the ring buffer to the beginning of the ring buffer collection.
2571     if (pRingBuffer->pBuffer == NULL)
2572     {
2573         if (portSafeMulU32(bufferSize, NV_MAX_DEVICES, &pRingBuffer->maxBufferSize) == NV_FALSE)
2574         {
2575             pRingBuffer->refCount--;
2576             if (pRingBuffer->pBuffer == NULL)
2577             {
2578                 portMemFree(pRingBuffer);
2579             }
2580 
2581             portMemFree(pBuffer);
2582             return NULL;
2583         }
2584 
2585         pRingBuffer->maxBufferSize = (bufferSize * NV_MAX_DEVICES);
2586         pRingBuffer->entryType = type;
2587         pRingBuffer->pNextRingBuffer = pRingBufferColl->pFirstEntry;
2588         pRingBufferColl->pFirstEntry = pRingBuffer;
2589         pRingBufferColl->NumRingBuffers++;
2590     }
2591     else
2592     {
2593         NvU32 copySize;
2594 
2595         if (portSafeSubU32(bufferSize, pRingBuffer->bufferSize, &copySize) == NV_FALSE)
2596         {
2597             pRingBuffer->refCount--;
2598             if (pRingBuffer->pBuffer == NULL)
2599             {
2600                 portMemFree(pRingBuffer);
2601             }
2602 
2603             portMemFree(pBuffer);
2604             return NULL;
2605         }
2606 
2607         portMemCopy(pBuffer, copySize, pRingBuffer->pBuffer, copySize);
2608         portMemFree(pRingBuffer->pBuffer);
2609     }
2610 
2611     pRingBuffer->bufferSize = bufferSize;
2612     pRingBuffer->pBuffer = pBuffer;
2613     return (NvU8 *)pRingBuffer;
2614 }
2615 
2616 void
rcdbDestroyRingBuffer_IMPL(Journal * pRcDB,RMCD_RECORD_TYPE type)2617 rcdbDestroyRingBuffer_IMPL
2618 (
2619     Journal *pRcDB,
2620     RMCD_RECORD_TYPE type
2621 )
2622 {
2623     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
2624     RING_BUFFER_LOG *pRingBuffer, *pCurrentRingBuffer;
2625     NvU32 i;
2626 
2627     rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer);
2628     if (pRingBuffer == NULL)
2629         return;
2630 
2631     if (--pRingBuffer->refCount > 0)
2632         return;
2633 
2634     pCurrentRingBuffer = pRingBufferColl->pFirstEntry;
2635     if (pCurrentRingBuffer == pRingBuffer)
2636     {
2637         pRingBufferColl->pFirstEntry = pCurrentRingBuffer->pNextRingBuffer;
2638     }
2639     else
2640     {
2641         for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
2642         {
2643             if (pCurrentRingBuffer->pNextRingBuffer == pRingBuffer)
2644             {
2645                 pCurrentRingBuffer->pNextRingBuffer =
2646                     pRingBuffer->pNextRingBuffer;
2647                 break;
2648             }
2649             pCurrentRingBuffer = pCurrentRingBuffer->pNextRingBuffer;
2650         }
2651     }
2652 
2653     portMemFree(pRingBuffer->pBuffer);
2654     portMemFree(pRingBuffer);
2655 
2656     pRingBufferColl->NumRingBuffers--;
2657 }
2658 
2659 /*
2660 ** _rcdbAllocRecFromRingBuffer allocates a buffer entry from the
2661 **  specified ring buffer.
2662 **
2663 **  parameters:
2664 **      pGpu            a pointer to the GPU object associated with the entry.
2665 **      pRcdb           a pointer toe the Journal that contains the ring buffers
2666 **      type            the record type to locate a buffer for.
2667 **      recordSize      the size of the expected record
2668 **
2669 **  notes:
2670 **      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
2671 **      failure to do so can result in concurrency issues.
2672 */
2673 RmRCCommonJournal_RECORD *
_rcdbAllocRecFromRingBuffer(OBJGPU * pGpu,Journal * pRcDB,RMCD_RECORD_TYPE type)2674 _rcdbAllocRecFromRingBuffer
2675 (
2676     OBJGPU             *pGpu,
2677     Journal            *pRcDB,
2678     RMCD_RECORD_TYPE    type
2679 )
2680 {
2681     RING_BUFFER_LOG    *pRingBuffer = NULL;
2682     NvU32               newItemIndex;
2683     RmRCCommonJournal_RECORD
2684                        *pCommon = NULL;
2685 
2686     // Find the ring buffer for this entry in the collection.
2687     rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer);
2688 
2689     if (pRingBuffer == NULL)
2690     {
2691         NV_ASSERT(0);
2692         //
2693         // There is no ring buffer allocated for this type.
2694         // Nothing we can do about it.
2695         //
2696         return NULL;
2697     }
2698 
2699     newItemIndex = (pRingBuffer->numEntries + pRingBuffer->headIndex) % pRingBuffer->maxEntries;
2700 
2701     // prepend the rmJournalCommon record to record.
2702     pCommon = (RmRCCommonJournal_RECORD*)(pRingBuffer->pBuffer + (rcdbGetOcaRecordSizeWithHeader(pRcDB, type) * newItemIndex));
2703     pCommon->Header.cRecordGroup = RmGroup;
2704     pCommon->Header.cRecordType = type;
2705     pCommon->Header.wRecordSize = (NvU16)rcdbGetOcaRecordSizeWithHeader(pRcDB, type);
2706     rcdbSetCommonJournalRecord(pGpu, pCommon);
2707 
2708     // Increment the number of entries or advance the head index.
2709     if (pRingBuffer->numEntries < pRingBuffer->maxEntries)
2710     {
2711         ++pRingBuffer->numEntries;
2712     }
2713     else
2714     {
2715         ++(pRingBuffer->headIndex);
2716         if (pRingBuffer->headIndex >= pRingBuffer->maxEntries)
2717         {
2718             pRingBuffer->headIndex = 0;
2719         }
2720     }
2721     return pCommon;
2722 }
2723 
2724 /*
2725 ** rcdbAddRecToRingBuffer_IMPL allocates a buffer entry from the
2726 **  specified ring buffer & copies the supplied data buffer into it.
2727 **
2728 **  parameters:
2729 **      pGpu            a pointer to the GPU object associated with the entry.
2730 **      pRcdb           a pointer toe the Journal that contains the ring buffers
2731 **      type            the record type to locate a buffer for.
2732 **      recordSize      the size of the expected record
2733 **      pRecord         a pointer to the data that will populate the new ring buffer entry.
2734 **
2735 **  notes:
2736 */
2737 RmRCCommonJournal_RECORD *
rcdbAddRecToRingBuffer_IMPL(OBJGPU * pGpu,Journal * pRcDB,RMCD_RECORD_TYPE type,NvU32 recordSize,NvU8 * pRecord)2738 rcdbAddRecToRingBuffer_IMPL
2739 (
2740     OBJGPU             *pGpu,
2741     Journal            *pRcDB,
2742     RMCD_RECORD_TYPE    type,
2743     NvU32               recordSize,
2744     NvU8               *pRecord
2745 )
2746 {
2747     RmRCCommonJournal_RECORD *pCommon = NULL;
2748 
2749     NV_ASSERT(recordSize == _rcdbGetOcaRecordSize(pRcDB, type));
2750 
2751     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
2752     {
2753         pCommon = _rcdbAllocRecFromRingBuffer(pGpu, pRcDB, type);
2754         if (pCommon != NULL)
2755         {
2756             // copy the record to follow the common header.
2757             portMemCopy(&(pCommon[1]), recordSize, pRecord, recordSize);
2758         }
2759     }
2760     portAtomicDecrementS32(&concurrentRingBufferAccess);
2761 
2762     return pCommon;
2763 }
2764 
_rcdbGetOcaRecordSize(Journal * pRcDB,RMCD_RECORD_TYPE type)2765 static NvU32 _rcdbGetOcaRecordSize(Journal *pRcDB, RMCD_RECORD_TYPE type)
2766 {
2767     switch(type)
2768     {
2769         case RmRcDiagReport:
2770             return sizeof(RmRcDiag_RECORD);
2771             break;
2772         case RmNocatReport:
2773             return sizeof(RM_NOCAT_JOURNAL_ENTRY);
2774             break;
2775         default:
2776             return 0;
2777     }
2778 }
2779 
rcdbGetOcaRecordSizeWithHeader_IMPL(Journal * pRcDB,RMCD_RECORD_TYPE type)2780 NvU32 rcdbGetOcaRecordSizeWithHeader_IMPL(Journal *pRcDB, RMCD_RECORD_TYPE type)
2781 {
2782     NvU32 recSz;
2783 
2784     recSz = _rcdbGetOcaRecordSize(pRcDB, type);
2785     if (0 < recSz)
2786     {
2787         recSz += sizeof(RmRCCommonJournal_RECORD);
2788     }
2789 
2790     //
2791     // On architecture like RISC-V, loads/stores need to be aligned to the
2792     // request size (1, 2, 4, 8-byte). Here, OCA record and header are stored
2793     // in a ring buffer, hence total recSz needs to be 8-byte aligned for both
2794     // producer (GSP RM) and consumer (CPU RM) of this data.
2795     //
2796     return NV_ALIGN_UP(recSz, 8);
2797 }
2798 
2799 NV_STATUS
rcdbAddRmGpuDump(OBJGPU * pGpu)2800 rcdbAddRmGpuDump
2801 (
2802     OBJGPU *pGpu
2803 )
2804 {
2805     NV_STATUS           status = NV_OK;
2806     OBJSYS             *pSys               = SYS_GET_INSTANCE();
2807     Journal            *pRcDB              = SYS_GET_RCDB(pSys);
2808     NvDebugDump        *pNvd               = GPU_GET_NVD(pGpu);
2809     NVD_STATE          *pNvDumpState       = &pRcDB->nvDumpState;
2810     SYS_ERROR_INFO     *pSysErrorInfo      = &pRcDB->ErrorInfo;
2811     RMPRBERRORELEMENT_V2 *pPrbErrorInfo    = NULL;
2812     RMPRBERRORELEMENT_V2 *pErrorList       = NULL;
2813     RMCD_ERROR_BLOCK   *pNewErrorBlock     = NULL;
2814     RMERRORHEADER      *pErrorHeader       = NULL;
2815     PRB_ENCODER         prbEnc;
2816     NvU32               bufferUsed;
2817     NvU8               *pBuf               = NULL;
2818 
2819     //
2820     // The deferred dump codepath will block out other dumps until the DPC can
2821     // be executed. If this is the deferred callback attempting to do the dump,
2822     // carry on.
2823     //
2824     if (pNvDumpState->bDumpInProcess &&
2825         !pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH))
2826     {
2827         return NV_ERR_STATE_IN_USE;
2828     }
2829 
2830     prbEnc.depth = 0;
2831     pNvDumpState->bDumpInProcess    = NV_TRUE;
2832     pNvDumpState->nvDumpType        = NVD_DUMP_TYPE_OCA;
2833     pNvDumpState->bRMLock           = rmapiLockIsOwner();
2834 
2835     rcdbDumpInitGpuAccessibleFlag(pGpu, pRcDB);
2836 
2837     //
2838     // General process:
2839     //  1. Start the protobuf encoder in ALLOCATE mode, and dump the data
2840     //  2. Allocate an error element to stick in the Journal list
2841     //  3. Add the protobuf dump to the error element
2842     //  4. Put the error element at the end of the error list on OBJRCDB
2843     //
2844     status = prbEncStartAlloc(&prbEnc, NVDEBUG_NVDUMP, NVDUMP_MAX_DUMP_SIZE,
2845                               NULL);
2846     if (status != NV_OK)
2847     {
2848         //
2849         // If we couldn't allocate the memory, it may be because we're at a
2850         // raised IRQL. It's not a great idea to be gathering a bunch of state
2851         // from the interrupt context anyway, so queue a work item to come back
2852         // later and try again.
2853         //
2854         NvU32 *pGpuInstance = NULL;
2855 
2856         //
2857         // If that's what we've already done and we're still failing, bail out
2858         // to avoid an infinite fail/queue-work-item loop.
2859         //
2860         if (pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH))
2861         {
2862             NV_PRINTF(LEVEL_ERROR,
2863                       "deferred GPU dump encoder init failed (status = 0x%x)\n",
2864                       status);
2865             goto done;
2866         }
2867 
2868         NV_PRINTF(LEVEL_INFO, "deferring GPU dump for normal context\n");
2869 
2870         //
2871         // This will be freed by the OS work item layer. We pass the GPU
2872         // instance as the data separately because if the GPU has fallen off
2873         // the bus, the OS layer may refuse to execute work items attached to
2874         // it. Instead, use the system work item interface and handle the GPU
2875         // ourselves.
2876         //
2877         pGpuInstance = portMemAllocNonPaged(sizeof(NvU32));
2878         if (pGpuInstance == NULL)
2879         {
2880             status = NV_ERR_NO_MEMORY;
2881             goto done;
2882         }
2883 
2884         *pGpuInstance = gpuGetInstance(pGpu);
2885         status = osQueueSystemWorkItem(_rcdbAddRmGpuDumpCallback,
2886                                        pGpuInstance);
2887         if (status != NV_OK)
2888         {
2889             portMemFree(pGpuInstance);
2890             goto done;
2891         }
2892 
2893         //
2894         // Since we've queued the work item, leave the dump state marked as in
2895         // use to prevent other interrupts and codepaths from attempting to
2896         // initiate the dump and/or queue a new work item.
2897         //
2898         return NV_WARN_MORE_PROCESSING_REQUIRED;
2899     }
2900 
2901     status = nvdDumpAllEngines(pGpu, pNvd, &prbEnc, pNvDumpState);
2902     if (status != NV_OK)
2903     {
2904         //
2905         // If the dump failed somewhere, unwind the encoder and then drop
2906         // through to finish it out so we can get the pointer to the
2907         // allocated buffer to free.
2908         //
2909         while (prbEnc.depth > 1)
2910         {
2911             prbEncNestedEnd(&prbEnc);
2912         }
2913     }
2914 
2915     bufferUsed = prbEncFinish(&prbEnc, (void **)&pBuf);
2916 
2917     if (status != NV_OK)
2918     {
2919         goto done;
2920     }
2921 
2922     // Allocate and initialize the error element
2923     pPrbErrorInfo = portMemAllocNonPaged(sizeof(RMPRBERRORELEMENT_V2));
2924     if (pPrbErrorInfo == NULL)
2925     {
2926         status = NV_ERR_NO_MEMORY;
2927         goto done;
2928     }
2929 
2930     portMemSet(pPrbErrorInfo, 0, sizeof(RMPRBERRORELEMENT_V2));
2931     pPrbErrorInfo->RmPrbErrorData.common.Header.cRecordGroup = RmGroup;
2932     pPrbErrorInfo->RmPrbErrorData.common.Header.cRecordType  = RmPrbFullDump_V2;
2933     pPrbErrorInfo->RmPrbErrorData.common.Header.wRecordSize  = sizeof(RMPRBERRORELEMENT_V2);
2934     rcdbSetCommonJournalRecord(pGpu, &(pPrbErrorInfo->RmPrbErrorData.common));
2935     pErrorHeader = &pPrbErrorInfo->ErrorHeader;
2936     pErrorHeader->pErrorBlock = NULL;
2937 
2938     //
2939     // Allocate and initialize the error "block" associated with this protobuf
2940     // dump
2941     //
2942     pNewErrorBlock = portMemAllocNonPaged(sizeof(RMCD_ERROR_BLOCK));
2943     if (pNewErrorBlock == NULL)
2944     {
2945         status = NV_ERR_NO_MEMORY;
2946         goto done;
2947     }
2948 
2949     portMemSet(pNewErrorBlock, 0, sizeof(RMCD_ERROR_BLOCK));
2950     pNewErrorBlock->pBlock = pBuf;
2951     pNewErrorBlock->blockSize = bufferUsed;
2952     pNewErrorBlock->pNext = NULL;
2953     pErrorHeader->pErrorBlock = pNewErrorBlock;
2954 
2955     // Add the error element to the Journal list
2956     if (pSysErrorInfo->pErrorList != NULL)
2957     {
2958         pErrorList = (RMPRBERRORELEMENT_V2*)pSysErrorInfo->pErrorList;
2959         while (pErrorList->ErrorHeader.pNextError != NULL)
2960         {
2961             pErrorList = (RMPRBERRORELEMENT_V2*)pErrorList->ErrorHeader.pNextError;
2962         }
2963 
2964         pErrorList->ErrorHeader.pNextError = (RMFIFOERRORELEMENT_V3*)pPrbErrorInfo;
2965     }
2966     else
2967     {
2968         pSysErrorInfo->pErrorList = pPrbErrorInfo;
2969     }
2970 
2971     pSysErrorInfo->ErrorCount++;
2972 
2973 done:
2974     if (status != NV_OK)
2975     {
2976         if (pBuf != NULL)
2977         {
2978             portMemFree(pPrbErrorInfo);
2979             portMemFree(pBuf);
2980         }
2981     }
2982 
2983     pNvDumpState->bDumpInProcess = NV_FALSE;
2984     return status;
2985 }
2986 
2987 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
2988 #if !defined(DEBUG) && !defined(QA_BUILD)
2989 /*
2990  */
2991 NvBool
rcdProbeGpuPresent(OBJGPU * pGpu,NvU64 ip)2992 rcdProbeGpuPresent(
2993     OBJGPU  *pGpu,
2994     NvU64    ip
2995 )
2996 {
2997     NvU32       testValue;
2998     NvBool      bFoundLostGpu = NV_FALSE;
2999 
3000     // protect against recursion when probing the GPU.
3001     if (portAtomicIncrementS32(&probeGpuRecursion) == 1)
3002     {
3003         if (NULL != pGpu)
3004         {
3005             // is the GPU we are checking allready reported lost?
3006             if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH) &&
3007                 !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
3008             {
3009                 testValue = GPU_CHECK_REG_RD32(pGpu, NV_PMC_BOOT_0, (~(pGpu->chipId0)));
3010                 if (testValue == GPU_REG_VALUE_INVALID)
3011                 {
3012                     // there shouldn't be a need to make a journal entry,
3013                     // as that should have been done by GPU_CHECK_REG_RD32
3014 
3015                     // Add GPU lost detection to to NvLog.
3016                     // But skip when nvLog asserts to avoid stack overflow.
3017 #if defined(DEBUG) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS))
3018                     if (portAtomicIncrementS32(&nvLogRecursion) == 1)
3019 #endif
3020                     {
3021                         NV_PRINTF(LEVEL_ERROR,
3022                                   "found GPU %d (0x%p) inaccessible After assert\n",
3023                                   pGpu->gpuInstance, pGpu);
3024                     }
3025 #if defined(DEBUG) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS))
3026                     portAtomicDecrementS32(&nvLogRecursion);
3027 #endif
3028                     bFoundLostGpu = NV_TRUE;
3029                 }
3030             }
3031         }
3032     }
3033     portAtomicDecrementS32(&probeGpuRecursion);
3034     return bFoundLostGpu;
3035 }
3036 
3037 NvBool
rcdProbeAllGpusPresent(NvU64 ip)3038 rcdProbeAllGpusPresent(
3039     NvU64   ip
3040 )
3041 {
3042     OBJSYS *pSys = SYS_GET_INSTANCE();
3043     NvBool  bFoundLostGpu = NV_FALSE;
3044     OBJGPU *pGpu;
3045     NvU32   gpuMask;
3046     NvU32   gpuIndex = 0;
3047 
3048     if (pSys->getProperty(pSys, PDB_PROP_SYS_DESTRUCTING))
3049     {
3050         return NV_FALSE;
3051     }
3052 
3053     gpumgrGetGpuAttachInfo(NULL, &gpuMask);
3054     pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
3055     while (pGpu)
3056     {
3057         bFoundLostGpu = bFoundLostGpu  || rcdProbeGpuPresent(pGpu, ip);
3058         pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
3059     }
3060     return bFoundLostGpu;
3061 }
3062 #endif // !defined(DEBUG) && !defined(QA_BUILD)
3063 #endif // (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
3064 
3065 void
rcdbAddCrashedFalcon(Falcon * pFlcn)3066 rcdbAddCrashedFalcon
3067 (
3068     Falcon *pFlcn
3069 )
3070 {
3071     OBJSYS *pSys = SYS_GET_INSTANCE();
3072     Journal *pRcDB = SYS_GET_RCDB(pSys);
3073 
3074     pRcDB->pCrashedFlcn = pFlcn;
3075 }
3076 
3077 
3078 /*
3079 ** _rcdbNocatCollectContext records the context of the GPU at the time the error is reported.
3080 **
3081 **  parameters:
3082 **      pGpu        pointer to GPU to be reported on.
3083 **      pContext    pointer to context structure to be filled in.
3084 **
3085 **   returns:
3086 **      NV_ERR_INVALID_ARGUMENT -- pContext is NULL
3087 */
3088 NV_STATUS
_rcdbNocatCollectContext(OBJGPU * pGpu,Journal * pRcdb,NV2080_NOCAT_JOURNAL_GPU_STATE * pContext)3089 _rcdbNocatCollectContext(OBJGPU *pGpu, Journal* pRcdb, NV2080_NOCAT_JOURNAL_GPU_STATE* pContext)
3090 {
3091     NV2080_NOCAT_JOURNAL_GPU_STATE* pContextCache = NULL;
3092     const char *pTag;
3093 
3094     if (pRcdb == NULL)
3095     {
3096         return NV_ERR_INVALID_ARGUMENT;
3097     }
3098 
3099     // determine which tag to use.
3100     if (pRcdb->nocatJournalDescriptor.tag[0] != '\0')
3101     {
3102         pTag = (char *)pRcdb->nocatJournalDescriptor.tag;
3103     }
3104     else
3105     {
3106         pTag = NOCAT_DEFAULT_TAG_VALUE_STR;
3107     }
3108     if (pGpu == NULL)
3109     {
3110         // w/o a GPU the only thing we can do is set the tag.
3111         if (pContext != NULL)
3112         {
3113             portMemSet(pContext, 0, sizeof(*pContext));
3114 
3115                 portStringCopy((char *)pContext->tag,
3116                     NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3117                     pTag,
3118                     portStringLength(pTag) + 1);
3119         }
3120         return NV_OK;
3121     }
3122 #if NOCAT_COLLECT_PERF
3123     pGpuCache = &(pGpu->nocatGpuCache);
3124 #endif
3125     pContextCache = &(pRcdb->nocatJournalDescriptor.nocatGpuState);
3126 
3127     // insert tag if we have one.
3128     portStringCopy((char *)pContextCache->tag,
3129         NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3130         pTag,
3131         portStringLength(pTag) + 1);
3132 
3133     if (!pContextCache->bValid)
3134     {
3135         pContextCache->deviceId = (NvU16)(DRF_VAL(_PCI, _DEVID, _DEVICE, pGpu->idInfo.PCIDeviceID));
3136         pContextCache->vendorId = (NvU16)(DRF_VAL(_PCI, _SUBID, _VENDOR, pGpu->idInfo.PCIDeviceID));
3137         pContextCache->subsystemVendor = (NvU16)(DRF_VAL(_PCI, _SUBID, _VENDOR, pGpu->idInfo.PCISubDeviceID));
3138         pContextCache->subsystemId = (NvU16)(DRF_VAL(_PCI, _SUBID, _DEVICE, pGpu->idInfo.PCISubDeviceID));
3139         pContextCache->revision = pGpu->idInfo.PCIRevisionID;
3140         pContextCache->type = pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_MOBILE);
3141         pContextCache->bMsHybrid = FLD_TEST_DRF(_JT_FUNC, _CAPS, _MSHYB_ENABLED, _TRUE,
3142             pGpu->acpiMethodData.jtMethodData.jtCaps);
3143 
3144         portStringCopy((char *)pContextCache->vbiosProject, NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3145             NOCAT_UNKNOWN_STR, portStringLength(NOCAT_UNKNOWN_STR) + 1);
3146 
3147         if (!osIsRaisedIRQL())
3148         {
3149             NV_STATUS status = pGpu->acpiMethodData.capsMethodData.status;
3150             if (status == NV_OK)
3151             {
3152                 pContextCache->bOptimus =
3153                     FLD_TEST_DRF(OP_FUNC, _OPTIMUSCAPS, _OPTIMUS_CAPABILITIES,
3154                         _DYNAMIC_POWER_CONTROL, pGpu->acpiMethodData.capsMethodData.optimusCaps);
3155             }
3156 
3157             pContextCache->bValid = NV_TRUE;
3158         }
3159     }
3160     if (pContext != NULL)
3161     {
3162         portMemSet(pContext, 0, sizeof(*pContext));
3163 
3164         *pContext = *pContextCache;
3165 
3166         pContext->bFullPower = gpuIsGpuFullPower(pGpu);
3167         pContext->bInGc6Reset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET);
3168         pContext->bInFullchipReset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET);
3169         pContext->bInSecBusReset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET);
3170     }
3171     return NV_OK;
3172 }
3173 
3174 /*
3175 **  _rcdbSetTdrReason translates the reason code to a string & puts that string
3176 **  in the provided buffer.
3177 **
3178 **  parameters:
3179 **      tdrReason       the reason code for the TDR
3180 **      pTdrReasonStr   pointer to the place to copy the reason string to
3181 **      maxLen          the size of the buffer pointed to in pTdrReasonStr.
3182 **
3183 */
_rcdbSetTdrReason(Journal * pRcdb,NvU32 tdrReason,char * pTdrReasonStr,NvU32 maxLen)3184 void _rcdbSetTdrReason
3185 (
3186     Journal            *pRcdb,
3187     NvU32               tdrReason,
3188     char               *pTdrReasonStr,
3189     NvU32               maxLen
3190 )
3191 {
3192     const char *pTmpStr;
3193 
3194     // validate inputs.
3195     if (pRcdb == NULL)
3196     {
3197         return;
3198     }
3199 
3200     // is there a string buffer & is it large enough to hold more than a NULL string
3201     if ((pTdrReasonStr == NULL) || (maxLen < 2))
3202     {
3203         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++;
3204         return;
3205     }
3206     switch (tdrReason)
3207     {
3208     case NV2080_CTRL_NOCAT_TDR_TYPE_NONE:
3209         pTmpStr = NOCAT_NA_STR;
3210         break;
3211     case NV2080_CTRL_NOCAT_TDR_TYPE_LEGACY:
3212         pTmpStr = NOCAT_LEGACY_STR;
3213         break;
3214     case NV2080_CTRL_NOCAT_TDR_TYPE_FULLCHIP:
3215         pTmpStr = NOCAT_FULLCHIP_TDR_STR;
3216         break;
3217     case NV2080_CTRL_NOCAT_TDR_TYPE_BUSRESET:
3218         pTmpStr = NOCAT_BUS_RESET_TDR_STR;
3219         break;
3220     case NV2080_CTRL_NOCAT_TDR_TYPE_GC6_RESET:
3221         pTmpStr = NOCAT_GC6_RESET_TDR_STR;
3222         break;
3223     case NV2080_CTRL_NOCAT_TDR_TYPE_SURPRISE_REMOVAL:
3224         pTmpStr = NOCAT_SURPRISE_REMOVAL_TDR_STR;
3225         break;
3226     case NV2080_CTRL_NOCAT_TDR_TYPE_UCODE_RESET:
3227         pTmpStr = NOCAT_UCODE_RESET_TDR_STR;
3228         break;
3229     default:
3230         pTmpStr = NOCAT_UNKNOWN_STR;
3231         break;
3232     }
3233     portStringCopy(pTdrReasonStr, maxLen,
3234         pTmpStr, portStringLength(pTmpStr) + 1);
3235 }
3236 
3237 /*
3238 ** _rcdbAllocNocatJournalRecord allocates a buffer entry from the Journal ring buffer
3239 **  for the specified type
3240 **
3241 **  parameters:
3242 **      pGpu            a pointer to the GPU object associated with the entry.
3243 **      pRcdb           a pointer toe the Journal that contains the ring buffers
3244 **      type            the record type to locate a buffer for.
3245 **
3246 **  returns a pointer to a record in the ring buffer, or NULL if a record could not be allocated.
3247 **
3248 **  notes:
3249 **      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3250 **      the lock should be held until access the buffer is completed.
3251 **      failure to do so can result in concurrency issues.
3252 **
3253 **      if successful, the buffer that is returned is cleared & an id assigned.
3254 */
_rcdbAllocNocatJournalRecord(OBJGPU * pGpu,OBJRCDB * pRcdb,RmRCCommonJournal_RECORD ** ppCommon)3255 RM_NOCAT_JOURNAL_ENTRY* _rcdbAllocNocatJournalRecord
3256 (
3257     OBJGPU     *pGpu,
3258     OBJRCDB    *pRcdb,
3259     RmRCCommonJournal_RECORD   **ppCommon
3260 )
3261 {
3262     nocatQueueDescriptor   *pDesc = NULL;
3263     RmRCCommonJournal_RECORD* pCommon;
3264     RM_NOCAT_JOURNAL_ENTRY * pNocatEntry = NULL;
3265 
3266     // make sure someone has the lock.
3267     if (concurrentRingBufferAccess == 0)
3268     {
3269         return NULL;
3270     }
3271 
3272     pDesc = &pRcdb->nocatJournalDescriptor;
3273 
3274     // Get the next record from the appropriate nocat ring buffer.
3275     pCommon = _rcdbAllocRecFromRingBuffer(
3276         pGpu,
3277         pRcdb,
3278         RmNocatReport);
3279 
3280     if (pCommon != NULL)
3281     {
3282         // advance the pointer past the common header.
3283         pNocatEntry = (RM_NOCAT_JOURNAL_ENTRY*)(((NvU8*)pCommon) + sizeof(RmRCCommonJournal_RECORD));
3284 
3285         // clear the record & assign an id.
3286         portMemSet(pNocatEntry, 0, sizeof(*pNocatEntry));
3287         pNocatEntry->id = pDesc->nextRecordId++;
3288         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_ALLOCATED_IDX]++;
3289         portAtomicIncrementS32(&pNocatEntry->inUse);
3290     }
3291     else
3292     {
3293         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_ALLOC_FAILED_IDX]++;
3294     }
3295     if (ppCommon != NULL)
3296     {
3297         *ppCommon = pCommon;
3298     }
3299     return pNocatEntry;
3300 }
3301 
3302 /*
3303 ** _rcdbGetNocatJournalRecord returns a pointer to the requested record,
3304 **      or optionally the oldest record if the requested one is not available.
3305 **
3306 **  parameters:
3307 **      pRcdb           a pointer toe the Journal that contains the ring buffers
3308 **      id              id of the record we are looking for
3309 **      bExactMatch     indicates if we want an exact match, or the closest record.
3310 **      ppCommon        a pointer to a pointer that will hold the pointer to
3311 **                      the common part of the record.
3312 **                      this can be NULL
3313 **      ppReturnedNocatEntry
3314 **                      a pointer to a pointer that will hold the pointer to
3315 **                      the nocat part of the record
3316 **                      this can be NULL
3317 **
3318 **  notes:
3319 **      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3320 **      the lock should be held until access the buffer is completed.
3321 **      failure to do so can result in concurrency issues.
3322 */
3323 NV_STATUS
_rcdbGetNocatJournalRecord(OBJRCDB * pRcdb,NvU32 reqId,NvBool bExactMatch,RmRCCommonJournal_RECORD ** ppReturnedCommon,RM_NOCAT_JOURNAL_ENTRY ** ppReturnedNocatEntry)3324 _rcdbGetNocatJournalRecord
3325 (
3326     OBJRCDB            *pRcdb,
3327     NvU32               reqId,
3328     NvBool              bExactMatch,
3329     RmRCCommonJournal_RECORD
3330                       **ppReturnedCommon,
3331     RM_NOCAT_JOURNAL_ENTRY
3332                       **ppReturnedNocatEntry
3333 )
3334 {
3335     nocatQueueDescriptor     *pDesc;
3336     RmRCCommonJournal_RECORD *pCommon = NULL;
3337     RM_NOCAT_JOURNAL_ENTRY   *pNocatEntry = NULL;
3338     RING_BUFFER_LOG          *pRingBuffer = NULL;
3339     NvS32                     offset;
3340     NvS32                     idx;
3341 
3342     // make sure someone has the lock.
3343     if (concurrentRingBufferAccess == 0)
3344     {
3345         return NV_ERR_BUSY_RETRY;
3346     }
3347 
3348     // is there anything to do
3349     if ((ppReturnedCommon == NULL) && (ppReturnedNocatEntry == NULL))
3350     {
3351         return NV_OK;
3352     }
3353 
3354     // validate inputs.
3355     if (pRcdb == NULL)
3356     {
3357         return NV_ERR_INVALID_ARGUMENT;
3358     }
3359     pDesc = &pRcdb->nocatJournalDescriptor;
3360 
3361     // assume we will fail
3362     if (ppReturnedCommon != NULL)
3363     {
3364         *ppReturnedCommon = NULL;
3365     }
3366     if (ppReturnedNocatEntry != NULL)
3367     {
3368         *ppReturnedNocatEntry = NULL;
3369     }
3370 
3371     // if there is nothing in the buffer,
3372     // we can't return a record.
3373     if ((pDesc->nextRecordId - pDesc->nextReportedId) == 0)
3374     {
3375         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NO_RECORDS_IDX]++;
3376         return NV_ERR_OBJECT_NOT_FOUND;
3377     }
3378 
3379     // Find the ring buffer for the diag reports
3380     rcdbFindRingBufferForType(pRcdb, RmNocatReport, &pRingBuffer);
3381     if (pRingBuffer == NULL)
3382     {
3383         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_BUFFER_IDX]++;
3384         return NV_ERR_OBJECT_NOT_FOUND;
3385     }
3386     // determine how far back from the head our record should be.
3387     offset = pDesc->nextRecordId - reqId;
3388 
3389     // start of from the next record we will replace.
3390     // this will be the oldest buffer in the record,
3391     // or the next empty record, either way, we will wrap to the right one
3392     idx = pRingBuffer->headIndex;
3393 
3394     // is the requested record in the buffer?
3395     if ((0 <= offset) && ((NvU16)offset <= pRingBuffer->numEntries))
3396     {
3397         // back out the offset from the newest/empty record.
3398         idx += pRingBuffer->numEntries - offset;
3399         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_MATCH_FOUND_IDX]++;
3400     }
3401     else if (bExactMatch)
3402     {
3403         // the record is not in the buffer, & we weren't asked for the closest match.
3404         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NO_MATCH_IDX]++;
3405         return NV_ERR_OBJECT_NOT_FOUND;
3406     }
3407     else
3408     {
3409         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_CLOSEST_FOUND_IDX]++;
3410     }
3411     // wrap the idx to the current size of the buffer.
3412     idx %= pRingBuffer->numEntries;
3413 
3414     // get a pointer to the common record & the record from the buffer.
3415     pCommon = (RmRCCommonJournal_RECORD*)(((NvU8*)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcdb, RmNocatReport) * idx));
3416 
3417     // get a pointer to the data that follows the common header, that is the record data.
3418     pNocatEntry = (RM_NOCAT_JOURNAL_ENTRY*)(((NvU8*)pCommon) + sizeof(RmRCCommonJournal_RECORD));
3419     portAtomicIncrementS32(&pNocatEntry->inUse);
3420 
3421     // pass the record along
3422     if (ppReturnedCommon != NULL)
3423     {
3424         *ppReturnedCommon = pCommon;
3425     }
3426     if (ppReturnedNocatEntry != NULL)
3427     {
3428         *ppReturnedNocatEntry = pNocatEntry;
3429     }
3430     return NV_OK;
3431 }
3432 /*
3433 ** _rcdbGetNocatJournalRecord returns a pointer to the requested record,
3434 **      or optionally the oldest record if the requested one is not available.
3435 **
3436 **  parameters:
3437 **      pRcdb           a pointer toe the Journal that contains the ring buffers
3438 **      id              id of the record we are looking for
3439 **      bExactMatch     indicates if we want an exact match, or the closest record.
3440 **      ppCommon        a pointer to a pointer that will hold the pointer to
3441 **                      the common part of the record.
3442 **                      this can be NULL
3443 **      ppReturnedNocatEntry
3444 **                      a pointer to a pointer that will hold the pointer to
3445 **                      the nocat part of the record
3446 **                      this can be NULL
3447 **
3448 **  notes:
3449 **      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3450 **      the lock should be held until access the buffer is completed.
3451 **      failure to do so can result in concurrency issues.
3452 */
3453 NV_STATUS
_rcdbReleaseNocatJournalRecord(RM_NOCAT_JOURNAL_ENTRY * pNocatEntry)3454 _rcdbReleaseNocatJournalRecord
3455 (
3456     RM_NOCAT_JOURNAL_ENTRY  *pNocatEntry
3457 )
3458 {
3459     if (pNocatEntry == NULL)
3460     {
3461         return NV_ERR_INVALID_ARGUMENT;
3462     }
3463     if (portAtomicDecrementS32(&pNocatEntry->inUse) != 0)
3464     {
3465         return NV_ERR_BUSY_RETRY;
3466     }
3467     return NV_OK;
3468 }
3469 
3470 /*
3471 ** _rcdbGetNewestNocatJournalRecordForType returns a pointer to the newest record for the
3472 **  specified type if there is one.
3473 **
3474 **  parameters:
3475 **      pRcdb           a pointer toe the Journal that contains the ring buffers
3476 **      type            type of record we want.
3477 **      ppCommon        a pointer to a pointer that will hold the pointer to
3478 **                      the common part of the record.
3479 **                      this can be NULL
3480 **      ppCommon        a pointer to a pointer that will hold the pointer to
3481 **                      the nocat part of the record
3482 **                      this can be NULL
3483 **
3484 **  notes:
3485 **      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3486 **      the lock should be held until access the buffer is completed.
3487 **      failure to do so can result in concurrency issues.
3488 */
3489 NV_STATUS
_rcdbGetNewestNocatJournalRecordForType(OBJRCDB * pRcdb,NvU32 type,RmRCCommonJournal_RECORD ** ppReturnedCommon,RM_NOCAT_JOURNAL_ENTRY ** ppReturnedNocatEntry)3490 _rcdbGetNewestNocatJournalRecordForType
3491 (
3492     OBJRCDB            *pRcdb,
3493     NvU32               type,
3494     RmRCCommonJournal_RECORD
3495                       **ppReturnedCommon,
3496     RM_NOCAT_JOURNAL_ENTRY
3497                       **ppReturnedNocatEntry
3498 )
3499 {
3500     if (type >= NV2080_NOCAT_JOURNAL_REC_TYPE_COUNT)
3501     {
3502         // we failed
3503         if (ppReturnedCommon != NULL)
3504         {
3505             *ppReturnedCommon = NULL;
3506         }
3507         if (ppReturnedNocatEntry != NULL)
3508         {
3509             *ppReturnedNocatEntry = NULL;
3510         }
3511         return NV_ERR_OBJECT_NOT_FOUND;
3512     }
3513     return _rcdbGetNocatJournalRecord(pRcdb, pRcdb->nocatJournalDescriptor.lastRecordId[type], NV_TRUE,
3514         ppReturnedCommon, ppReturnedNocatEntry);
3515 }
3516 
3517 /*
3518 ** rcdbReportNextNocatJournalEntry fills in the provided Nocat Journal record with the next record
3519 ** to be reported, then updates the last reported id.
3520 **
3521 **  parameters:
3522 **      pReturnedNocatEntry a pointer to the buffer where the journal record will be transferred to
3523 **
3524 **  returns:
3525 **      NV_OK -- the record was successfully updated with the next record to report.
3526 **      NV_ERR_INVALID_ARGUMENT -- the provided pointer is NULL
3527 **      NV_ERR_OBJECT_NOT_FOUND -- we could not locate a record to report.
3528 **
3529 **  notes:
3530 **      we are transferring the record to the target location here instead of passing a pointer
3531 **      to insure the data is transferred while we hold the concurrentRingBufferAccess lock.
3532 **      failure to do so can result in concurrency issues.
3533 **
3534 **      priority is determined by the record journal queue values. the lower value has
3535 **      higher priority.
3536 **
3537 **      now that we have moved from a single entry, to a queue, we need to
3538 **      consume the entry once we report it
3539 **
3540 */
3541 NV_STATUS
rcdbReportNextNocatJournalEntry(NV2080_NOCAT_JOURNAL_RECORD * pReturnedNocatEntry)3542 rcdbReportNextNocatJournalEntry
3543 (
3544     NV2080_NOCAT_JOURNAL_RECORD
3545                        *pReturnedNocatEntry
3546 )
3547 {
3548     OBJSYS                   *pSys = SYS_GET_INSTANCE();
3549     Journal                  *pRcdb = SYS_GET_RCDB(pSys);
3550     NV_STATUS                 status = NV_ERR_OBJECT_NOT_FOUND;
3551     nocatQueueDescriptor     *pDesc;
3552     RmRCCommonJournal_RECORD *pCommon = NULL;
3553     RM_NOCAT_JOURNAL_ENTRY   *pNocatEntry = NULL;
3554 
3555     // validate inputs.
3556     if (pRcdb == NULL)
3557     {
3558         return NV_ERR_INVALID_ARGUMENT;
3559     }
3560     pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_REQUESTED_IDX]++;
3561 
3562     if (pReturnedNocatEntry == NULL)
3563     {
3564         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++;
3565         return NV_ERR_INVALID_ARGUMENT;
3566     }
3567     portMemSet(pReturnedNocatEntry, 0, sizeof(*pReturnedNocatEntry));
3568 
3569     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3570     {
3571         pDesc = &pRcdb->nocatJournalDescriptor;
3572         _rcdbGetNocatJournalRecord(pRcdb,
3573             pDesc->nextReportedId, NV_FALSE,
3574             &pCommon, &pNocatEntry);
3575         if ((pCommon != NULL) && (pNocatEntry != NULL))
3576         {
3577             // we have a record, push it into the return buffer
3578             pReturnedNocatEntry->GPUTag = pCommon->GPUTag;
3579 
3580             // copy over the data into the supplied buffer.
3581             pReturnedNocatEntry->loadAddress = pDesc->loadAddress;
3582             pReturnedNocatEntry->timeStamp = pCommon->timeStamp;
3583             pReturnedNocatEntry->stateMask = pCommon->stateMask;
3584             pReturnedNocatEntry->nocatGpuState = pNocatEntry->nocatGpuState;
3585             pReturnedNocatEntry->nocatJournalEntry = pNocatEntry->nocatJournalEntry;
3586 
3587             // check if we lost any records.
3588             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_DROPPED_IDX] +=
3589                 pNocatEntry->id - pDesc->nextReportedId;
3590 
3591             // update the NocatJournalNextReportedId
3592             pDesc->nextReportedId = pNocatEntry->id + 1;
3593             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_REPORTED_IDX]++;
3594 
3595             _rcdbReleaseNocatJournalRecord(pNocatEntry);
3596             status = NV_OK;
3597 
3598         }
3599     }
3600     else
3601     {
3602         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
3603         status = NV_ERR_BUSY_RETRY;
3604     }
3605     portAtomicDecrementS32(&concurrentRingBufferAccess);
3606     if ((pRcdb->nocatJournalDescriptor.lockTimestamp != 0) && (rcdbGetNocatOutstandingCount(pRcdb) == 0))
3607     {
3608         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_UNLOCKED_IDX]++;
3609         pRcdb->nocatJournalDescriptor.lockTimestamp = 0;
3610     }
3611     return status;
3612 }
3613 
3614 /*
3615 ** rcdbGetNocatOutstandingCount returns the number of NOCAT events that have
3616 ** been recorded since the last reported record.
3617 **
3618 **  parameters:
3619 **      pRcdb -- a pointer to the Journal object.
3620 **
3621 **  returns:
3622 **      number of NOCAT events that have been recorded since the last reported record.
3623 **      or NV_U32_MAX if a NULL journal object pointer is provided.
3624 **
3625 **  notes:
3626 **      the returned count includes records that have been dropped due to wrapping.
3627 **
3628 */
3629 NvU32
rcdbGetNocatOutstandingCount(Journal * pRcdb)3630 rcdbGetNocatOutstandingCount(Journal *pRcdb)
3631 {
3632     NvU32 count = NV_U32_MAX;
3633     if (pRcdb != NULL)
3634     {
3635         if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3636         {
3637             count = pRcdb->nocatJournalDescriptor.nextRecordId
3638                 - pRcdb->nocatJournalDescriptor.nextReportedId;
3639         }
3640         portAtomicDecrementS32(&concurrentRingBufferAccess);
3641     }
3642     return count;
3643 }
3644 
3645 /*
3646 ** _rcdbSendNocatJournalNotification sends an ETW Notification that a NOCAT Journal record has been posted.
3647 **
3648 **  parameters:
3649 **      pGpu -- a pointer to the GPU object associated with teh new entry
3650 **              (may be NULL)
3651 **      pRcdb -- a pointer to the Journal object NOCAT is using.
3652 **      posted -- the number of records posted since the last record that was retrieved.
3653 **      pCommon -- a pointer to the common record header associated with the record.
3654 **      type -- the record type
3655 **
3656 **  returns:
3657 **      NV_OK -- the call to post the record was made.
3658 **          note that the call to post the record does not return a status,
3659 **          so we do not know if the call was successful.
3660 **      NV_ERR_INVALID_ARGUMENT -- one of the required pointers is NULL
3661 **
3662 */
3663 NV_STATUS
_rcdbSendNocatJournalNotification(OBJGPU * pGpu,Journal * pRcdb,NvU32 posted,RmRCCommonJournal_RECORD * pCommon,NvU32 type)3664 _rcdbSendNocatJournalNotification
3665 (
3666     OBJGPU *pGpu,
3667     Journal *pRcdb,
3668     NvU32    posted,
3669     RmRCCommonJournal_RECORD *pCommon,      // todo: pass in timestamp instead of common.
3670     NvU32 type
3671 )
3672 {
3673     if ((pCommon == NULL) || (pRcdb == NULL))
3674     {
3675         return NV_ERR_INVALID_ARGUMENT;
3676     }
3677     RMTRACE_NOCAT(_REPORT_PENDING, (pGpu ? pGpu->gpuId : RMTRACE_UNKNOWN_GPUID),
3678         RmNocatReport,
3679         posted,
3680         type,
3681         rcdbGetNocatOutstandingCount(pRcdb),
3682         pCommon->timeStamp);
3683 
3684     // count the number of notifications.
3685     pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NOTIFICATIONS_IDX]++;
3686     return NV_OK;
3687 }
3688 
3689 /*
3690 ** rcdbInitNocatGpuCache_IMPL initializes a per GPU cache held in the GPU object to be used by NOCAT
3691 **
3692 **  parameters:
3693 **      pGpu -- a pointer to the GPU Object the containing the cache
3694 **
3695 **  notes:
3696 **      this function:
3697 **      * caches the driver load address
3698 **      * allocates a small block of memory in the frame buffer for testing
3699 **      * initializes the GPU context cache
3700 **
3701 */
rcdbInitNocatGpuCache_IMPL(OBJGPU * pGpu)3702 void rcdbInitNocatGpuCache_IMPL(OBJGPU *pGpu)
3703 {
3704     OS_DRIVER_BLOCK         driverBlock;
3705     OBJSYS                 *pSys = SYS_GET_INSTANCE();
3706     Journal                *pRcdb = SYS_GET_RCDB(pSys);
3707 #if NOCAT_PROBE_FB_MEMORY
3708     NvU8                   *pCpuPtr;
3709     NV_STATUS              status;
3710 #endif
3711 
3712     if (pGpu == NULL)
3713     {
3714         return;
3715     }
3716     portMemSet(&driverBlock, 0x00, sizeof(driverBlock));
3717     if (osGetDriverBlock(pGpu->pOsGpuInfo, &driverBlock) == NV_OK)
3718     {
3719         pRcdb->nocatJournalDescriptor.loadAddress = (NvU64)driverBlock.driverStart;
3720     }
3721 
3722 #if NOCAT_PROBE_FB_MEMORY
3723     // Allocate some memory for virtual BAR2 testing
3724     if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_ALL_INST_IN_SYSMEM) && !IsAMODEL(pGpu))
3725     {
3726         memdescCreateExisting(&pGpu->nocatGpuCache.fbTestMemDesc,
3727             pGpu, NOCAT_FBSIZETESTED, ADDR_FBMEM, NV_MEMORY_UNCACHED, MEMDESC_FLAGS_NONE);
3728         memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_102,
3729                         (&pGpu->nocatGpuCache.fbTestMemDesc));
3730         if (status != NV_OK)
3731         {
3732             NV_PRINTF(LEVEL_ERROR, "Could not allocate vidmem for NOCAT bar2 testing\n");
3733             return;
3734         }
3735         pCpuPtr = kbusMapRmAperture_HAL(pGpu, &pGpu->nocatGpuCache.fbTestMemDesc);
3736         if (pCpuPtr == NULL)
3737         {
3738             memdescFree(&pGpu->nocatGpuCache.fbTestMemDesc);
3739             memdescDestroy(&pGpu->nocatGpuCache.fbTestMemDesc);
3740             pGpu->nocatGpuCache.pCpuPtr = NULL;
3741             return;
3742         }
3743         pGpu->nocatGpuCache.pCpuPtr = pCpuPtr;
3744     }
3745 #endif
3746     // initialize the context cache
3747     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3748     {
3749         _rcdbNocatCollectContext(pGpu, pRcdb, NULL);
3750     }
3751     portAtomicDecrementS32(&concurrentRingBufferAccess);
3752 
3753     return;
3754 }
3755 
3756 /*
3757 ** rcdbCleanupNocatGpuCache_IMPL returns per GPU resources used by NOCAT.
3758 **
3759 **  parameters:
3760 **      pGpu -- a pointer to the GPU Object the containing the cache
3761 **
3762 **  notes:
3763 **      This will free up the FB test window if allocated, and clear out the cache
3764 **
3765 */
rcdbCleanupNocatGpuCache_IMPL(OBJGPU * pGpu)3766 void rcdbCleanupNocatGpuCache_IMPL(OBJGPU *pGpu)
3767 {
3768 #if NOCAT_PROBE_FB_MEMORY
3769     if (pGpu == NULL)
3770     {
3771         return;
3772     }
3773     if (pGpu->nocatGpuCache.pCpuPtr != NULL)
3774     {
3775         kbusUnmapRmApertureWithFlags_HAL(pGpu, &pGpu->nocatGpuCache.fbTestMemDesc,
3776             &pGpu->nocatGpuCache.pCpuPtr, TRANSFER_FLAGS_NONE);
3777         memdescFree(&pGpu->nocatGpuCache.fbTestMemDesc);
3778         memdescDestroy(&pGpu->nocatGpuCache.fbTestMemDesc);
3779     }
3780     portMemSet(&pGpu->nocatGpuCache, 0, sizeof(pGpu->nocatGpuCache));
3781 #endif
3782 
3783     return;
3784 }
3785 
3786 
3787 
3788 /*
3789 ** rcdbNocatInsertNocatError records a reported NOCAT error
3790 **
3791 **  parameters:
3792 **      pGpu        Pointer to GPU associated with the error
3793 **                  may be NULL if there is no GPU associated with the error
3794 **                  if NULL the primary GPU is used
3795 **      pNewEntry   A pointer to a structure that contains all the available data for the report
3796 */
3797 NvU32
rcdbNocatInsertNocatError(OBJGPU * pGpu,NOCAT_JOURNAL_PARAMS * pNewEntry)3798 rcdbNocatInsertNocatError(
3799     OBJGPU *pGpu,
3800     NOCAT_JOURNAL_PARAMS *pNewEntry
3801 )
3802 {
3803     OBJSYS                     *pSys = SYS_GET_INSTANCE();
3804     Journal                    *pRcdb = SYS_GET_RCDB(pSys);
3805 #if(NOCAT_PROBE_FB_MEMORY)
3806     NvBool                      bCheckFBState = NV_FALSE;
3807 #endif
3808     RmRCCommonJournal_RECORD   *pCommon = NULL;
3809     RM_NOCAT_JOURNAL_ENTRY     *pNocatEntry = NULL;
3810     NvU32                       id = INVALID_RCDB_NOCAT_ID;
3811     const char                 *pSource = NULL;
3812     NvU32                       diagBufferLen = 0;
3813     const char                 *pFaultingEngine = NULL;
3814     NvBool                      postRecord;
3815     // validate inputs.
3816     if (pRcdb == NULL)
3817     {
3818         return NV_ERR_INVALID_ARGUMENT;
3819     }
3820     pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_REQ_IDX]++;
3821     if (pNewEntry == NULL)
3822     {
3823         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++;
3824         return 0;
3825     }
3826     // assign a timestamp if none was provided
3827     if (pNewEntry->timestamp == 0)
3828     {
3829         pNewEntry->timestamp = osGetTimestamp();
3830     }
3831 
3832     // initially set postRecord based on the current state of the lock;
3833     postRecord = pRcdb->nocatJournalDescriptor.lockTimestamp == 0;
3834 
3835     // perform any record type specific setup
3836     switch (pNewEntry->recType)
3837     {
3838     case NV2080_NOCAT_JOURNAL_REC_TYPE_BUGCHECK:
3839 #if(NOCAT_PROBE_FB_MEMORY)
3840         bCheckFBState = NV_TRUE;
3841 #endif
3842         // fall thru
3843 
3844     case NV2080_NOCAT_JOURNAL_REC_TYPE_TDR:
3845         // lock the journal so we don't wrap over the record we are inserting.
3846         if (pRcdb->nocatJournalDescriptor.lockTimestamp == 0)
3847         {
3848             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_LOCKED_IDX]++;
3849         }
3850         else
3851         {
3852             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_LOCK_UPDATED_IDX]++;
3853         }
3854 
3855         pRcdb->nocatJournalDescriptor.lockTimestamp = pNewEntry->timestamp;
3856         postRecord = NV_TRUE;
3857         break;
3858 
3859     case NV2080_NOCAT_JOURNAL_REC_TYPE_RC:
3860 #if(NOCAT_PROBE_FB_MEMORY)
3861         bCheckFBState = NV_TRUE;
3862 #endif
3863         // set the source
3864         pSource = "RC Error";
3865         break;
3866 
3867     case NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT:
3868         // set the source
3869         pSource = "ASSERT";
3870         break;
3871 
3872     case NV2080_NOCAT_JOURNAL_REC_TYPE_ENGINE:
3873         break;
3874 
3875     case NV2080_NOCAT_JOURNAL_REC_TYPE_UNKNOWN:
3876     default:
3877         return 0;
3878         break;
3879     }
3880     // check if we should post the record when locked.
3881     if (!postRecord)
3882     {
3883         if ((NvS64)(pNewEntry->timestamp - pRcdb->nocatJournalDescriptor.lockTimestamp) < 0)
3884         {
3885             // the record predates the lock, so it's Grandfathered in.
3886             postRecord = NV_TRUE;
3887             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_GRANDFATHERED_RECORD_IDX]++;
3888         }
3889         else
3890         {
3891             // we are dropping the record, count that.
3892             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_LOCKED_OUT_IDX]++;
3893         }
3894     }
3895     if (postRecord)
3896     {
3897         // is the buffer available?
3898         if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3899         {
3900             // start recording this new record by allocating a record from the buffer.
3901             pNocatEntry = _rcdbAllocNocatJournalRecord(pGpu, pRcdb, &pCommon);
3902             if (pNocatEntry != NULL)
3903             {
3904                 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECTED_IDX]++;
3905 
3906                 // update the time stamp to the one supplied.
3907                 pCommon->timeStamp = pNewEntry->timestamp;
3908 
3909                 // save the record Id for the type.
3910                 pRcdb->nocatJournalDescriptor.lastRecordId[pNewEntry->recType] =
3911                     pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ANY] =
3912                     pRcdb->nocatJournalDescriptor.nextRecordId - 1;
3913 
3914                 // set the type.
3915                 pNocatEntry->nocatJournalEntry.recType = pNewEntry->recType;
3916 
3917                 // set bugcheck
3918                 pNocatEntry->nocatJournalEntry.bugcheck = pNewEntry->bugcheck;
3919 
3920                 // get context
3921                 _rcdbNocatCollectContext(pGpu, pRcdb, &(pNocatEntry->nocatGpuState));
3922 
3923 #if(NOCAT_PROBE_FB_MEMORY)
3924                 if ((bCheckFBState)
3925                     && (pGpu != NULL)
3926                     && (pGpu->nocatGpuCache.pCpuPtr != NULL)
3927                     // If using Coherent CPU mapping instead of BAR2 do not call VerifyBar2
3928                     && !pGpu->getProperty(pGpu, PDB_PROP_GPU_COHERENT_CPU_MAPPING))
3929                 {
3930                     switch (kbusVerifyBar2_HAL(pGpu, GPU_GET_KERNEL_BUS(pGpu),
3931                         &pGpu->nocatGpuCache.fbTestMemDesc, pGpu->nocatGpuCache.pCpuPtr, 0, NOCAT_FBSIZETESTED))
3932                     {
3933                     case NV_OK:                     // everything passed
3934                         break;
3935 
3936                     case NV_ERR_MEMORY_ERROR:       // BAR 0 failed & BAR 2 was not checked, or BAR 2 failed
3937                         // for now we don't know which BAR failed, so mark both.
3938                         // but only one BAR failed.
3939                         // (if BAR 0 Failed, BAR 2 was not checked)
3940                         pCommon->stateMask |=
3941                             NV_RM_JOURNAL_STATE_MASK_VIDMEM_FAILED_BAR0
3942                             | NV_RM_JOURNAL_STATE_MASK_VIDMEM_FAILED_BAR2;
3943                         break;
3944 
3945                     default:                        // some other processing error cause us to not test the BAR
3946                         break;
3947                     }
3948                 }
3949 #endif
3950                 // is there a valid string for source?
3951                 // (non NULL ptr & more than just a termination)
3952                 if ((pNewEntry->pSource != NULL) && (pNewEntry->pSource[0] != '\0'))
3953                 {
3954                     // yes, use that.
3955                     pSource = pNewEntry->pSource;
3956                 }
3957                 // the caller did not supply a source,
3958                 // did we set a default source based on record type?
3959                 else if (pSource == NULL)
3960                 {
3961                     // no, supply the unknown string for source.
3962                     pSource = NOCAT_UNKNOWN_STR;
3963                 }
3964                 portStringCopy((char*)pNocatEntry->nocatJournalEntry.source,
3965                     NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3966                     pSource,
3967                     portStringLength(pSource) + 1);
3968 
3969                 pNocatEntry->nocatJournalEntry.subsystem = pNewEntry->subsystem;
3970                 pNocatEntry->nocatJournalEntry.errorCode = pNewEntry->errorCode;
3971 
3972                 if ((pNewEntry->pDiagBuffer != NULL) && (pNewEntry->diagBufferLen != 0))
3973                 {
3974                     // checking length here as we don't want portMemCopy to assert
3975                     if (pNewEntry->diagBufferLen < NV_ARRAY_ELEMENTS(pNocatEntry->nocatJournalEntry.diagBuffer))
3976                     {
3977                         diagBufferLen = pNewEntry->diagBufferLen;
3978                     }
3979                     else
3980                     {
3981                         // make best effort
3982                         diagBufferLen = NV_ARRAY_ELEMENTS(pNocatEntry->nocatJournalEntry.diagBuffer);
3983                     }
3984                     portMemCopy(pNocatEntry->nocatJournalEntry.diagBuffer,
3985                         sizeof(pNocatEntry->nocatJournalEntry.diagBuffer),
3986                         pNewEntry->pDiagBuffer, diagBufferLen);
3987                 }
3988                 pNocatEntry->nocatJournalEntry.diagBufferLen = diagBufferLen;
3989 
3990                 pFaultingEngine = pNewEntry->pFaultingEngine != NULL ?
3991                     pNewEntry->pFaultingEngine : NOCAT_UNKNOWN_STR;
3992 
3993                 portStringCopy((char*)pNocatEntry->nocatJournalEntry.faultingEngine,
3994                     NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3995                     pFaultingEngine, portStringLength(pFaultingEngine) + 1);
3996 
3997                 _rcdbSetTdrReason(pRcdb, pNewEntry->tdrReason,
3998                     (char*)pNocatEntry->nocatJournalEntry.tdrReason,
3999                     sizeof(pNocatEntry->nocatJournalEntry.tdrReason));
4000 
4001                 _rcdbReleaseNocatJournalRecord(pNocatEntry);
4002             }
4003             else
4004             {
4005                 // record was not allocated, bail.
4006                 postRecord = NV_FALSE;
4007                 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_FAILED_IDX]++;
4008             }
4009         }
4010         else
4011         {
4012             // we are busy, so we can't insert the record, count the record as dropped & count the busy.
4013             postRecord = NV_FALSE;
4014             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
4015         }
4016         portAtomicDecrementS32(&concurrentRingBufferAccess);
4017     }
4018 
4019     // no matter what happened, trigger the event to indicate a record was processed.
4020     _rcdbSendNocatJournalNotification(pGpu, pRcdb, postRecord, pCommon, pNewEntry->recType);
4021 
4022     return id;
4023 }
4024 /*
4025 ** rcdbNocatInsertBugcheck is the interface to record a bugcheck NOCAT report
4026 **
4027 **  parameters:
4028 **      deviceInstance  The instance of the GPU associated with the bugcheck.
4029 **      bugcheck        The bugcheck number
4030 */
4031 NvU32
rcdbNocatInsertBugcheck(NvU32 deviceInstance,NvU32 bugCheckCode)4032 rcdbNocatInsertBugcheck
4033 (
4034     NvU32               deviceInstance,
4035     NvU32               bugCheckCode)
4036 {
4037     NOCAT_JOURNAL_PARAMS newEntry;
4038 
4039     portMemSet(&newEntry, 0, sizeof(newEntry));
4040     newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_BUGCHECK;
4041     newEntry.bugcheck = bugCheckCode;
4042     newEntry.pSource = "OS";
4043     newEntry.errorCode = bugCheckCode;
4044     return rcdbNocatInsertNocatError(gpumgrGetGpu(deviceInstance), &newEntry);
4045 }
4046 
4047 /*
4048 ** rcdbNocatInitEngineErrorEvent initializes a parameter structure for an engine error event
4049 **
4050 **  parameters:
4051 **      pNewEntry       Pointer to event parameter structure to be initialized
4052 */
4053 NV_STATUS
rcdbNocatInitEngineErrorEvent(NOCAT_JOURNAL_PARAMS * pNewEntry)4054 rcdbNocatInitEngineErrorEvent
4055 (
4056     NOCAT_JOURNAL_PARAMS *pNewEntry
4057 )
4058 {
4059     if (pNewEntry == NULL)
4060     {
4061         return NV_ERR_INVALID_ARGUMENT;
4062     }
4063     portMemSet(pNewEntry, 0, sizeof(*pNewEntry));
4064     pNewEntry->recType = NV2080_NOCAT_JOURNAL_REC_TYPE_ENGINE;
4065     return NV_OK;
4066 }
4067 
4068 /*
4069 ** rcdbNocatInsertEngineError records a reported NOCAT error from an engine,
4070 **
4071 **  parameters:
4072 **      pGpu            Pointer to GPU associated with the error
4073 **                      may be NULL if there is no GPU associated with the error
4074 **                      if NULL the primary GPU is used
4075 **      pSource         A string indicating the reporting source of the error.
4076 **                      if NULL, a default values will be used
4077 **      subsystem       The optional subsystem ID used by the source to identify the error
4078 **      errorCode       The error code
4079 **      pDiagBuffer     A pointer to the diagnostic buffer associated with the error
4080 **                      may be NULL
4081 **      diagBufferLen   The size of the diagnostic buffer
4082 **                      if the size exceeds the supported diagBuffer size, the buffer contents will be truncated to fit.
4083 */
4084 NvU32
rcdbNocatInsertEngineError(OBJGPU * pGpu,const char * pSource,NvU32 subsystem,NvU64 errorCode,NvU8 * pDiagBuffer,NvU32 diagBufferLen)4085 rcdbNocatInsertEngineError(
4086     OBJGPU             *pGpu,
4087     const char         *pSource,
4088     NvU32               subsystem,
4089     NvU64               errorCode,
4090     NvU8               *pDiagBuffer,
4091     NvU32               diagBufferLen
4092 )
4093 {
4094     NOCAT_JOURNAL_PARAMS newEntry;
4095 
4096     rcdbNocatInitEngineErrorEvent(&newEntry);
4097     newEntry.pSource = pSource;
4098     newEntry.subsystem = subsystem;
4099     newEntry.errorCode = errorCode;
4100     newEntry.pDiagBuffer = pDiagBuffer;
4101     newEntry.diagBufferLen = diagBufferLen;
4102     return rcdbNocatInsertNocatError(pGpu, &newEntry);
4103 }
4104 
4105 /*
4106 ** rcdbNocatInsertTDRError records an TDR error,
4107 **
4108 **  parameters:
4109 **      pGpu            Pointer to GPU associated with the error
4110 **                      may be NULL if there is no GPU associated with the error
4111 **                      if NULL the primary GPU is used
4112 **      pSource         A string indicating the reporting source of the error.
4113 **                      if NULL, a default values will be used
4114 **      subsystem       The optional subsystem ID used by the source to identify the error
4115 **      errorCode       The error code
4116 **      TDRBucket       The TDR bucket
4117 **      pDiagBuffer     A pointer to the diagnostic buffer associated with the error
4118 **                      may be NULL
4119 **      diagBufferLen   The size of the diagnostic buffer
4120 **                      if the size exceeds the supported diagBuffer size,
4121 **                      the buffer contents will be truncated to fit.
4122 **      tdrReason       A reason code for the TDR
4123 **      pFaultingApp    A pointer to the faulting app name if known
4124 */
4125 NvU32
rcdbNocatInsertTDRError(OBJGPU * pGpu,const char * pSource,NvU32 subsystem,NvU64 errorCode,NvU32 TdrBucket,NvU8 * pDiagBuffer,NvU32 diagBufferLen,NvU32 tdrReason,const char * pFaultingEngine)4126 rcdbNocatInsertTDRError
4127 (
4128     OBJGPU             *pGpu,
4129     const char         *pSource,
4130     NvU32               subsystem,
4131     NvU64               errorCode,
4132     NvU32               TdrBucket,
4133     NvU8               *pDiagBuffer,
4134     NvU32               diagBufferLen,
4135     NvU32               tdrReason,
4136     const char         *pFaultingEngine
4137 )
4138 {
4139     NOCAT_JOURNAL_PARAMS newEntry;
4140 
4141     portMemSet(&newEntry, 0, sizeof(newEntry));
4142     newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_TDR;
4143     newEntry.pSource = pSource;
4144     newEntry.subsystem = subsystem;
4145     newEntry.errorCode = errorCode;
4146     newEntry.pDiagBuffer = pDiagBuffer;
4147     newEntry.diagBufferLen = diagBufferLen;
4148     newEntry.pFaultingEngine = pFaultingEngine;
4149     return rcdbNocatInsertNocatError(pGpu, &newEntry);
4150 }
4151 NV_STATUS
rcdbNocatInitRCErrorEvent(NOCAT_JOURNAL_PARAMS * pNewEntry)4152 rcdbNocatInitRCErrorEvent
4153 (
4154     NOCAT_JOURNAL_PARAMS *pNewEntry
4155 )
4156 {
4157     if (pNewEntry == NULL)
4158     {
4159         return NV_ERR_INVALID_ARGUMENT;
4160     }
4161     portMemSet(pNewEntry, 0, sizeof(*pNewEntry));
4162     pNewEntry->recType = NV2080_NOCAT_JOURNAL_REC_TYPE_RC;
4163     pNewEntry->pSource = "RC ERROR";
4164     return NV_OK;
4165 }
4166 
4167 /*
4168 ** _rcdbNocatReportAssert adds an assert record.
4169 **
4170 **  parameters:
4171 **      pGpu        Pointer to GPU associated with the error
4172 **                  may be NULL
4173 **      pAssertRec  A pointer to the assert to report
4174 */
4175 NV_STATUS
_rcdbNocatReportAssert(OBJGPU * pGpu,RmRCCommonAssert_RECORD * pAssertRec)4176 _rcdbNocatReportAssert
4177 (
4178     OBJGPU                  *pGpu,
4179     RmRCCommonAssert_RECORD *pAssertRec
4180 )
4181 {
4182     OBJSYS                 *pSys = SYS_GET_INSTANCE();
4183     Journal                *pRcdb = SYS_GET_RCDB(pSys);
4184     NOCAT_JOURNAL_PARAMS    newEntry;
4185     RM_NOCAT_ASSERT_DIAG_BUFFER    diagBuffer;
4186     RM_NOCAT_ASSERT_DIAG_BUFFER   *pDiagData;
4187     NvU32                   idx;
4188     RM_NOCAT_JOURNAL_ENTRY *pNocatEntry = NULL;
4189     NvU32                   gpuCnt= 0;
4190     OBJGPU                  *pTmpGpu = gpumgrGetGpu(0);
4191 
4192     // validate inputs.
4193     if ((pRcdb == NULL) || (pAssertRec == NULL))
4194     {
4195         return NV_ERR_INVALID_ARGUMENT;
4196     }
4197     if (pGpu == NULL)
4198     {
4199         // we don't have a GPU, if there is only 1 GPU,
4200         // we can safely use it for logging this assert
4201         gpumgrGetGpuAttachInfo(&gpuCnt, NULL);
4202         if (gpuCnt == 1)
4203         {
4204             pGpu = pTmpGpu;
4205         }
4206     }
4207 
4208     // start off assuming we will be recording a report
4209     portMemSet(&newEntry, 0, sizeof(newEntry));
4210     newEntry.timestamp = pAssertRec->common.timeStamp;
4211     newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT;
4212     newEntry.pSource = "ASSERT";
4213 
4214     // save the assert point as the error code.
4215     newEntry.errorCode =
4216         (NvU32)((pAssertRec->breakpointAddrHint - pRcdb->nocatJournalDescriptor.loadAddress)
4217             & 0xffffffff);
4218 
4219     // put the line number in the upper 32 bits.
4220     newEntry.errorCode |= ((NvU64)pAssertRec->lineNum) << 32;
4221 
4222     for (idx = 0; idx < NV_ARRAY_ELEMENTS(pAssertRec->callStack); idx++)
4223     {
4224         diagBuffer.callStack[idx] =
4225             (NvU32)((pAssertRec->callStack[idx] - pRcdb->nocatJournalDescriptor.loadAddress)
4226                 & 0xffffffff);
4227     }
4228     // initialize count
4229     diagBuffer.count = 1;
4230 
4231     // setup the pointer to our diag buffer & its length
4232     newEntry.pDiagBuffer = (NvU8 *)&diagBuffer;
4233 
4234     newEntry.diagBufferLen = sizeof(diagBuffer);
4235 
4236     // is the last thing we logged an assert, & is this the same assert?
4237     if ((pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT]
4238         == pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ANY])
4239         && (0 == portMemCmp(&pRcdb->nocatJournalDescriptor.lastAssertData,
4240             diagBuffer.callStack,                                       // same stack
4241             sizeof(diagBuffer.callStack))))
4242     {
4243         // it is the same as the last assert we logged. so don't log it again.
4244         // but see if we can increment the counter in an unreported assert.
4245         // check if the last record is also an assert
4246         if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
4247         {
4248             // get the last record from the buffer
4249             _rcdbGetNewestNocatJournalRecordForType(pRcdb,
4250                 NV2080_NOCAT_JOURNAL_REC_TYPE_ANY,
4251                 NULL, &pNocatEntry);
4252             if (pNocatEntry != NULL)
4253             {
4254                 // is it an assert?
4255                 if (pNocatEntry->nocatJournalEntry.recType == (NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT))
4256                 {
4257                     // increment the count
4258                     pDiagData = (RM_NOCAT_ASSERT_DIAG_BUFFER*)&pNocatEntry->nocatJournalEntry.diagBuffer;
4259                     pDiagData->count++;
4260                 }
4261                 _rcdbReleaseNocatJournalRecord(pNocatEntry);
4262 
4263             }
4264         }
4265         else
4266         {
4267             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
4268         }
4269         portAtomicDecrementS32(&concurrentRingBufferAccess);
4270     }
4271     else
4272     {
4273         // we are logging this assert, save off the stack so we can use it to
4274         // compare against future asserts.
4275         portMemCopy(&pRcdb->nocatJournalDescriptor.lastAssertData,
4276             sizeof(pRcdb->nocatJournalDescriptor.lastAssertData),
4277             &diagBuffer, sizeof(diagBuffer));
4278         rcdbNocatInsertNocatError(pGpu, &newEntry);
4279     }
4280 
4281     return NV_OK;
4282 }
4283 
4284 /*
4285 ** rcdbNocatInsertRMCDErrorEvent creates an event from an RMCD error block
4286 **
4287 **  parameters:
4288 **  pGpu        pointer to GPU object associated with the error
4289 **  recType     the type of event to create
4290 **  pSource     a pointer to the source string
4291 **  subsystem   the subsystem associated with the event.
4292 **  errorCode   error code for the event
4293 **  pFault      pointer to a faulting component identifier associated with the error
4294 */
rcdbNocatInsertRMCDErrorEvent(OBJGPU * pGpu,NvU32 recType,const char * pSource,NvU32 subsystem,NvU64 errorCode,const char * pFault,RMCD_ERROR_BLOCK * pRcdError)4295 NvU32 rcdbNocatInsertRMCDErrorEvent(OBJGPU *pGpu, NvU32 recType,
4296     const char *pSource, NvU32 subsystem, NvU64 errorCode, const char *pFault,
4297     RMCD_ERROR_BLOCK *pRcdError)
4298 {
4299     NOCAT_JOURNAL_PARAMS    newEntry;
4300 
4301     portMemSet(&newEntry, 0, sizeof(newEntry));
4302     newEntry.recType = recType;
4303     newEntry.pSource = pSource;
4304     newEntry.subsystem = subsystem;
4305     newEntry.errorCode = errorCode;
4306     newEntry.pFaultingEngine = pFault;
4307     if (pRcdError != NULL)
4308     {
4309         newEntry.pDiagBuffer = (NvU8 * )pRcdError->pBlock;
4310         newEntry.diagBufferLen = pRcdError->blockSize;
4311     }
4312     return rcdbNocatInsertNocatError(pGpu, &newEntry);
4313 }
4314 
4315 /*
4316 ** rcdbSetNocatTdrReason sets the TDR reason code in the most recent TDR record if there is one,
4317 **  otherwise, it creates one with the reason code.
4318 **
4319 **  parameters:
4320 **      pReasonData     the data supplied with including the reason code.
4321 **                      if a TDR record exists, the reason will be added to the existing record,
4322 **                      otherwise the rest of the data will be used to create a new TDR event.
4323 */
rcdbSetNocatTdrReason(NV2080CtrlNocatJournalDataTdrReason * pReasonData)4324 NV_STATUS rcdbSetNocatTdrReason
4325 (
4326     NV2080CtrlNocatJournalDataTdrReason *pReasonData
4327 )
4328 {
4329     OBJSYS             *pSys = SYS_GET_INSTANCE();
4330     Journal            *pRcdb = SYS_GET_RCDB(pSys);
4331     RM_NOCAT_JOURNAL_ENTRY* pNocatEntry = NULL;
4332 
4333     // validate inputs.
4334     if ((pRcdb == NULL) || (pReasonData == NULL))
4335     {
4336         return NV_ERR_INVALID_ARGUMENT;
4337     }
4338     pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_UPDATE_REQ_IDX]++;
4339 
4340     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
4341     {
4342         // see if there is a TDR record.
4343         _rcdbGetNewestNocatJournalRecordForType(pRcdb,
4344             NV2080_NOCAT_JOURNAL_REC_TYPE_TDR,
4345             NULL, &pNocatEntry);
4346         if (pNocatEntry != NULL)
4347         {
4348             // there is, set the reason.
4349             _rcdbSetTdrReason(pRcdb, pReasonData->reasonCode,
4350                 (char *)pNocatEntry->nocatJournalEntry.tdrReason,
4351                 sizeof(pNocatEntry->nocatJournalEntry.tdrReason));
4352             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_UPDATED_IDX]++;
4353             _rcdbReleaseNocatJournalRecord(pNocatEntry);
4354         }
4355     }
4356     portAtomicDecrementS32(&concurrentRingBufferAccess);
4357 
4358     // if we did not get a TDR record, create one.
4359     // we need to do it after the ring buffers are released.
4360     if (pNocatEntry == NULL)
4361     {
4362         NOCAT_JOURNAL_PARAMS newEntry;
4363 
4364         portMemSet(&newEntry, 0, sizeof(newEntry));
4365         newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_TDR;
4366         newEntry.pSource = (char *)pReasonData->source;
4367         newEntry.subsystem = pReasonData->subsystem;
4368         newEntry.errorCode = pReasonData->errorCode;
4369         newEntry.tdrReason = pReasonData->reasonCode;
4370         return rcdbNocatInsertNocatError(NULL, &newEntry);
4371     }
4372     return NV_OK;
4373 }
4374