1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 /***************************** HW State Routines ***************************\
25 *                                                                           *
26 *         RM robust error journal (formerly RCDB)                           *
27 *                                                                           *
28 \***************************************************************************/
29 
30 #include "gpu_mgr/gpu_mgr.h"
31 #include "nvRmReg.h"
32 #include "nvBldVer.h"
33 #include "nvVer.h"
34 #include "os/os.h"
35 #include "core/system.h"
36 #include "gpu/gpu.h"
37 #include "diagnostics/journal.h"
38 #include "platform/chipset/chipset.h"
39 #include "diagnostics/nv_debug_dump.h"
40 #include "diagnostics/tracer.h"
41 #include "core/locks.h"
42 #include "rmapi/rs_utils.h"
43 #include "vgpu/rpc.h"
44 #include "gpu/bus/kern_bus.h"
45 #include "gpu/mem_mgr/mem_mgr.h"
46 #include "nvdevid.h"
47 #include "nvop.h"
48 #include "jt.h"
49 
50 
51 
52 #include "ctrl/ctrl0000/ctrl0000nvd.h"
53 
54 #include "nvlimits.h"
55 #include "Nvcm.h"
56 
57 #include "lib/protobuf/prb_util.h"
58 #include "g_all_dcl_pb.h"
59 #include "g_nvdebug_pb.h"
60 #include "nv_ref.h"
61 
62 #define NOCAT_UNKNOWN_STR                       "*** unknown ***"
63 #define NOCAT_NA_STR                            "N/A"
64 #define NOCAT_LEGACY_STR                        "LEGACY"
65 #define NOCAT_FULLCHIP_TDR_STR                  "FULL CHIP RESET"
66 #define NOCAT_BUS_RESET_TDR_STR                 "BUS RESET"
67 #define NOCAT_GC6_RESET_TDR_STR                 "GC6 RESET"
68 #define NOCAT_NORMAL_TDR_STR                    "NORMAL TDR"
69 #define NOCAT_UCODE_RESET_TDR_STR               "UCODE RESET"
70 #define NOCAT_SURPRISE_REMOVAL_TDR_STR          "SURPRISE REMOVAL"
71 #define NOCAT_DEFAULT_TAG_VALUE_STR             "prod"
72 #define NOCAT_DEFAULT_TDR_REASON_SRC_STR        "KMD"
73 #define NOCAT_FBSIZETESTED                      0x10
74 
75 // Reducing size to 4K for reducing non-paged allocations on win8
76 #define NVDUMP_DEBUGGER_BUFFER_SIZE (4 * 1024)
77 
78 #define JOURNAL_BUFFER_SIZE_DEFAULT    (4 * 1024)
79 
80 #define JOURNAL_ASSERT_RECORD_QUALIFYING_STACK_ENTRIES    5
81 
82 static void nvdDebuggerControlFunc(void);
83 
84 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
85 #if !defined(DEBUG) && !defined(QA_BUILD)
86 static NvBool rcdProbeGpuPresent(OBJGPU *pGpu, NvU64 ip);
87 static NvBool rcdProbeAllGpusPresent(NvU64 ip);
88 static volatile NvS32 probeGpuRecursion = 0;
89 #endif
90 #endif
91 static NvU32 _rcdbGetOcaRecordSize(Journal *pRcDB, RMCD_RECORD_TYPE type);
92 static volatile NvS32 concurrentRingBufferAccess = 0;
93 static volatile NvS32 assertListRecursion = 0;
94 static void rcdbFindRingBufferForType(Journal *pRcDB, RMCD_RECORD_TYPE recType, RING_BUFFER_LOG **ppRingBuffer);
95 static NV_STATUS _rcdbGetNocatJournalRecord(OBJRCDB* pRcdb,
96     NvU32 id, NvBool bExactMatch,
97     RmRCCommonJournal_RECORD** ppReturnedCommon, RM_NOCAT_JOURNAL_ENTRY** ppReturnedJournal);
98 static NV_STATUS _rcdbReleaseNocatJournalRecord(RM_NOCAT_JOURNAL_ENTRY* pReturnedJournal);
99 static NV_STATUS _rcdbNocatReportAssert(OBJGPU* pGpu, RmRCCommonAssert_RECORD* pAssert);
100 
101 // Global flag to make sure we never re-enter the nvLog code.
102 #if defined(DEBUG) || defined(ASSERT_BUILD) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS))
103 static volatile NvS32 nvLogRecursion = 0;
104 #endif
105 
106 // NvDump interface config - communicates with external kernel debuggers
107 NVDUMP_EXPORT volatile NV_DECLARE_ALIGNED(NVDUMP_CONFIG, 8) nvDumpConfig =
108 {
109     NVDUMP_CONFIG_SIGNATURE, // sigHead
110     NvP64_NULL, // debuggerControlFuncAddr
111     { NvP64_NULL, NVDUMP_DEBUGGER_BUFFER_SIZE, 0 }, // buffer
112     0, // gpuSelect
113     NVDUMP_COMPONENT_SYS_ALL, // component
114     NVDUMP_STATUS_IDLE, // dumpStatus
115     NV_OK, // rmStatus
116 
117     NVDUMP_CONFIG_SIGNATURE // sigTail
118 };
119 
120 void
121 rcdbDestruct_IMPL(Journal *pRcDB)
122 {
123     EVENT_JOURNAL *pJournal = &pRcDB->Journal;
124 
125     // Deallocate NvDebug debugger dump buffer.
126     if (nvDumpConfig.buffer.address != NvP64_NULL)
127     {
128         portMemFree(NvP64_VALUE(nvDumpConfig.buffer.address));
129         nvDumpConfig.buffer.address = NvP64_NULL;
130     }
131 
132     // Delete Journal and Btree
133     if (pJournal->pBuffer != NULL)
134     {
135         portMemFree(pJournal->pBuffer);
136         portMemFree(pJournal->AssertList.ppList);
137 
138         // clear journal of anything
139         portMemSet(pJournal, 0, sizeof(EVENT_JOURNAL));
140     }
141 
142     rcdbClearErrorHistory(pRcDB);
143 
144     rcdbDestroyRingBufferCollection(pRcDB);
145 
146     portMemFree(pRcDB->previousDriverVersion);
147     pRcDB->previousDriverVersion = NULL;
148 
149     portMemFree(pRcDB->previousDriverBranch);
150     pRcDB->previousDriverBranch = NULL;
151 }
152 
153 static void
154 _initJournal(EVENT_JOURNAL *pJournal, NvU32 size)
155 {
156     // verify we are not abandoning any memory allocations.
157     NV_ASSERT(NULL == pJournal->pBuffer);
158     NV_ASSERT(NULL == (NvU8*) pJournal->AssertList.ppList);
159 
160     // init the Journal to an empty buffer.
161     pJournal->pBuffer = NULL;
162     pJournal->BufferSize = 0;
163     pJournal->pFree = pJournal->pBuffer;
164     pJournal->BufferRemaining = pJournal->BufferSize;
165     pJournal->pCurrCollection = NULL;
166     pJournal->RecordCount = 0;
167 
168     // init the assert list to an empty buffer.
169     pJournal->AssertList.ppList = NULL;
170     pJournal->AssertList.Size = 0;
171     pJournal->AssertList.Count = 0;
172     pJournal->AssertList.QualifyingStackSize = JOURNAL_ASSERT_RECORD_QUALIFYING_STACK_ENTRIES;
173 
174     // allocate and initialize journal buffer memory
175     pJournal->pBuffer = portMemAllocNonPaged(size);
176     if (pJournal->pBuffer != NULL )
177     {
178         pJournal->BufferSize = size;
179         pJournal->pFree = pJournal->pBuffer;
180         pJournal->BufferRemaining = pJournal->BufferSize;
181 
182         // if the journal is large enough to hold at least one assert record,
183         // init the assert list as well.
184         if (sizeof(RmRCCommonAssert_RECORD) <= pJournal->BufferSize)
185         {
186             pJournal->AssertList.Size = pJournal->BufferSize / sizeof(RmRCCommonAssert_RECORD);
187             pJournal->AssertList.ppList = portMemAllocNonPaged(pJournal->AssertList.Size * sizeof(pJournal->AssertList.ppList[0]));
188             if (pJournal->AssertList.ppList == NULL )
189             {
190                 NV_PRINTF(LEVEL_ERROR,
191                           "Failure to allocate RC assert tracking buffer \n");
192                 pJournal->AssertList.Size = 0;
193             }
194         }
195     }
196     else
197     {
198         NV_PRINTF(LEVEL_ERROR, "Failure to allocate RC journal buffer \n");
199     }
200 }
201 
202 NV_STATUS
203 rcdbConstruct_IMPL(Journal *pRcDB)
204 {
205     EVENT_JOURNAL *pJournal = &pRcDB->Journal;
206     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
207     NvU32 i;
208     void *pBuffer;
209 
210     // Time parameters
211     NvU32 sec, usec;
212     NvU64 timeStamp;
213     NvU64 systemTime;
214     NvU64 timeStampFreq;
215 
216     _initJournal(pJournal, JOURNAL_BUFFER_SIZE_DEFAULT);
217 
218     portMemSet(pRingBufferColl, 0x00, sizeof(pRcDB->RingBufferColl));
219 
220     pRcDB->BugcheckCount = 0;
221 
222     // Allocate NvDebug debugger dump buffer.
223     pBuffer = portMemAllocNonPaged(nvDumpConfig.buffer.size);
224     if (pBuffer != NULL)
225     {
226         nvDumpConfig.buffer.address = NV_SIGN_EXT_PTR_TO_NvP64(pBuffer);
227     }
228     else
229     {
230         NV_PRINTF(LEVEL_ERROR,
231                   "failed to allocate NVD debugger dump buffer\n");
232     }
233 
234     // Initialize NvDebug debugger function address.
235     nvDumpConfig.debuggerControlFuncAddr = NV_SIGN_EXT_PTR_TO_NvP64(nvdDebuggerControlFunc);
236 
237     //
238     // Create RC Diagnostic report Wrap Buffer
239     //
240     if (NULL == rcdbCreateRingBuffer(pRcDB, RmRcDiagReport, MAX_RCDB_RCDIAG_WRAP_BUFF))
241     {
242         NV_PRINTF(LEVEL_ERROR, "failed to allocate RC Diagnostic Ring Buffer\n");
243     }
244     // init the RC error report data
245     pRcDB->RcErrRptNextIdx = 0;
246     pRcDB->RcErrRptRecordsDropped = NV_FALSE;
247 
248     // Initialize RC Error Counters.
249     for ( i = 0  ;  i < MAX_RC_ERROR_COUNTER  ;  i++)
250     {
251         pRcDB->rcErrorCounterArray[i].rcErrorType  = RC_ERROR_COUNTER_TYPE_INVALID;
252         pRcDB->rcErrorCounterArray[i].rcErrorCount = 0;
253         pRcDB->rcErrorCounterArray[i].rcLastCHID   = INVALID_CHID;
254         pRcDB->rcErrorCounterArray[i].rcLastTime   = 0;
255     }
256      pRcDB->rcErrorCounterArray[RC_ERROR_COUNTER_OTHER_INDEX].rcErrorType  = RC_ERROR_COUNTER_OTHER_TYPE;
257 
258      // clear the Nocat Queue descriptors & counters
259      portMemSet(&pRcDB->nocatJournalDescriptor, 0x00, sizeof(pRcDB->nocatJournalDescriptor));
260      portMemSet(pRcDB->nocatJournalDescriptor.lastRecordId, 0xff, sizeof(pRcDB->nocatJournalDescriptor.lastRecordId));
261      pRcDB->nocatJournalDescriptor.nocatLastRecordType = NV2080_NOCAT_JOURNAL_REC_TYPE_UNKNOWN;
262      pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks = NOCAT_CACHE_FRESHNESS_PERIOD_MS;
263      pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks *= osGetTimestampFreq();
264      pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks /= 1000ULL;
265 
266      //
267      // Create NOCAT report Wrap Buffer
268      //
269      if (NULL == rcdbCreateRingBuffer(pRcDB, RmNocatReport, MAX_RCDB_NOCAT_WRAP_BUFF))
270      {
271          NV_PRINTF(LEVEL_ERROR, "failed to allocate NOCAT Ring Buffer\n");
272      }
273 
274      // Save params for timestamp conversion
275      timeStampFreq = osGetTimestampFreq();
276      timeStamp = osGetTimestamp();
277      osGetCurrentTime(&sec, &usec);
278      systemTime = ((NvU64)sec * 1000000) + (NvU64)usec;
279 
280      pRcDB->systemTimeReference = systemTime - ((timeStamp * 1000000) / timeStampFreq);
281      pRcDB->timeStampFreq = timeStampFreq;
282 
283      return NV_OK;
284 }
285 
286 //
287 // Retrieve the previous driver version from volatile registry entires
288 // and then save the current driver version for next time.
289 //
290 NV_STATUS rcdbSavePreviousDriverVersion_IMPL
291 (
292     OBJGPU  *pGpu,
293     Journal *pRcDB
294 )
295 {
296     NV_STATUS nvStatus = NV_OK;
297 
298     NvU32     regEntrySize = 0;
299     NvU32     changeListNum = NV_LAST_OFFICIAL_CHANGELIST_NUM;
300 
301     // Only run this code only once each time the driver is loaded.
302     if (pRcDB->bPrevDriverCodeExecuted)
303         return NV_OK;
304 
305     pRcDB->bPrevDriverCodeExecuted = NV_TRUE;
306 
307     //
308     // Get the previous driver version information
309     // from volatile registry settings.
310     //
311     nvStatus = osReadRegistryVolatileSize(pGpu,
312         NV_REG_STR_RM_RC_PREV_DRIVER_VERSION, &regEntrySize);
313 
314     // Early exit if this platform does not support volatile registry.
315     if (nvStatus == NV_ERR_NOT_SUPPORTED)
316         return NV_OK;
317 
318     if ((NV_OK == nvStatus) && (0 != regEntrySize))
319     {
320         //
321         // Previous driver version is there, so assume all previous driver
322         // information is there as well.
323         //
324         pRcDB->previousDriverVersion = portMemAllocNonPaged(regEntrySize + 1);
325         if (pRcDB->previousDriverVersion == NULL)
326         {
327             nvStatus = NV_ERR_NO_MEMORY;
328             DBG_BREAKPOINT();
329             goto rcdbSavePreviousDriverVersion_writeRegistry;
330         }
331 
332         nvStatus = osReadRegistryVolatile(pGpu,
333                                      NV_REG_STR_RM_RC_PREV_DRIVER_VERSION,
334                                      (NvU8 *)pRcDB->previousDriverVersion,
335                                      regEntrySize);
336         if (nvStatus != NV_OK)
337         {
338             DBG_BREAKPOINT();
339             goto rcdbSavePreviousDriverVersion_writeRegistry;
340         }
341         pRcDB->previousDriverVersion[regEntrySize] = 0;
342 
343         nvStatus = osReadRegistryVolatileSize(pGpu,
344             NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH, &regEntrySize);
345         if ((nvStatus != NV_OK) || (0 == regEntrySize))
346         {
347             DBG_BREAKPOINT();
348             goto rcdbSavePreviousDriverVersion_writeRegistry;
349         }
350 
351         pRcDB->previousDriverBranch = portMemAllocNonPaged(regEntrySize + 1);
352         if (pRcDB->previousDriverBranch == NULL)
353         {
354             nvStatus = NV_ERR_NO_MEMORY;
355             DBG_BREAKPOINT();
356             goto rcdbSavePreviousDriverVersion_writeRegistry;
357         }
358 
359         nvStatus = osReadRegistryVolatile(pGpu,
360                                          NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH,
361                                          (NvU8 *)pRcDB->previousDriverBranch,
362                                          regEntrySize);
363         if (nvStatus != NV_OK)
364         {
365             DBG_BREAKPOINT();
366             goto rcdbSavePreviousDriverVersion_writeRegistry;
367         }
368         pRcDB->previousDriverBranch[regEntrySize] = 0;
369 
370         nvStatus = osReadRegistryVolatile(pGpu,
371                                      NV_REG_STR_RM_RC_PREV_DRIVER_CHANGELIST,
372                                      (NvU8 *)&pRcDB->prevDriverChangelist,
373                                      sizeof(pRcDB->prevDriverChangelist));
374         if (nvStatus != NV_OK)
375         {
376             DBG_BREAKPOINT();
377             goto rcdbSavePreviousDriverVersion_writeRegistry;
378         }
379 
380         nvStatus = osReadRegistryVolatile(pGpu,
381                                      NV_REG_STR_RM_RC_PREV_DRIVER_LOAD_COUNT,
382                                      (NvU8 *)&pRcDB->driverLoadCount,
383                                      sizeof(pRcDB->driverLoadCount));
384         if (nvStatus != NV_OK)
385         {
386             DBG_BREAKPOINT();
387             goto rcdbSavePreviousDriverVersion_writeRegistry;
388         }
389     }
390 
391     // Always write out the driver info, even if there was an error reading it.
392 rcdbSavePreviousDriverVersion_writeRegistry:
393     pRcDB->driverLoadCount++;
394 
395     osWriteRegistryVolatile(pGpu,
396                             NV_REG_STR_RM_RC_PREV_DRIVER_VERSION,
397                             (NvU8 *)NV_VERSION_STRING,
398                             sizeof(NV_VERSION_STRING));
399 
400     osWriteRegistryVolatile(pGpu,
401                             NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH,
402                             (NvU8 *)NV_BUILD_BRANCH_VERSION,
403                             sizeof(NV_BUILD_BRANCH_VERSION));
404 
405     osWriteRegistryVolatile(pGpu,
406                             NV_REG_STR_RM_RC_PREV_DRIVER_CHANGELIST,
407                             (NvU8 *)&changeListNum,
408                             sizeof(changeListNum));
409 
410     osWriteRegistryVolatile(pGpu,
411                             NV_REG_STR_RM_RC_PREV_DRIVER_LOAD_COUNT,
412                             (NvU8 *)&pRcDB->driverLoadCount,
413                             sizeof(pRcDB->driverLoadCount));
414 
415     return nvStatus;
416 }
417 
418 NV_STATUS rcdbAddAssertJournalRecWithLine(void *pVoidGpu, NvU32 lineNum, void** ppRec, NvU8 jGroup, NvU8 type, NvU16 size, NvU32 level, NvU64 key)
419 {
420     OBJSYS                     *pSys;
421     Journal                    *pRcDB;
422     OBJGPU                     *pPossibleNULLGpu;
423     JOURNAL_ASSERT_LIST        *pAssertList;
424     RmRCCommonAssert_RECORD     newAssertRec;
425     RmRCCommonAssert_RECORD    *pAssertRec;
426     NV_STATUS                   rmStatus = NV_ERR_GENERIC;
427     NvU32                       i;
428 
429     //
430     // Note: we allow NULL pGpu here, as many clients (such as KMD)
431     // do not have access to pGpu.  And much of the RM does not provide this either.
432     //
433     pPossibleNULLGpu = reinterpretCast(pVoidGpu, OBJGPU *);
434 
435     pSys = SYS_GET_INSTANCE();
436     if (!pSys)
437     {
438         return NV_ERR_INVALID_STATE;
439     }
440 
441     pRcDB = SYS_GET_RCDB(pSys);
442     if (!pRcDB)
443     {
444         return NV_ERR_INVALID_STATE;
445     }
446 
447     pAssertList = &pRcDB->Journal.AssertList;
448 
449     *ppRec = NULL;
450 
451     RMTRACE_PROBE4_PRIMTYPE(rcjournal, assertlog, NvU32, (pPossibleNULLGpu ? pPossibleNULLGpu->gpuId : 0), NvU8, type, NvU32, level, NvU64, key);
452 
453     // create a local instance of the Assert record.
454     portMemSet(&newAssertRec, 0x00, sizeof(newAssertRec));
455     rcdbSetCommonJournalRecord(pPossibleNULLGpu, &newAssertRec.common);
456     newAssertRec.count = 1;
457     newAssertRec.breakpointAddrHint = key;
458     newAssertRec.lineNum = lineNum;
459 
460     if (pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_COMPRESS))
461     {
462         // search for a pre-existing assert record with the same stack
463         for (i = 0; i < pAssertList->Count; ++i)
464         {
465             pAssertRec = pAssertList->ppList[i];
466             if ((newAssertRec.breakpointAddrHint == pAssertRec->breakpointAddrHint) &&
467                 (0 == portMemCmp(newAssertRec.callStack, pAssertRec->callStack,
468                     sizeof(newAssertRec.callStack[0]) * pAssertList->QualifyingStackSize)))
469             {
470                 pAssertRec->count++;
471                 pAssertRec->lastTimeStamp = newAssertRec.common.timeStamp;
472 
473                 rmStatus = NV_OK;
474                 break;
475             }
476         }
477     }
478 
479     if (rmStatus != NV_OK)
480     {
481         // Discard to avoid reentry from messing up record array.
482         if (portAtomicIncrementS32(&assertListRecursion) == 1)
483         {
484             rmStatus = rcdbAllocNextJournalRec(pRcDB, (NVCD_RECORD **)&pAssertRec, jGroup, type, size);
485             if (NV_OK == rmStatus)
486             {
487                 // the Header is filled in when the record is allocated, so update the local instance header.
488                 newAssertRec.common.Header = pAssertRec->common.Header;
489                 *pAssertRec = newAssertRec;
490                 if (pAssertList->Count < pAssertList->Size)
491                 {
492                     pAssertList->ppList[pAssertList->Count] = pAssertRec;
493                     ++(pAssertList->Count);
494                 }
495                 else
496                 {
497                     // based on the way the assert list size is calculated this should never happen....
498                     NV_PRINTF(LEVEL_ERROR,
499                               "failed to insert tracking for assert record\n");
500                 }
501             }
502         }
503         portAtomicDecrementS32(&assertListRecursion);
504     }
505 
506     if (rmStatus == NV_OK)
507     {
508         RMTRACE_RMJOURNAL(_ASSERTLOG, (pPossibleNULLGpu ? pPossibleNULLGpu->gpuId : RMTRACE_UNKNOWN_GPUID),
509                                       type,
510                                       jGroup,
511                                       key,
512                                       pAssertRec->count,
513                                       pAssertRec->common.timeStamp,
514                                       pAssertRec->lastTimeStamp);
515         *ppRec = pAssertRec;
516 
517         _rcdbNocatReportAssert(pPossibleNULLGpu, pAssertRec);
518     }
519     else
520     {
521         _rcdbNocatReportAssert(pPossibleNULLGpu, &newAssertRec);
522     }
523 
524     return rmStatus;
525 }
526 
527 NV_STATUS rcdbAddAssertJournalRec(void *pVoidGpu, void** ppRec, NvU8 jGroup, NvU8 type, NvU16 size, NvU32 level, NvU64 key)
528 {
529     return rcdbAddAssertJournalRecWithLine(pVoidGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, ppRec, jGroup, type, size, level, key);
530 }
531 // Populate stateMask with flags that represent the power state and other useful things.
532 static NvU64 _getCommonJournalStateMask(OBJGPU *pGpu)
533 {
534     NvU64 stateMask = REF_NUM(NV_RM_JOURNAL_STATE_MASK_GC6_STATE,
535         pGpu->gc6State.currentState);
536 
537     if (!gpuIsGpuFullPower(pGpu))
538         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_NOT_FULL_POWER;
539 
540     if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_CONNECTED))
541         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_NOT_CONNECTED;
542 
543     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_STANDBY))
544         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_STANDBY;
545 
546     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_HIBERNATE))
547         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_HIBERNATE;
548 
549     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH))
550         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_PM_CODEPATH;
551 
552     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET))
553         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_GC6_RESET;
554 
555     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET))
556         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_FULLCHIP_RESET;
557 
558     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET))
559         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_SEC_BUS_RESET;
560 
561     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_TIMEOUT_RECOVERY))
562         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_TIMEOUT_RECOVERY;
563 
564     if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
565         stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_LOST;
566 
567     return stateMask;
568 }
569 
570 // Fill in the common portion of the journal structure.
571 void
572 rcdbSetCommonJournalRecord
573 (
574     OBJGPU *pGpu,
575     RmRCCommonJournal_RECORD *pRec
576 )
577 {
578     OS_THREAD_HANDLE threadId;
579 
580     pRec->timeStamp = osGetTimestamp();
581     pRec->GPUTag    = 0;
582     pRec->CPUTag    = 0;
583     pRec->stateMask = 0;
584 
585     if (pGpu)
586     {
587         pRec->GPUTag    = pGpu->gpuId;
588         pRec->stateMask = _getCommonJournalStateMask(pGpu);
589     }
590 
591     if (NV_OK == osGetCurrentThread(&threadId))
592     {
593         pRec->CPUTag = (NvU64)threadId;
594     }
595 }
596 
597 NV_STATUS
598 rcdbAddBugCheckRec_IMPL
599 (
600     OBJGPU  *pGpu,
601     Journal *pRcDB,
602     NvU32    bugCheckCode
603 )
604 {
605     RmJournalBugcheck_RECORD *pRec;
606     NV_STATUS                 rmStatus;
607 
608     rmStatus = rcdbAllocNextJournalRec(pRcDB,
609                                        (NVCD_RECORD **)&pRec,
610                                        RmGroup,
611                                        RmJournalBugCheck,
612                                        sizeof(*pRec));
613     if (NV_OK == rmStatus)
614     {
615         rcdbSetCommonJournalRecord(pGpu, &pRec->common);
616         pRec->bugCheckCode = bugCheckCode;
617     }
618 
619      pRcDB->BugcheckCount++;
620 
621     return rmStatus;
622 }
623 
624 NV_STATUS
625 rcdbAddPowerStateRec_IMPL
626 (
627     OBJGPU  *pGpu,
628     Journal *pRcDB,
629     NvU32    powerEvent,
630     NvU32    state,
631     NvU32    fastBootPowerState
632 )
633 {
634     RmPowerState_RECORD       newRmDiagWrapBuffRec;
635 
636     // Create Records, then write it.
637     newRmDiagWrapBuffRec.powerState = state;
638     newRmDiagWrapBuffRec.powerEvent = powerEvent;
639     newRmDiagWrapBuffRec.fastBootPowerState = fastBootPowerState;
640     rcdbAddRecToRingBuffer(pGpu, pRcDB, RmPowerState,
641                               sizeof(RmPowerState_RECORD), (NvU8 *)&newRmDiagWrapBuffRec);
642     return NV_OK;
643 }
644 
645 NV_STATUS
646 rcdbGetRcDiagRecBoundaries_IMPL
647 (
648     Journal  *pRcDB,
649     NvU16    *pStart,
650     NvU16    *pEnd,
651     NvU32     owner,
652     NvU32     processId
653 )
654 {
655     NV_STATUS                   status = NV_ERR_MISSING_TABLE_ENTRY;
656     RmRCCommonJournal_RECORD   *pCommon;
657     RmRcDiag_RECORD            *pRecord = NULL;
658     RING_BUFFER_LOG            *pRingBuffer = NULL;
659     NvU32                       i;
660     NvU16                       logicalStartIdx;
661     NvU16                       start = 0;
662     NvU16                       end = 0;
663     NvBool                      foundStart = NV_FALSE;
664     NvBool                      foundEnd = NV_FALSE;
665 
666     // scan the buffer to find all the qualified records & return the
667     // first & last indicies of the qualified records found.
668 
669     // Get the Diag Report Ring buffer.
670     rcdbFindRingBufferForType(pRcDB, RmRcDiagReport, &pRingBuffer);
671 
672     // attempt to claim ownership
673     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
674     {
675         // get the logical start of the buffer.
676         logicalStartIdx = pRingBuffer->headIndex;
677 
678         // run thru all the entries in the buffer, start to end, until we find the start & end of the range we are looking for.
679         for (i = 0; i < pRingBuffer->numEntries; ++i)
680         {
681             // get a pointer to the record from the buffer.
682             pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport) * ((logicalStartIdx + i) % pRingBuffer->maxEntries)));
683             pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);
684 
685             // check to see if the record qualifies
686             if (((RCDB_RCDIAG_DEFAULT_OWNER != owner) && (pRecord->owner != owner) && (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID != owner))
687                 || ((NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID != processId) && (pRecord->processId != processId)))
688             {
689                 continue;
690             }
691             switch (foundStart)
692             {
693             case NV_FALSE:
694                 // check if this is a start record.
695                 // we want the first record to be a start record to insure that all the reports that are in the range are complete
696                 // (I.E. we didn't wrap over the first record of a report)
697                 if (0 != (pRecord->flags & NV0000_CTRL_CMD_NVD_RCERR_RPT_FLAGS_POS_FIRST))
698                 {
699                     // yes save the idx as the first Idx, & note that we found the start of the range.
700                     start = pRecord->idx;
701                     foundStart = NV_TRUE;
702                 }
703                 // fall thru to check if the start of the report is also the end of the report.
704 
705             case NV_TRUE:
706                 // check if this is an end record.
707                 // we want the last record in the range to be an end record to insure that all the reports that are in the range are complete
708                 // (Note -- in the case of end records, this should only be an issue if we are interrupting the collection of a report)
709                 if (0 != (pRecord->flags & NV0000_CTRL_CMD_NVD_RCERR_RPT_FLAGS_POS_LAST))
710                 {
711                     // save the idx as the last idx & continue scanning until we have checked all the records.
712                     // the last idx saved will be the last idx.
713                     end = pRecord->idx;
714                     foundEnd = foundStart;
715                 }
716                 break;
717             }
718         }
719         // checking end is sufficient, because end can't be set w/o start being set first.
720         if (foundEnd)
721         {
722             // we found a complete range, mark us as succeeding.
723             status = NV_OK;
724 
725             // pass up the results.
726             if (NULL != pEnd)
727             {
728                 *pEnd = end;
729             }
730             if (NULL != pStart)
731             {
732                 *pStart = start;
733             }
734         }
735     }
736     else
737     {
738         // the buffer is currently busy.
739         status = NV_ERR_BUSY_RETRY;
740     }
741     portAtomicDecrementS32(&concurrentRingBufferAccess);
742     return status;
743 }
744 
745 RmRCCommonJournal_RECORD *
746 rcdbAddRcDiagRec_IMPL
747 (
748     OBJGPU  *pGpu,
749     Journal *pRcDB,
750     RmRcDiag_RECORD       *pRmDiagWrapBuffRec
751 )
752 {
753     RmRCCommonJournal_RECORD *pCommon;
754     NvU32   usec;
755 
756     // Create Records, then write it.
757     pRmDiagWrapBuffRec->idx = (pRcDB->RcErrRptNextIdx)++;
758     if (MAX_RCDB_RCDIAG_ENTRIES < pRmDiagWrapBuffRec->count)
759     {
760         NV_ASSERT_FAILED("Diag report to large for buffer");
761         pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].offset = 0;
762         pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].tag = NV0000_CTRL_CMD_NVD_RCERR_RPT_REG_OVERFLOWED;
763         pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].value = pRmDiagWrapBuffRec->count - MAX_RCDB_RCDIAG_ENTRIES + 1;
764         pRmDiagWrapBuffRec->count = MAX_RCDB_RCDIAG_ENTRIES;
765     }
766     osGetCurrentTime(&(pRmDiagWrapBuffRec->timeStamp), &usec);
767 
768     pCommon = rcdbAddRecToRingBuffer(pGpu, pRcDB, RmRcDiagReport,
769                                      sizeof(RmRcDiag_RECORD), (NvU8 *)pRmDiagWrapBuffRec);
770 
771     pRcDB->RcErrRptRecordsDropped |= pRcDB->RcErrRptNextIdx >= MAX_RCDB_RCDIAG_WRAP_BUFF;
772     return pCommon;
773 }
774 
775 RmRCCommonJournal_RECORD *
776 rcdbAddRcDiagRecFromGsp_IMPL
777 (
778     OBJGPU  *pGpu,
779     Journal *pRcDB,
780     RmRCCommonJournal_RECORD   *pCommonGsp,
781     RmRcDiag_RECORD            *pRmDiagGsp
782 )
783 {
784     RmRCCommonJournal_RECORD   *pCommonCpu;
785 
786     pCommonCpu = rcdbAddRcDiagRec(pGpu, pRcDB, pRmDiagGsp);
787     if (pCommonCpu)
788     {
789         NV_ASSERT(pCommonCpu->GPUTag == pCommonGsp->GPUTag);
790         pCommonCpu->stateMask |= pCommonGsp->stateMask;
791     }
792 
793     return pCommonCpu;
794 }
795 
796 NV_STATUS
797 _rcdbInternalGetRcDiagRec
798 (
799     Journal                    *pRcDB,
800     NvU16                       reqIdx,
801     RmRCCommonJournal_RECORD  **ppRmDiagWrapBuffRec,
802     NvU32                       owner,
803     NvU32                       processId
804 )
805 {
806     RmRCCommonJournal_RECORD   *pCommon;
807     RmRcDiag_RECORD*            pRecord = NULL;
808     NV_STATUS                   status = NV_ERR_INVALID_INDEX;
809     RING_BUFFER_LOG            *pRingBuffer = NULL;
810 
811     NvU32                       i;
812 
813     // assume we will fail.
814     *ppRmDiagWrapBuffRec = NULL;
815 
816     // Find the ring buffer for the diag reports
817     rcdbFindRingBufferForType(pRcDB, RmRcDiagReport, &pRingBuffer);
818 
819     // is the requested record in the buffer?
820     if ((NvU16)(pRcDB->RcErrRptNextIdx - reqIdx) <= pRingBuffer->numEntries)
821     {
822         // calculate the location of the record.
823         // find the record just past the last record in the buffer. to use as the initial offset.
824         i = pRingBuffer->headIndex + pRingBuffer->numEntries;
825 
826         // subtract off the diff between the next idx to be used & the requested idx.
827         i -= pRcDB->RcErrRptNextIdx - reqIdx;
828 
829         // wrap the offset to the size of the buffer.
830         i %= pRingBuffer->maxEntries;
831 
832         // get a pointer to the record from the buffer.
833         pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport) * i));
834         pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);
835 
836         // verify we have the record that was requested.
837         NV_ASSERT_OR_RETURN(pRecord->idx == reqIdx, NV_ERR_INVALID_INDEX);
838 
839         // we found the requested Index,
840         // check to see if the record qualifies
841         if (((RCDB_RCDIAG_DEFAULT_OWNER == owner) || (pRecord->owner == owner) || (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID == owner))
842             && ((NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID == processId) || (pRecord->processId == processId)))
843         {
844             // combination of ANY_OWNER_ID && ANY_PROCESS_ID is not valid
845             if (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID == owner && NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID == processId)
846             {
847                 status = NV_ERR_INSUFFICIENT_PERMISSIONS;
848                 goto exit;
849             }
850             // we found a record that fully qualifies
851             *ppRmDiagWrapBuffRec = pCommon;
852             status = NV_OK;
853         }
854         else
855         {
856             // we found the record, but it does not pass the filter.
857             status = NV_ERR_INSUFFICIENT_PERMISSIONS;
858         }
859     }
860 exit:
861     return status;
862 }
863 
864 NV_STATUS
865 rcdbGetRcDiagRec_IMPL
866 (
867     Journal                    *pRcDB,
868     NvU16                       reqIdx,
869     RmRCCommonJournal_RECORD  **ppRmDiagWrapBuffRec,
870     NvU32                       owner,
871     NvU32                       processId
872 )
873 {
874     NV_STATUS                   status;
875 
876     if (ppRmDiagWrapBuffRec == NULL)
877     {
878         return NV_ERR_INVALID_ARGUMENT;
879     }
880 
881     *ppRmDiagWrapBuffRec = NULL;
882 
883     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
884     {
885         status = _rcdbInternalGetRcDiagRec(pRcDB, reqIdx, ppRmDiagWrapBuffRec, owner, processId);
886     }
887     else
888     {
889         status = NV_ERR_BUSY_RETRY;
890     }
891     portAtomicDecrementS32(&concurrentRingBufferAccess);
892     return status;
893 }
894 
895 //
896 //  The function to set context data for all the RmRcDiag_RECORDs in a specified range
897 //
898 NV_STATUS
899 rcdbUpdateRcDiagRecContext_IMPL
900 (
901     Journal                    *pRcDB,
902     NvU16                       rangeStartIdx,
903     NvU16                       rangeEndIdx,
904     NvU32                       processId,
905     NvU32                       owner
906 )
907 {
908     RmRCCommonJournal_RECORD   *pCommon = NULL;
909     RmRcDiag_RECORD*            pRecord = NULL;
910     NV_STATUS                   status = NV_OK;
911     NV_STATUS                   recStatus = NV_ERR_OUT_OF_RANGE;
912 
913     NvU16                       i;
914 
915     // go from the start index thru the end index.
916     // note we use != because the indicies will wrap.
917     for (i = rangeStartIdx; i != (NvU16)(rangeEndIdx + 1U); i++)
918     {
919         recStatus = rcdbGetRcDiagRec(pRcDB, i, &pCommon, RCDB_RCDIAG_DEFAULT_OWNER, NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID);
920         if (NV_OK != recStatus)
921         {
922             // something went wrong,
923             // record the status & skip this record.
924             status = recStatus;
925             continue;
926         }
927         // get the pointer to the diag record.
928         pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);
929 
930         pRecord->owner = owner;
931         pRecord->processId = processId;
932     }
933     return status;
934 }
935 
936 //
937 // size must include NVCD_RECORD size too
938 //
939 NV_STATUS rcdbAllocNextJournalRec_IMPL(Journal *pRcDB, NVCD_RECORD** ppRec, NvU8 jGroup, NvU8 type, NvU16 size)
940 {
941     EVENT_JOURNAL *pJournal = &pRcDB->Journal;
942 
943     if ( ppRec == NULL )
944         return NV_ERR_GENERIC;
945 
946     if ( pJournal->pBuffer == NULL || pJournal->BufferSize == 0 )
947         return NV_ERR_GENERIC;
948 
949     if ( size == 0 || pJournal->BufferRemaining < size )
950     {
951         return NV_ERR_GENERIC;
952     }
953 
954     *ppRec = (NVCD_RECORD*)(pJournal->pFree);
955 
956     (*ppRec)->cRecordGroup = jGroup;
957     (*ppRec)->cRecordType = type;
958     (*ppRec)->wRecordSize = size;
959 
960     if ( pJournal->pCurrCollection )
961     {
962         pJournal->pCurrCollection->NumRecords++;
963         pJournal->pCurrCollection->Header.wRecordSize += size;
964     }
965     else
966     {
967         // standalone record (not part of collection) - increase total count
968         pJournal->RecordCount++;
969     }
970 
971     pJournal->pFree += size;
972     pJournal->BufferRemaining -= size;
973 
974     return NV_OK;
975 }
976 
977 NV_STATUS rcdbClearErrorHistory_IMPL(Journal *pRcDB)
978 {
979     SYS_ERROR_INFO         *pSysErrorInfo = &pRcDB->ErrorInfo;
980     RMFIFOERRORELEMENT_V3* pFifoErrorInfo;
981     RMFIFOERRORELEMENT_V3* pFreeErrorInfo;
982 
983     // Wait until any errors currently being reported are complete
984     while (!portAtomicCompareAndSwapU32(&pSysErrorInfo->InUse, 1, 0))
985     {
986         // We're not going to sleep, but safe to sleep also means safe to spin..
987         NV_ASSERT_OR_RETURN(portSyncExSafeToSleep(), NV_ERR_INVALID_STATE);
988         portUtilSpin();
989     }
990 
991     pFifoErrorInfo = (RMFIFOERRORELEMENT_V3*) pSysErrorInfo->pErrorList;
992     while (NULL != pFifoErrorInfo)
993     {
994         pFreeErrorInfo = pFifoErrorInfo;
995         pFifoErrorInfo = pFifoErrorInfo->ErrorHeader.pNextError;
996         rcdbDeleteErrorElement(pRcDB, pFreeErrorInfo);
997     }
998 
999     pSysErrorInfo->ErrorCount = 0x0;
1000     pSysErrorInfo->LogCount = 0x0;
1001     pSysErrorInfo->pErrorList = NULL;
1002 
1003     portAtomicSetU32(&pSysErrorInfo->InUse, 0);
1004     return NV_OK;
1005 }
1006 
1007 
1008 NV_STATUS rcdbDeleteErrorElement_IMPL(Journal *pRcDB, void *pDelete)
1009 {
1010     RMFIFOERRORELEMENT_V3* pFifoDelete = (RMFIFOERRORELEMENT_V3*)pDelete;
1011     RMCD_ERROR_BLOCK*              pErrorBlock;
1012     RMCD_ERROR_BLOCK*              pOldErrorBlock;
1013 
1014     // Free Additional Error Block
1015     for (pErrorBlock = pFifoDelete->ErrorHeader.pErrorBlock; pErrorBlock != NULL;)
1016     {
1017         pOldErrorBlock = pErrorBlock;
1018         pErrorBlock = pErrorBlock->pNext;
1019         portMemFree(pOldErrorBlock->pBlock);
1020         portMemFree(pOldErrorBlock);
1021     }
1022 
1023     // Free Error Collector
1024     portMemFree(pFifoDelete);
1025 
1026     return NV_OK;
1027 }
1028 
1029 // Frees up the all the ring buffers
1030 void rcdbDestroyRingBufferCollection_IMPL(Journal *pRcDB)
1031 {
1032     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
1033     NvU32 i;
1034     RING_BUFFER_LOG* pCurrentBuffer = pRingBufferColl->pFirstEntry;
1035 
1036     for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
1037     {
1038         RING_BUFFER_LOG* pTempCurrentBuffer = pCurrentBuffer;
1039 
1040         NV_ASSERT(pCurrentBuffer != NULL);
1041         NV_ASSERT(pCurrentBuffer->pBuffer != NULL);
1042 
1043         portMemFree(pCurrentBuffer->pBuffer);
1044 
1045         pCurrentBuffer = pCurrentBuffer->pNextRingBuffer;
1046 
1047         // Free the current ring buffer entry.
1048         portMemFree(pTempCurrentBuffer);
1049     }
1050 
1051     // pCurrentBuffer should be NULL if our accounting of NumEntries is correct
1052     NV_ASSERT(pCurrentBuffer == NULL);
1053 
1054     portMemSet(pRingBufferColl, 0x00, sizeof(*pRingBufferColl));
1055 }
1056 
1057 
// Forward declarations of file-local (static) helpers that are used further down in this file.
static NvU32 _rcdbInsertJournalRecordToList (RmRCCommonJournal_RECORD *pList, RmRCCommonJournal_RECORD *pRecord);
static void _rcdbDumpCommonJournalRecord(PRB_ENCODER *pPrbEnc,const PRB_FIELD_DESC *pFieldDesc,PRmRCCommonJournal_RECORD pRec);
1060 
1061 /*!
1062  * @brief Initialize the GPU accessible flag
1063  *
1064  * @param[in] pGPU
1065  * @param[in] pRcDB
1066  *
1067  * @return NV_OK
1068  */
1069 NV_STATUS
1070 rcdbDumpInitGpuAccessibleFlag_IMPL
1071 (
1072     OBJGPU  *pGpu,
1073     Journal *pRcDB
1074 )
1075 {
1076     pRcDB->nvDumpState.bGpuAccessible =
1077         pRcDB->nvDumpState.bRMLock                                    &&
1078         !pGpu->bIsSOC                                                 &&
1079         !IS_VIRTUAL(pGpu)                                             &&
1080         gpuIsGpuFullPower(pGpu)                                       &&
1081         !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET)      &&
1082         !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET) &&
1083         !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET)           &&
1084         !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH)         &&
1085         !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST);
1086 
1087     // The GPU should be there... but make sure.
1088     if (pRcDB->nvDumpState.bGpuAccessible)
1089     {
1090         if (GPU_REG_RD32(pGpu, NV_PMC_BOOT_0) != pGpu->chipId0)
1091         {
1092             pRcDB->nvDumpState.bGpuAccessible = NV_FALSE;
1093         }
1094     }
1095 
1096     return NV_OK;
1097 }
1098 
1099 /*!
1100  * @brief Performs a dump of the specified system component into the given buffer.
1101  *
1102  * @param[in] pSys The system object
1103  * @param[in] component NVDUMP_IS_SYS_COMPONENT(component) must be true.
1104  * @param[in, out] pBuffer Buffer to populate with dump results
1105  * @param[in] policy Policy for buffer allocation: use this one, allocate one or count
1106  * @param[in, out] pBufferCallback Callback function for use with fixed-sized buffer encoding.
1107  *                                 If this is NULL then pBuffer->size is assumed to be large
1108  *                                 enough for the whole dump. Otherwise pBufferCallback is called
1109  *                                 when the buffer is full or when a message ends, allowing the
1110  *                                 the callback to construct the whole buffer piece by piece.
1111  *
1112  * @return NV_OK on success and specific error status on failure
1113  */
1114 NV_STATUS
1115 rcdbDumpComponent_IMPL
1116 (
1117     OBJRCDB *pRcDB,
1118     NvU32 component,
1119     NVDUMP_BUFFER *pBuffer,
1120     NVDUMP_BUFFER_POLICY policy,
1121     PrbBufferCallback *pBufferCallback
1122 )
1123 {
1124     NVD_STATE *pNvDumpState = &pRcDB->nvDumpState;
1125     void *pBuff;
1126     PRB_ENCODER encoder;
1127     NV_STATUS status = NV_OK;
1128     NvU8 startingDepth;
1129 
1130     // Validate arguments.
1131     NV_ASSERT_OR_RETURN(pBuffer != NULL, NV_ERR_INVALID_ARGUMENT);
1132 
1133     // Make sure we were not reentered.
1134     if (pNvDumpState->bDumpInProcess)
1135         return NV_ERR_STATE_IN_USE;
1136 
1137     // Initialize dump state.
1138     pNvDumpState->bDumpInProcess    = NV_TRUE;
1139     pNvDumpState->bugCheckCode      = 0;
1140     pNvDumpState->internalCode      = NVD_ERROR_CODE(NVD_EXTERNALLY_GENERATED, 0);
1141     pNvDumpState->bRMLock           = rmapiLockIsOwner();
1142     pNvDumpState->bGpuAccessible    = NV_FALSE;
1143     pNvDumpState->initialbufferSize = pBuffer->size;
1144     pNvDumpState->nvDumpType        = NVD_DUMP_TYPE_API;
1145 
1146     // Clear dump buffer.
1147     pBuffer->curNumBytes = 0;
1148 
1149     // Start encoding protobuf dump message.
1150     switch (policy)
1151     {
1152         case NVDUMP_BUFFER_PROVIDED:
1153             prbEncStart(&encoder, NVDEBUG_NVDUMP, NvP64_VALUE(pBuffer->address),
1154                         pBuffer->size, pBufferCallback);
1155             break;
1156         case NVDUMP_BUFFER_ALLOCATE:
1157             NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
1158                 prbEncStartAlloc(&encoder, NVDEBUG_NVDUMP,
1159                                  pBuffer->size, pBufferCallback));
1160             break;
1161         case NVDUMP_BUFFER_COUNT:
1162             prbEncStartCount(&encoder, NVDEBUG_NVDUMP, NVDUMP_MAX_DUMP_SIZE);
1163             break;
1164         default:
1165             return NV_ERR_INVALID_ARGUMENT;
1166     }
1167 
1168     startingDepth = prbEncNestingLevel(&encoder);
1169 
1170     switch (component)
1171     {
1172         case NVDUMP_COMPONENT_SYS_RCDB:
1173         {
1174             NV_CHECK_OK(status, LEVEL_ERROR,
1175                 rcdbDumpSystemFunc(pRcDB, &encoder, pNvDumpState));
1176             break;
1177         }
1178         case NVDUMP_COMPONENT_SYS_SYSINFO:
1179         {
1180             NV_CHECK_OK(status, LEVEL_ERROR,
1181                 rcdbDumpSystemInfo(pRcDB, &encoder, pNvDumpState));
1182             break;
1183         }
1184         case NVDUMP_COMPONENT_SYS_ALL:
1185         {
1186             NV_CHECK_OK(status, LEVEL_ERROR,
1187                 rcdbDumpSystemInfo(pRcDB, &encoder, pNvDumpState));
1188             NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
1189                 rcdbDumpSystemFunc(pRcDB, &encoder, pNvDumpState));
1190             break;
1191         }
1192         default:
1193         {
1194             NV_PRINTF(LEVEL_ERROR,
1195                       "called with invalid component %u selected.\n",
1196                       component);
1197             status = NV_ERR_INVALID_ARGUMENT;
1198             break;
1199         }
1200     }
1201 
1202     NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
1203         prbEncUnwindNesting(&encoder, startingDepth));
1204 
1205     {
1206         NvU32   gpu;
1207         OBJGPU *pGpu;
1208 
1209         for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++)
1210         {
1211             pGpu = gpumgrGetGpu(gpu);
1212 
1213             if ((pGpu != NULL) && IS_GSP_CLIENT(pGpu))
1214             {
1215                 NV_RM_RPC_DUMP_PROTOBUF_COMPONENT(pGpu, status, &encoder,
1216                     pNvDumpState, component);
1217 
1218                 NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
1219                     prbEncUnwindNesting(&encoder, startingDepth));
1220             }
1221         }
1222     }
1223 
1224     // Finish encoding protobuf dump message.
1225     pBuffer->curNumBytes = prbEncFinish(&encoder, &pBuff);
1226     pBuffer->address = NV_SIGN_EXT_PTR_TO_NvP64(pBuff);
1227     pNvDumpState->bDumpInProcess = NV_FALSE;
1228 
1229     return status;
1230 }
1231 
1232 static NV_STATUS
1233 _rcdbGetTimeInfo
1234 (
1235     PRB_ENCODER          *pPrbEnc,
1236     NVD_STATE            *pNvDumpState,
1237     const PRB_FIELD_DESC *pFieldDesc
1238 )
1239 {
1240     NvU64 timeSinceBoot;
1241     NvU32 sec;
1242     NvU32 usec;
1243     NV_STATUS nvStatus = NV_OK;
1244     NvU8 startingDepth = prbEncNestingLevel(pPrbEnc);
1245 
1246     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
1247         prbEncNestedStart(pPrbEnc, pFieldDesc));
1248 
1249     prbEncAddUInt64(pPrbEnc,
1250                     NVDEBUG_SYSTEMINFO_TIMEINFO_TIMESTAMP_FREQ,
1251                     osGetTimestampFreq());
1252 
1253     // Add Timestamp
1254     prbEncAddUInt64(pPrbEnc,
1255                     NVDEBUG_SYSTEMINFO_TIMEINFO_TIMESTAMP_DUMP,
1256                     osGetTimestamp());
1257     osGetCurrentTime(&sec, &usec);
1258     prbEncAddUInt64(pPrbEnc,
1259                     NVDEBUG_SYSTEMINFO_TIMEINFO_SYSTEM_TIME_DUMP,
1260                     (NvU64)sec * 1000000 + usec);
1261 
1262     // Add time since boot in seconds.
1263     osGetCurrentTick(&timeSinceBoot);
1264     prbEncAddUInt32(pPrbEnc,
1265                     NVDEBUG_SYSTEMINFO_TIMEINFO_TIME_SINCE_BOOT_SEC,
1266                     (NvU32)(timeSinceBoot / 1000000000ULL));
1267 
1268     // Unwind the protobuf to the correct depth.
1269     NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1270         prbEncUnwindNesting(pPrbEnc, startingDepth));
1271 
1272     return nvStatus;
1273 }
1274 
// Placeholder string encoded in the GPU UUID field when no UUID is available for a GPU.
static const char * GPU_NA_UUID = "N/A";
1276 
/*!
 * @brief Encode the NVDEBUG_NVDUMP_SYSTEM_INFO message into the protobuf encoder.
 *
 * The message contains, in this order: time info, bugcheck count, northbridge
 * info, CPU info, GPU info (one entry per GPU known to gpumgr plus the GPU
 * count), OS info, driver info, one GPU config entry per GPU group, and the
 * error state of the current dump.
 *
 * Nothing is encoded and NV_OK is returned when the major field of
 * pNvDumpState->internalCode is NVD_GPU_GENERATED or NVD_SKIP_ZERO.
 *
 * Any failure while opening or closing a nested message jumps to
 * External_Cleanup, where the encoder is unwound to the nesting depth it had
 * on entry.
 *
 * @param[in] pRcDB         Journal supplying bugcheck count and driver load history
 * @param[in] pPrbEnc       Protobuf encoder to write into
 * @param[in] pNvDumpState  State of the dump in progress
 *
 * @return NV_OK on success, otherwise the status of the failing step
 */
NV_STATUS
rcdbDumpSystemInfo_IMPL
(
    OBJRCDB *pRcDB,
    PRB_ENCODER *pPrbEnc,
    NVD_STATE   *pNvDumpState
)
{
    OBJGPU     *pGpu;
    NvU8       *pGidString;
    NvU32       gpu;
    NvU32       numGpus;
    NvU32       gidStrlen;
    NvU32       sizeStr;
    NV_STATUS   nvStatus = NV_OK;
    NvBool      bRelease;
    // Depth on entry; External_Cleanup unwinds back to it.
    NvU8        startingDepth = prbEncNestingLevel(pPrbEnc);

    OBJSYS     *pSys = SYS_GET_INSTANCE();
    OBJCL      *pCl = SYS_GET_CL(pSys);
    OBJGPU     *pParent;
    NvU32       gpuIndex;
    NvU32       gpuMask;
    // Per-device marker: NV_TRUE once the device was listed in a GPU config entry.
    NvBool      bGpuDone[NV_MAX_DEVICES];

    // All of this stuff should run OK even without the RM lock.
    // No need to check pRcDB->nvDumpState.bNoRMLock;

    switch (DRF_VAL(_NVD, _ERROR_CODE, _MAJOR, pNvDumpState->internalCode))
    {
    case NVD_GPU_GENERATED:
    case NVD_SKIP_ZERO:
        // don't report on these internal codes.
        return NV_OK;
        break;
    }

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_NVDUMP_SYSTEM_INFO));

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        _rcdbGetTimeInfo(pPrbEnc, pNvDumpState, NVDEBUG_SYSTEMINFO_TIME_INFO),
        External_Cleanup);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_BUGCHECK_COUNT,
                    pRcDB->BugcheckCount);

    // Add NorthBridge Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_NORTHBRIDGE_INFO),
        External_Cleanup);

    // ID packs the vendor id in the low 16 bits and the device id above it.
    prbEncAddUInt32(pPrbEnc,
        NVDEBUG_SYSTEMINFO_NORTHBRIDGEINFO_ID,
        pCl->FHBBusInfo.vendorID |
        (pCl->FHBBusInfo.deviceID << 16));

    // SSID uses the same packing for the subsystem vendor / device ids.
    prbEncAddUInt32(pPrbEnc,
        NVDEBUG_SYSTEMINFO_NORTHBRIDGEINFO_SSID,
        pCl->FHBBusInfo.subvendorID |
        (pCl->FHBBusInfo.subdeviceID << 16));

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_NORTHBRIDGE_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    //CPU Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_CPU_INFO),
        External_Cleanup);

    prbEncAddUInt32(pPrbEnc,
        NVDEBUG_SYSTEMINFO_CPUINFO_CPU_TYPE,
        pSys->cpuInfo.type);

    prbEncAddUInt32(pPrbEnc,
        NVDEBUG_SYSTEMINFO_CPUINFO_CPU_CAPS,
        pSys->cpuInfo.caps);

    prbEncAddUInt32(pPrbEnc,
        NVDEBUG_SYSTEMINFO_CPUINFO_NUM_CPU_CORES,
        pSys->cpuInfo.numPhysicalCpus);

    prbEncAddUInt32(pPrbEnc,
        NVDEBUG_SYSTEMINFO_CPUINFO_NUM_LOGICAL_CPUS,
        pSys->cpuInfo.numLogicalCpus);

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_CPU_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    //GPU Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_GPU_INFO),
        External_Cleanup);

    // Count the number of GPUs and List the gpuIds
    numGpus = 0;
    for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++)
    {
        // Request the GID in binary format, SHA1 type.
        const NvU32 gidFlags =
            DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _FORMAT, _BINARY) |
            DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _TYPE, _SHA1);

        pGpu = gpumgrGetGpu(gpu);

        if (pGpu)
        {
            numGpus++;

            prbEncAddUInt32(pPrbEnc,
                NVDEBUG_SYSTEMINFO_GPUINFO_GPU_ID,
                pGpu->gpuId);

            //
            // UUID source, in order of preference: the GID returned by
            // gpuGetGidInfo, the UUID cached in pGpu->gpuUuid, or the
            // GPU_NA_UUID placeholder. A failure here only selects a
            // fallback; the loop carries on.
            //
            nvStatus = gpuGetGidInfo(pGpu, &pGidString,
                &gidStrlen, gidFlags);
            if (NV_OK == nvStatus)
            {
                prbEncAddBytes(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
                    pGidString, gidStrlen);
                // The GID buffer is released here once it has been encoded.
                portMemFree(pGidString);
            }
            else if (pGpu->gpuUuid.isInitialized)
            {
                prbEncAddBytes(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
                    pGpu->gpuUuid.uuid, sizeof(pGpu->gpuUuid.uuid));
            }
            else
            {
                prbEncAddString(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
                    GPU_NA_UUID);
            }

            prbEncAddUInt32(pPrbEnc,
                NVDEBUG_SYSTEMINFO_GPUINFO_DEVICE_ID,
                pGpu->idInfo.PCIDeviceID);

            prbEncAddUInt32(pPrbEnc,
                NVDEBUG_SYSTEMINFO_GPUINFO_PMCBOOT0,
                pGpu->chipId0);

            prbEncAddUInt32(pPrbEnc,
                NVDEBUG_SYSTEMINFO_GPUINFO_SUBDEV_ID,
                pGpu->idInfo.PCISubDeviceID);
        }
    }

    prbEncAddUInt32(pPrbEnc,
        NVDEBUG_SYSTEMINFO_GPUINFO_NUM_GPUS,
        numGpus);

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_GPU_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    //OS Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_OS_INFO),
        External_Cleanup);

    // The OS layer encodes its own version fields into the open OS_INFO message.
    nvStatus = osGetVersionDump(pPrbEnc);
    if (nvStatus != NV_OK)
        goto External_Cleanup;

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_OS_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    // Driver Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_DRIVER_INFO),
        External_Cleanup);

    //
    // Compare the driver title against "RELEASE" over the smaller of the two
    // sizeof values, so neither operand is read past its end.
    // NOTE(review): assumes NV_DISPLAY_DRIVER_TITLE is a string literal, in
    // which case sizeof includes the terminator - confirm.
    //
    sizeStr = (sizeof("RELEASE") < sizeof(NV_DISPLAY_DRIVER_TITLE) ?
        sizeof("RELEASE") :
        sizeof(NV_DISPLAY_DRIVER_TITLE));

    if (portMemCmp(NV_DISPLAY_DRIVER_TITLE, "RELEASE", sizeStr) == 0)
        bRelease = NV_TRUE;
    else
        bRelease = NV_FALSE;

    prbEncAddBool(pPrbEnc,
        NVDEBUG_SYSTEMINFO_DRIVERINFO_IS_RELEASE,
        bRelease);

    prbEncAddString(pPrbEnc,
        NVDEBUG_SYSTEMINFO_DRIVERINFO_VERSION,
        NV_VERSION_STRING);

    prbEncAddString(pPrbEnc,
        NVDEBUG_SYSTEMINFO_DRIVERINFO_BRANCH,
        NV_BUILD_BRANCH_VERSION);

    prbEncAddUInt32(pPrbEnc,
        NVDEBUG_SYSTEMINFO_DRIVERINFO_CHANGELIST,
        NV_LAST_OFFICIAL_CHANGELIST_NUM);

    // Only write previous driver version if loaded more than once.
    if (pRcDB->driverLoadCount > 1)
    {
        if (pRcDB->previousDriverVersion != NULL)
        {
            prbEncAddString(pPrbEnc,
                NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_VERSION,
                pRcDB->previousDriverVersion);
        }

        if (pRcDB->previousDriverBranch != NULL)
        {
            prbEncAddString(pPrbEnc,
                NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_BRANCH,
                pRcDB->previousDriverBranch);
        }

        prbEncAddUInt32(pPrbEnc,
            NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_CHANGELIST,
            pRcDB->prevDriverChangelist);
    }

    prbEncAddUInt32(pPrbEnc,
        NVDEBUG_SYSTEMINFO_DRIVERINFO_LOAD_COUNT,
        pRcDB->driverLoadCount);

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_DRIVER_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    // Dump a table of
    // Master GPU -- gpuId
    // List all gpus involved by gpuIds
    portMemSet(bGpuDone, NV_FALSE, sizeof(bGpuDone));
    for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++)
    {
        pGpu = gpumgrGetGpu(gpu);

        // Skip devices that were already listed as part of an earlier group.
        if ((pGpu) && (bGpuDone[gpu] == NV_FALSE))
        {
            pParent = gpumgrGetParentGPU(pGpu);

            NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
                prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_GPU_CONFIG),
                External_Cleanup);

            prbEncAddUInt32(pPrbEnc, NVDEBUG_SYSTEMINFO_CONFIG_MASTER_ID, pParent->gpuId);
            // Walk every GPU in this GPU's mask; pGpu is reused as the iterator.
            gpuMask = gpumgrGetGpuMask(pGpu);
            gpuIndex = 0;
            pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
            while (pGpu)
            {
                prbEncAddUInt32(pPrbEnc, NVDEBUG_SYSTEMINFO_CONFIG_GPU_ID, pGpu->gpuId);

                // gpuIndex is either the next or the MAX
                // so gpuIndex - 1 is taken as the slot of the GPU just listed.
                bGpuDone[gpuIndex - 1] = NV_TRUE;
                pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
            }

            NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_GPU_CONFIG
                prbEncNestedEnd(pPrbEnc),
                External_Cleanup);
        }
    }

    // Error state
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_ERROR_STATE),
        External_Cleanup);

    prbEncAddUInt32(pPrbEnc,
        NVDEBUG_SYSTEMINFO_ERRORSTATE_BUGCHECK_CODE,
        pNvDumpState->bugCheckCode);

    prbEncAddBool(pPrbEnc,
        NVDEBUG_SYSTEMINFO_ERRORSTATE_GOT_RM_LOCK,
        pNvDumpState->bRMLock);

    prbEncAddUInt32(pPrbEnc,
        NVDEBUG_SYSTEMINFO_ERRORSTATE_DUMP_BUFFER_SIZE,
        pNvDumpState->initialbufferSize);

    //
    // prbEncNestedEnd for NVDEBUG_SYSTEMINFO_ERROR_STATE and
    // NVDEBUG_NVDUMP_SYSTEM_INFO are handled by prbEncUnwindNesting.
    //

External_Cleanup:
    // Unwind the protobuf to the correct depth.
    NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(nvStatus, LEVEL_ERROR,
        prbEncUnwindNesting(pPrbEnc, startingDepth));

    return nvStatus;
}
1573 
1574 //
1575 // Routine to dump RcDB Debug Info
1576 //
1577 NV_STATUS
1578 rcdbDumpSystemFunc_IMPL
1579 (
1580     OBJRCDB *pRcDB,
1581     PRB_ENCODER *pPrbEnc,
1582     NVD_STATE *pNvDumpState
1583 )
1584 {
1585     OBJGPU  *pGpu = gpumgrGetSomeGpu();
1586 
1587     switch (DRF_VAL(_NVD, _ERROR_CODE, _MAJOR, pNvDumpState->internalCode))
1588     {
1589     case NVD_GPU_GENERATED:
1590     case NVD_SKIP_ZERO:
1591         // don't report on these internal codes.
1592         return NV_OK;
1593         break;
1594     }
1595 
1596     rcdbDumpJournal(pRcDB, pGpu, pPrbEnc, pNvDumpState, NVDEBUG_NVDUMP_DCL_MSG);
1597     if (pGpu != NULL)
1598     {
1599         rcdbDumpErrorCounters(pRcDB, pGpu, pPrbEnc);
1600     }
1601     else
1602     {
1603         NV_PRINTF(LEVEL_WARNING,
1604                   "no GPU - won't dump ring buffers or journal\n");
1605     }
1606 
1607     return NV_OK;
1608 }
1609 
1610 static NvU32
1611 _rcdbInsertErrorHistoryToList(RmRCCommonJournal_RECORD   *pList, NVD_STATE *pNvDumpState)
1612 {
1613     OBJSYS                *pSys          = SYS_GET_INSTANCE();
1614     Journal               *pRcDB         = SYS_GET_RCDB(pSys);
1615     SYS_ERROR_INFO        *pSysErrorInfo = &pRcDB->ErrorInfo;
1616     RMPRBERRORELEMENT_V2*  pPrbErrorElement;
1617     RMCD_ERROR_BLOCK*      pErrorBlock;
1618     NV_STATUS              status = NV_OK;
1619 
1620     //
1621     // If we are called from the OCA dump, make sure we have the rm lock.
1622     // TO DO:  Try to dump as much as possible without the lock.
1623     //
1624     if (!pNvDumpState->bRMLock)
1625         return NV_OK;
1626 
1627     // Get Past Exceptions
1628     pPrbErrorElement = (RMPRBERRORELEMENT_V2*)pSysErrorInfo->pErrorList;
1629     while (NULL != pPrbErrorElement)
1630     {
1631         pErrorBlock = pPrbErrorElement->ErrorHeader.pErrorBlock;
1632         switch (pPrbErrorElement->RmPrbErrorData.common.Header.cRecordType)
1633         {
1634             case RmPrbErrorInfo_V2:
1635                 _rcdbInsertJournalRecordToList (pList, &(pPrbErrorElement->RmPrbErrorData.common));
1636                 break;
1637 
1638             case RmPrbFullDump_V2:
1639                 //
1640                 // Full crash dumps are a single NvDebug.NvDump message, and
1641                 // should be contained in a single block.
1642                 //
1643                 if (pErrorBlock != NULL)
1644                 {
1645                     if (pErrorBlock->pNext != NULL)
1646                     {
1647                         NV_PRINTF(LEVEL_WARNING,
1648                                   "only one error block expected!\n");
1649                     }
1650                     _rcdbInsertJournalRecordToList (pList, &(pPrbErrorElement->RmPrbErrorData.common));
1651                 }
1652                 break;
1653             default:
1654                 // Can only handle protobuf formatted messages
1655                 NV_PRINTF(LEVEL_ERROR, "unknown error element type: %d\n",
1656                           pPrbErrorElement->RmPrbErrorData.common.Header.cRecordType);
1657                 break;
1658         }
1659         pPrbErrorElement = (RMPRBERRORELEMENT_V2*)pPrbErrorElement->ErrorHeader.pNextError;
1660     }
1661     return status;
1662 }
1663 
1664 static void
1665 _rcdbDumpCommonJournalRecord
1666 (
1667     PRB_ENCODER               *pPrbEnc,
1668     const PRB_FIELD_DESC      *pFieldDesc,
1669     RmRCCommonJournal_RECORD  *pRec
1670 )
1671 {
1672     NV_STATUS nvStatus = NV_OK;
1673 
1674     NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1675         prbEncNestedStart(pPrbEnc, pFieldDesc));
1676 
1677     if (nvStatus == NV_OK)
1678     {
1679         if (pRec->timeStamp != 0)
1680             prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_TIME_STAMP, pRec->timeStamp);
1681         if (pRec->GPUTag != 0)
1682             prbEncAddUInt32(pPrbEnc, JOURNAL_COMMON_GPU_TAG,    pRec->GPUTag);
1683         if (pRec->CPUTag != 0)
1684             prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_CPU_TAG,    pRec->CPUTag);
1685         if (pRec->stateMask != 0)
1686             prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_STATE_MASK, pRec->stateMask);
1687         NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1688     }
1689 }
1690 
1691 static void
1692 rcdbDumpCommonAssertRecord
1693 (
1694     PRB_ENCODER              *pPrbEnc,
1695     NVD_STATE                *pNvDumpState,
1696     RmRCCommonAssert_RECORD  *pRec,
1697     NvU32                     type
1698 )
1699 {
1700     NvU32 i;
1701 
1702     prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_TYPE,                 type);
1703 
1704     if (pRec->lastTimeStamp != 0)
1705         prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_LAST_TIME_STAMP,  pRec->lastTimeStamp);
1706 
1707     prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_BREAKPOINT_ADDR_HINT, pRec->breakpointAddrHint);
1708 
1709     // if there is a line number, add it to the message.
1710     if (pRec->lineNum != NV_RM_ASSERT_UNKNOWN_LINE_NUM)
1711         prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_SOURCE_LINE, pRec->lineNum);
1712 
1713     if (pRec->count != 1)
1714         prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_COUNT,            pRec->count);
1715 
1716     for (i = 0; i < NV_ARRAY_ELEMENTS(pRec->callStack); i++)
1717     {
1718         if (pRec->callStack[i] == 0)
1719             break;
1720 
1721         prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_CALL_STACK, pRec->callStack[i]);
1722     }
1723 }
1724 
1725 static NV_STATUS
1726 _rcdbDumpDclMsgRecord(
1727     PRB_ENCODER *pPrbEnc,
1728     NVD_STATE *pNvDumpState,
1729     const PRB_FIELD_DESC *pFieldDesc,
1730     RmRCCommonJournal_RECORD *pDclRecord
1731     )
1732 {
1733     NV_STATUS nvStatus = NV_OK;
1734 
1735     NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
1736         prbEncNestedStart(pPrbEnc, pFieldDesc));
1737 
1738     _rcdbDumpCommonJournalRecord(pPrbEnc, DCL_DCLMSG_COMMON, pDclRecord);
1739 
1740     switch (pDclRecord->Header.cRecordType)
1741     {
1742         case RmRC2SwDbgBreakpoint_V3:
1743         case RmRC2SwRmAssert_V3:
1744         {
1745             RmRC2SwRmAssert3_RECORD* pRecord = (RmRC2SwRmAssert3_RECORD*)pDclRecord;
1746 
1747             NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1748                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_ASSERT));
1749             if (nvStatus == NV_OK)
1750             {
1751                 rcdbDumpCommonAssertRecord(pPrbEnc, pNvDumpState,
1752                     &pRecord->commonAssert, pDclRecord->Header.cRecordType);
1753 
1754                 prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_LEVEL, pRecord->level);
1755                 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1756             }
1757             break;
1758         }
1759         case RmRC2GpuTimeout_V3:
1760         {
1761             RmRC2GpuTimeout3_RECORD* pRecord = (RmRC2GpuTimeout3_RECORD*)pDclRecord;
1762 
1763             NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1764                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_ASSERT));
1765             if (nvStatus == NV_OK)
1766             {
1767                 rcdbDumpCommonAssertRecord(pPrbEnc, pNvDumpState, pRecord, pDclRecord->Header.cRecordType);
1768                 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1769             }
1770             break;
1771         }
1772         case RmBadRead_V2:
1773         {
1774             RmRC2BadRead2_RECORD* pRecord = (RmRC2BadRead2_RECORD*)pDclRecord;
1775 
1776             NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1777                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_BADREAD));
1778             if (nvStatus == NV_OK)
1779             {
1780                 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_MEMORY_SPACE, pRecord->MemorySpace);
1781                 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_OFFSET, pRecord->Offset);
1782                 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_MASK, pRecord->Mask);
1783                 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_VALUE, pRecord->Value);
1784                 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_REASON, pRecord->Reason);
1785                 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1786             }
1787             break;
1788         }
1789         case RmDclMsg:
1790         {
1791             RM_DATA_COLLECTION_RECORD *pRecord = (RM_DATA_COLLECTION_RECORD*) pDclRecord;
1792             // Add the bytes after RM_DATA_COLLECTION_RECORD
1793             prbEncAddBytes(pPrbEnc, pRecord->fieldDesc, (void *) (pRecord + 1),
1794                 pRecord->common.Header.wRecordSize - sizeof(*pRecord));
1795             break;
1796         }
1797         case RmJournalEngDump:
1798         {
1799             RM_DATA_COLLECTION_RECORD *pRecord = (RM_DATA_COLLECTION_RECORD*) pDclRecord;
1800             // Add the bytes after RM_DATA_COLLECTION_RECORD
1801             prbEncCatMsg(pPrbEnc, (void *)(pRecord + 1),
1802                     pRecord->common.Header.wRecordSize - sizeof(*pRecord));
1803             break;
1804         }
1805         case RmJournalBugCheck:
1806         {
1807             RmJournalBugcheck_RECORD* pRecord = (RmJournalBugcheck_RECORD*)pDclRecord;
1808             NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1809                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_BUGCHECK));
1810             if (nvStatus == NV_OK)
1811             {
1812                 prbEncAddUInt32(pPrbEnc, JOURNAL_BUGCHECK_CODE, pRecord->bugCheckCode);
1813                 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1814             }
1815             break;
1816         }
1817         case RmPrbErrorInfo_V2:
1818         case RmPrbFullDump_V2:
1819         {
1820             RMPRBERRORELEMENT_V2*   pRecord = (RMPRBERRORELEMENT_V2*)((NvU8 *)pDclRecord
1821                                                 - NV_OFFSETOF(RMPRBERRORELEMENT_V2, RmPrbErrorData));
1822             RMCD_ERROR_BLOCK*       pErrorBlock;
1823 
1824             for (pErrorBlock = pRecord->ErrorHeader.pErrorBlock;
1825                 (pErrorBlock != NULL); pErrorBlock = pErrorBlock->pNext)
1826             {
1827                     prbEncCatMsg(pPrbEnc, (void *)pErrorBlock->pBlock,
1828                                     pErrorBlock->blockSize);
1829             }
1830             break;
1831         }
1832         case RmNocatReport:
1833         {
1834             // currently not added to the OCA dump
1835             break;
1836         }
1837 
1838         default:
1839             // These are the only ones we know about
1840             NV_PRINTF(LEVEL_ERROR,
1841                         "unknown Dcl Record entry type: %d\n",
1842                         pDclRecord->Header.cRecordType);
1843             break;
1844     }
1845 
1846     NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1847     return 0;
1848 }
1849 
1850 static NvU32
1851 _rcdbInsertJournalRecordToList (RmRCCommonJournal_RECORD *pList, RmRCCommonJournal_RECORD *pRecord)
1852 {
1853     RmRCCommonJournal_RECORD *pCurrentRecord = pList;
1854     RmRCCommonJournal_RECORD *pNextRecord;
1855 
1856     if ((NULL != pList) && (NULL != pRecord))
1857     {
1858         for (pNextRecord = (RmRCCommonJournal_RECORD *)pList->pNext; pNextRecord != pList; pNextRecord = (RmRCCommonJournal_RECORD *)pNextRecord->pNext)
1859         {
1860             if (pRecord->timeStamp  < pNextRecord->timeStamp)
1861             {
1862                 break;
1863             }
1864             pCurrentRecord = pNextRecord;
1865         }
1866         pRecord->pNext = pCurrentRecord->pNext;
1867         pCurrentRecord->pNext = (NvU8 *)pRecord;
1868     }
1869     return 0;
1870 }
1871 
1872 // Todo: format the records into a protobuf DCL record at the source
1873 static NvU32
1874 rcdbInsertRingBufferToList(
1875     Journal                    *pRcDB,
1876     RmRCCommonJournal_RECORD   *pList,
1877     RING_BUFFER_LOG            *pRingBuffer
1878 )
1879 {
1880     RmRCCommonJournal_RECORD *pCommon;
1881     NvU32 recordSize;
1882     NvU32 i;
1883 
1884     recordSize = rcdbGetOcaRecordSizeWithHeader(pRcDB, pRingBuffer->entryType);
1885 
1886     //
1887     // Order does not matter here because the record will be inserted into the
1888     // list based on the time of the record, not its postion in the buffer.
1889     //
1890     for (i = 0; i < pRingBuffer->numEntries; i++)
1891     {
1892         pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (recordSize * i));
1893 
1894         _rcdbInsertJournalRecordToList (pList, pCommon);
1895     }
1896 
1897     return 0; // return value should be discarded
1898 }
1899 
1900 static NvU32
1901 rcdbInsertRingBufferCollectionToList(
1902     Journal                    *pRcDB,
1903     RmRCCommonJournal_RECORD   *pList)
1904 {
1905     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
1906     RING_BUFFER_LOG *pCurrentBuffer;
1907     NvU32 i;
1908 
1909 
1910     pCurrentBuffer = pRingBufferColl->pFirstEntry;
1911     for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
1912     {
1913         NvU32 recSize = pCurrentBuffer->bufferSize;
1914 
1915         NV_ASSERT(pCurrentBuffer->maxEntries *
1916                   rcdbGetOcaRecordSizeWithHeader(pRcDB, pCurrentBuffer->entryType) ==
1917                   pCurrentBuffer->bufferSize);
1918 
1919         if (recSize > 0)
1920         {
1921             rcdbInsertRingBufferToList (pRcDB, pList, pCurrentBuffer);
1922         }
1923         pCurrentBuffer = pCurrentBuffer->pNextRingBuffer;
1924     }
1925 
1926     // Assert that we traversed through the entire list.
1927     NV_ASSERT(pCurrentBuffer == NULL);
1928 
1929     // return value should be ignored
1930     return 0;
1931 }
1932 
1933 NvU32
1934 rcdbDumpJournal_IMPL
1935 (
1936     OBJRCDB *pRcDB,
1937     OBJGPU *pGpu,
1938     PRB_ENCODER *pPrbEnc,
1939     NVD_STATE *pNvDumpState,
1940     const PRB_FIELD_DESC *pFieldDesc
1941 )
1942 {
1943     OS_DRIVER_BLOCK DriverBlock;
1944     EVENT_JOURNAL *pJournal = &pRcDB->Journal;
1945     NvU8 *pJournalBuff      = pJournal->pBuffer;
1946     RmRCCommonJournal_RECORD *pRecord;
1947     NvU32 recSize;
1948     NV_STATUS nvStatus = NV_OK;
1949     RmRCCommonJournal_RECORD List;
1950 
1951     // It is OK to dump the journal entries without the RM lock.
1952     // No need to check pRcDB->nvDumpState.bNoRMLock;
1953 
1954     recSize = pJournal->BufferSize - pJournal->BufferRemaining;
1955 
1956     if (NULL != pGpu)
1957     {
1958         //
1959         // Add RVA Header, even when there are no journal records.
1960         // This header is required to resolve code addresses using the PDB file.
1961         // We can log code addresses outside of the journal entries.
1962         //
1963         NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedStart(pPrbEnc, pFieldDesc));
1964         if (nvStatus == NV_OK)
1965         {
1966             NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1967                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_RVAHEADER));
1968             if (nvStatus == NV_OK)
1969             {
1970                 portMemSet(&DriverBlock, 0x00, sizeof(DriverBlock));
1971                 osGetDriverBlock(pGpu->pOsGpuInfo, &DriverBlock);
1972                 prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_DRIVER_START, (NvU64)DriverBlock.driverStart);
1973                 prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_OFFSET, DriverBlock.offset);
1974                 prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_POINTER_SIZE, sizeof(pJournal));
1975                 prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_UNIQUE_ID_HIGH, *((NvU64*) DriverBlock.unique_id));
1976                 prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_UNIQUE_ID_LOW, *((NvU64*) (DriverBlock.unique_id + 8)));
1977                 prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_AGE, DriverBlock.age);
1978                 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1979             }
1980             NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1981         }
1982     }
1983 
1984     // init the list to an empty state
1985     portMemSet(&List, 0x00, sizeof(List));
1986     List.pNext = (NvU8 *)&List;
1987 
1988     //
1989     // Don't dump the ring buffers if something is adding to them.
1990     // If we can dump the ring buffers, hold the lock for them until the
1991     // dump is complete to insure that a record is not changed mid-dump.
1992     //
1993     if (portAtomicIncrementS32(&concurrentRingBufferAccess) != 1)
1994     {
1995         //
1996         // If IRQL is low, spin until it gets available
1997         //
1998         if (!osIsRaisedIRQL() && (NULL != pGpu))
1999         {
2000             RMTIMEOUT         timeout;
2001             NV_STATUS         status = NV_OK;
2002             gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
2003             do {
2004                 portAtomicDecrementS32(&concurrentRingBufferAccess);
2005 
2006                 if (NV_ERR_TIMEOUT == status)
2007                 {
2008                     NV_PRINTF(LEVEL_ERROR,
2009                               "timed out waiting for Rm journal ring buffer to be available\n");
2010                     DBG_BREAKPOINT();
2011                     return 0;
2012                 }
2013                 status = gpuCheckTimeout(pGpu, &timeout);
2014                 osSpinLoop();
2015             } while (portAtomicIncrementS32(&concurrentRingBufferAccess) != 1);
2016         }
2017         else
2018         {
2019             NV_ASSERT_FAILED("Ring Buffer unavailable for dump at high irql.");
2020         }
2021     }
2022 
2023     rcdbInsertRingBufferCollectionToList (pRcDB, &List);
2024 
2025     _rcdbInsertErrorHistoryToList(&List, pNvDumpState);
2026 
2027     // Skip if size is smaller than a header
2028     while (recSize > sizeof(RmRCCommonJournal_RECORD))
2029     {
2030         pRecord = (RmRCCommonJournal_RECORD *)pJournalBuff;
2031 
2032         if (pRecord->Header.cRecordGroup != RmGroup)
2033         {
2034             // We only log RM related data
2035             NV_ASSERT(pRecord->Header.cRecordGroup == RmGroup);
2036             break;
2037         }
2038 
2039         // Just a safety net...
2040         if (pRecord->Header.wRecordSize > recSize)
2041         {
2042             break;
2043         }
2044         _rcdbInsertJournalRecordToList (&List, pRecord);
2045 
2046         recSize -= pRecord->Header.wRecordSize;
2047         pJournalBuff += pRecord->Header.wRecordSize;
2048     }
2049 
2050 
2051     // dump out the records that have been added to the list.
2052     for (pRecord = (RmRCCommonJournal_RECORD *)List.pNext; pRecord != &List; pRecord = (RmRCCommonJournal_RECORD *)pRecord->pNext)
2053     {
2054         _rcdbDumpDclMsgRecord(pPrbEnc, pNvDumpState, pFieldDesc, pRecord);
2055     }
2056     portAtomicDecrementS32(&concurrentRingBufferAccess);
2057 
2058     // return value should be ignored
2059     return 0;
2060 }
2061 
2062 NvU32
2063 rcdbDumpErrorCounters_IMPL(Journal *pRcDB, OBJGPU *pGpu, PRB_ENCODER *pPrbEnc)
2064 {
2065     NvU32                   i;
2066     NvU32                   rcErrTyp = RC_ERROR_COUNTER_TYPE_INVALID;
2067     NV_STATUS               nvStatus = NV_OK;
2068     NvU8                    startingDepth = prbEncNestingLevel(pPrbEnc);
2069 
2070     // Opens NVDEBUG_NVDUMP_DCL_MSG
2071     NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
2072         prbEncNestedStart(pPrbEnc, NVDEBUG_NVDUMP_DCL_MSG),
2073         cleanupAndExit);
2074 
2075     for (i = 0; i <= RC_ERROR_COUNTER_OTHER_INDEX; i++)
2076     {
2077         // For Counters
2078         rcErrTyp = pRcDB->rcErrorCounterArray[i].rcErrorType;
2079         if (rcErrTyp != RC_ERROR_COUNTER_TYPE_INVALID)
2080         {
2081             NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
2082                 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_RCCOUNTER),
2083                 cleanupAndExit);
2084 
2085             // Write Power Event
2086             prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_RCERRORTYPE, rcErrTyp);
2087 
2088             // Write Power State
2089             prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_COUNT, pRcDB->rcErrorCounterArray[i].rcErrorCount);
2090 
2091             // Dump the channel ID and the last time when this error occurred on this channel ID
2092             prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_RCLASTCHID, pRcDB->rcErrorCounterArray[i].rcLastCHID);
2093             prbEncAddUInt64(pPrbEnc, RC_RCCOUNTER_RCLASTTIME, pRcDB->rcErrorCounterArray[i].rcLastTime);
2094 
2095             NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
2096                 prbEncNestedEnd(pPrbEnc),
2097                 cleanupAndExit);
2098         }
2099     } // For Counters
2100 
2101     // Close NVDEBUG_NVDUMP_DCL_MSG handled by prbEncUnwindNesting.
2102 
2103 cleanupAndExit:
2104     // Unwind the protobuff to inital depth
2105     NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(nvStatus, LEVEL_ERROR,
2106         prbEncUnwindNesting(pPrbEnc, startingDepth));
2107 
2108     return 0;
2109 }
2110 
2111 static void
2112 _rcdbAddRmGpuDumpCallback
2113 (
2114     void *pData
2115 )
2116 {
2117     OBJSYS *pSys = SYS_GET_INSTANCE();
2118     NV_STATUS status;
2119 
2120     NvU32 gpuInstance = *((NvU32 *)pData);
2121     status = osAcquireRmSema(pSys->pSema);
2122     if (status == NV_OK)
2123     {
2124         // LOCK: acquire API lock
2125         status = rmapiLockAcquire(API_LOCK_FLAGS_NONE, RM_LOCK_MODULES_DIAG);
2126         if (status == NV_OK)
2127         {
2128             // LOCK: acquire GPUs lock
2129             status = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE,
2130                                        RM_LOCK_MODULES_DIAG);
2131             if (status == NV_OK)
2132             {
2133                 Journal *pRcDB = SYS_GET_RCDB(pSys);
2134                 OBJGPU  *pGpu = gpumgrGetGpu(gpuInstance);
2135 
2136                 //
2137                 // Mark the Journal object as in the deferred dump path so we won't
2138                 // re-attempt again.
2139                 //
2140                 pRcDB->setProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH, NV_TRUE);
2141 
2142                 status = rcdbAddRmGpuDump(pGpu);
2143                 NV_ASSERT(status == NV_OK);
2144 
2145                 pRcDB->setProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH, NV_FALSE);
2146 
2147                 // UNLOCK: release GPUs lock
2148                 rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
2149             }
2150             else
2151             {
2152                 NV_PRINTF(LEVEL_ERROR, "failed to acquire the GPU locks!\n");
2153             }
2154             // UNLOCK: release API lock
2155             rmapiLockRelease();
2156         }
2157         else
2158         {
2159             NV_PRINTF(LEVEL_ERROR, "failed to acquire the API lock!\n");
2160         }
2161         osReleaseRmSema(pSys->pSema, NULL);
2162     }
2163     else
2164     {
2165         NV_PRINTF(LEVEL_ERROR, "failed to acquire the OS semaphore!\n");
2166     }
2167 }
2168 
2169 static NV_STATUS
2170 nvdDebuggerBufferCallback(void *pEncoder, NvBool bBufferFull)
2171 {
2172     if (bBufferFull)
2173     {
2174         nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_BUFFER_FULL;
2175     }
2176     else
2177     {
2178         nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_END_OF_MSG;
2179     }
2180 
2181     return NV_OK;
2182 }
2183 
2184 /*!
2185  * @brief NvDebug kernel debugger dump control
2186  *
2187  * Allows external kernel debuggers to control the RM's dump interface
2188  * without assuming anything about the current system state.
2189  *
2190  * WARNING! This function should never be called directly!
2191  *
2192  * If correctly setup, a kernel debugger will place a processor
2193  * hardware watchpoint on the nvDumpConfig.handshake variable.
2194  * Each time this is written to, the debugger will break and get a chance
2195  * to examine the rest of the nvDumpConfig state.
2196  *
2197  * @return This function should never return! External debugger should abort it!
2198  */
2199 static void
2200 nvdDebuggerControlFunc(void)
2201 {
2202     OBJSYS        *pSys = SYS_GET_INSTANCE();
2203     Journal       *pRcDB = SYS_GET_RCDB(pSys);
2204     OBJGPU        *pGpu = NULL;
2205     NvDebugDump   *pNvd = NULL;
2206     NVDUMP_BUFFER *pBuffer = (NVDUMP_BUFFER *)&nvDumpConfig.buffer; // discard volatile
2207 
2208     // Process actions while debugger provides work to do.
2209     while (nvDumpConfig.dumpStatus != NVDUMP_STATUS_IDLE)
2210     {
2211         nvDumpConfig.rmStatus = NV_OK;
2212 
2213         NV_PRINTF(LEVEL_INFO,
2214                   "Dump triggered: gpuSelect=%u, component=%u,  dumpStatus=%u\n",
2215                   nvDumpConfig.gpuSelect, nvDumpConfig.component,
2216                   nvDumpConfig.dumpStatus);
2217 
2218         if (NVDUMP_IS_GPU_COMPONENT(nvDumpConfig.component))
2219         {
2220             pGpu = gpumgrGetGpu(nvDumpConfig.gpuSelect);
2221             pNvd = GPU_GET_NVD(pGpu);
2222 
2223             switch (nvDumpConfig.dumpStatus)
2224             {
2225                 case NVDUMP_STATUS_COUNT_REQUESTED:
2226                     nvDumpConfig.rmStatus = nvdDumpComponent(
2227                         pGpu, pNvd, nvDumpConfig.component, pBuffer,
2228                         NVDUMP_BUFFER_COUNT, NULL);
2229                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_COUNT_COMPLETE;
2230                     break;
2231                 case NVDUMP_STATUS_DUMP_REQUESTED:
2232                     nvDumpConfig.rmStatus = nvdDumpComponent(
2233                         pGpu, pNvd, nvDumpConfig.component, pBuffer,
2234                         NVDUMP_BUFFER_PROVIDED, &nvdDebuggerBufferCallback);
2235                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_COMPLETE;
2236                     break;
2237                 default:
2238                     NV_PRINTF(LEVEL_ERROR, "Invalid dumpStatus %u\n",
2239                               nvDumpConfig.dumpStatus);
2240                     nvDumpConfig.rmStatus = NV_ERR_INVALID_STATE;
2241                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR;
2242                     break;
2243             }
2244         }
2245         else if (NVDUMP_IS_SYS_COMPONENT(nvDumpConfig.component))
2246         {
2247             switch (nvDumpConfig.dumpStatus)
2248             {
2249                 case NVDUMP_STATUS_COUNT_REQUESTED:
2250                     nvDumpConfig.rmStatus = rcdbDumpComponent(pRcDB,
2251                         nvDumpConfig.component, pBuffer,
2252                         NVDUMP_BUFFER_COUNT, NULL);
2253                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_COUNT_COMPLETE;
2254                     break;
2255                 case NVDUMP_STATUS_DUMP_REQUESTED:
2256                     nvDumpConfig.rmStatus = rcdbDumpComponent(pRcDB,
2257                         nvDumpConfig.component, pBuffer,
2258                         NVDUMP_BUFFER_PROVIDED, &nvdDebuggerBufferCallback);
2259                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_COMPLETE;
2260                     break;
2261                 default:
2262                     NV_PRINTF(LEVEL_ERROR, "Invalid dumpStatus %u\n",
2263                               nvDumpConfig.dumpStatus);
2264                     nvDumpConfig.rmStatus = NV_ERR_INVALID_STATE;
2265                     nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR;
2266 
2267                     break;
2268             }
2269         }
2270         else
2271         {
2272              NV_PRINTF(LEVEL_ERROR, "Invalid component %u\n",
2273                        nvDumpConfig.component);
2274              nvDumpConfig.rmStatus = NV_ERR_INVALID_PARAM_STRUCT;
2275              nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR;
2276         }
2277     }
2278 
2279     // Ensure we really don't exit this function without debugger.
2280     while (1)
2281     {
2282         NV_PRINTF(LEVEL_ERROR, "Should never reach this point!\n");
2283         DBG_BREAKPOINT();
2284     }
2285 }
2286 
2287 /*!
2288  * @brief   Release Build NV_ASSERT function
2289  *
2290  * @details Called by NV_ASSERT when the assertion fails.
2291  *          By putting this logic in its own function, we save on binary size.
2292  */
2293 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX) || RMCFG_FEATURE_PLATFORM_GSP) && !defined(NV_MODS)
2294 static void _rcdbRmAssert(NvU32 level, NvU32 lineNum, NvU64 ip)
2295 {
2296     RmRC2SwRmAssert3_RECORD* pRec = NULL;
2297     if (rcdbAddAssertJournalRecWithLine(NULL, lineNum, (void **)&pRec, RmGroup,
2298         RmRC2SwRmAssert_V3, sizeof(RmRC2SwRmAssert3_RECORD),
2299         level, ip) == NV_OK)
2300     {
2301         pRec->level = level;
2302     }
2303 
2304 #if !defined(DEBUG) && !defined(QA_BUILD)
2305     {
2306         OBJSYS *pSys = SYS_GET_INSTANCE();
2307 
2308         // Add assert to NvLog.  But skip when nvLog asserts to avoid stack overflow.
2309         if (portAtomicIncrementS32(&nvLogRecursion) == 1)
2310         {
2311             // check for GPU lost.
2312             rcdProbeAllGpusPresent(ip);
2313         }
2314         portAtomicDecrementS32(&nvLogRecursion);
2315 
2316         if ((pSys != NULL) && ((NV_DEBUG_BREAK_ATTRIBUTES_ASSERT) &
2317             DRF_VAL(_DEBUG, _BREAK, _ATTRIBUTES, pSys->debugFlags)))
2318         {
2319             REL_DBG_BREAKPOINT_MSG("NVRM-RC: Nvidia Release NV_ASSERT Break\n");
2320         }
2321     }
2322 
2323     // If enabled bugcheck on assert
2324     osDbgBugCheckOnAssert();
2325 
2326 #endif
2327 }
2328 
2329 //
2330 // Some param-less wrappers for rcdbXxxEx() functions.
2331 // If the params are not needed, calling these functions saves on binary size
2332 //
2333 void rcdbRmAssert(NvU32 LineNum, NvU64 ip) {  _rcdbRmAssert(0, LineNum, ip); }
2334 void rcdbRmAssertStatus(NvU32 status, NvU32 LineNum, NvU64 ip) { _rcdbRmAssert(status, LineNum, ip); }
2335 
2336 #endif // (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX) || RMCFG_FEATURE_PLATFORM_GSP) && !defined(NV_MODS)
2337 
2338 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
2339 
2340 /*!
2341  * @brief   Release Build DBGBREAKPOINT() function
2342  *
2343  * @details Called by DBGBREAKPOINT when the assertion fails.
2344  *          By putting this logic in its own function, we save on binary size.
2345  */
2346 static void _rcdbDbgBreakEx(void *pGpu, NvU32 lineNum, NvU32 level, NvU64 ip)
2347 {
2348     RmRC2SwRmAssert3_RECORD* pRec = NULL;
2349     if (rcdbAddAssertJournalRecWithLine(pGpu, lineNum, (void**)&pRec, RmGroup,
2350          RmRC2SwDbgBreakpoint_V3, sizeof(RmRC2SwRmAssert3_RECORD), level, ip) == NV_OK)
2351     {
2352         pRec->level = level;
2353     }
2354 
2355 #if !defined(DEBUG) && !defined(QA_BUILD)
2356     {
2357         OBJSYS *pSys = SYS_GET_INSTANCE();
2358 
2359         // Add assert to NvLog.  But skip when nvLog asserts to avoid stack overflow.
2360         if (portAtomicIncrementS32(&nvLogRecursion) == 1)
2361         {
2362             NV_PRINTF(LEVEL_NOTICE, "Breakpoint at 0x%llx.\n", ip);
2363         }
2364         portAtomicDecrementS32(&nvLogRecursion);
2365 
2366         if ((pSys != NULL) && ((NV_DEBUG_BREAK_ATTRIBUTES_DBG_BREAK) &
2367             DRF_VAL(_DEBUG, _BREAK, _ATTRIBUTES, pSys->debugFlags)))
2368         {
2369             REL_DBG_BREAKPOINT_MSG("NVRM-RC: Nvidia Release Debug Break\n");
2370         }
2371     }
2372 #endif
2373 
2374     // If enabled bugcheck on assert
2375     osDbgBugCheckOnAssert();
2376 }
2377 
2378 void rcdbDbgBreak(NvU64 ip)                             { _rcdbDbgBreakEx(NULL, NV_RM_ASSERT_UNKNOWN_LINE_NUM, 0,      ip); }
2379 void rcdbDbgBreakGpu(void *pGpu, NvU64 ip)              { _rcdbDbgBreakEx(pGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, 0,      ip); }
2380 void rcdbDbgBreakStatus(NvU32 status, NvU64 ip)         { _rcdbDbgBreakEx(NULL, NV_RM_ASSERT_UNKNOWN_LINE_NUM, status, ip); }
2381 void rcdbDbgBreakEx(void *pGpu, NvU32 status, NvU64 ip) { _rcdbDbgBreakEx(pGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, status, ip); }
2382 
2383 #endif
2384 
2385 NV_STATUS
2386 rcdbAddRmEngDump
2387 (
2388     OBJGPU  *pGpu,
2389     NvU32 component
2390 )
2391 {
2392     OBJSYS          *pSys     = SYS_GET_INSTANCE();
2393     Journal         *pRcDB    = SYS_GET_RCDB(pSys);
2394     NvDebugDump     *pNvd     = GPU_GET_NVD(pGpu);
2395     NVDUMP_BUFFER    nvDumpBuffer = {0};
2396     RM_DATA_COLLECTION_RECORD *pRec;
2397     NV_STATUS        rmStatus;
2398     NvU16            totalSize;
2399 
2400     nvDumpBuffer.size = NVDUMP_MAX_DUMP_SIZE;
2401 
2402     rmStatus = nvdDumpComponent(pGpu, pNvd, component, &nvDumpBuffer,
2403                NVDUMP_BUFFER_ALLOCATE, NULL);
2404     if (rmStatus != NV_OK)
2405     {
2406         goto rcdbAddRmEngDump_error_handle;
2407     }
2408 
2409     totalSize = (NvU16)(nvDumpBuffer.curNumBytes + sizeof(*pRec));
2410     //align to 8 bytes to keep the readability of RM journal
2411     totalSize = (totalSize + 0x7) & ~0x7;
2412     // check for overflow
2413     if (((NvU32)totalSize) < nvDumpBuffer.curNumBytes + sizeof(*pRec))
2414     {
2415         goto rcdbAddRmEngDump_error_handle;
2416     }
2417 
2418     rmStatus = rcdbAllocNextJournalRec(pRcDB, (NVCD_RECORD **)&pRec, RmGroup,
2419                                        RmJournalEngDump, totalSize);
2420     if (rmStatus != NV_OK)
2421     {
2422         goto rcdbAddRmEngDump_error_handle;
2423     }
2424     rcdbSetCommonJournalRecord(pGpu, &pRec->common);
2425 
2426     // copy the dump buffer right after the RM_DATA_COLLECTION_RECORD struct
2427     portMemCopy((void *)(pRec + 1), nvDumpBuffer.curNumBytes, NvP64_VALUE(nvDumpBuffer.address), nvDumpBuffer.curNumBytes);
2428 
2429     pRec->fieldDesc = NVDEBUG_NVDUMP_GPU_INFO;
2430 
2431 rcdbAddRmEngDump_error_handle:
2432     if (nvDumpBuffer.address != NvP64_NULL)
2433     {
2434         portMemFree(NvP64_VALUE(nvDumpBuffer.address));
2435     }
2436 
2437     return rmStatus;
2438 }
2439 
2440 
2441 // Finds the ring buffer for a corresponding type. Returns error if not allocated.
2442 static void
2443 rcdbFindRingBufferForType
2444 (
2445     Journal *pRcDB,
2446     RMCD_RECORD_TYPE recType,
2447     RING_BUFFER_LOG **ppRingBuffer
2448 )
2449 {
2450     NvU32 i;
2451     RING_BUFFER_LOG *pCurrentRingBuffer = NULL;
2452     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
2453 
2454     NV_ASSERT(ppRingBuffer != NULL);
2455     *ppRingBuffer = NULL;
2456 
2457     //
2458     // Loop through our ring buffer collection, and find the
2459     // ring buffer corresponding to our type.
2460     //
2461     pCurrentRingBuffer = pRingBufferColl->pFirstEntry;
2462     for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
2463     {
2464         NV_ASSERT(pCurrentRingBuffer != NULL);
2465         if (pCurrentRingBuffer->entryType == recType)
2466         {
2467             *ppRingBuffer = pCurrentRingBuffer;
2468             return;
2469         }
2470         pCurrentRingBuffer = pCurrentRingBuffer->pNextRingBuffer;
2471     }
2472 
2473     NV_PRINTF(LEVEL_INFO, "Ring Buffer not found for type %d\n", recType);
2474     return;
2475 }
2476 
2477 //
2478 // Creates a ring buffer capable of holding "maxEntries" number of entries, and
2479 // adds it to the ring buffer collection.
2480 // Returns a pointer to the created ring buffer so that individual modules can
2481 // examine the data on-demand easily.
2482 //
2483 //PRINT_BUFFER_LOG
2484 NvU8 *
2485 rcdbCreateRingBuffer_IMPL
2486 (
2487     Journal *pRcDB,
2488     RMCD_RECORD_TYPE type,
2489     NvU32   maxEntries
2490 )
2491 {
2492     NV_STATUS status;
2493     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
2494     RING_BUFFER_LOG *pRingBuffer;
2495     NvU8*           pBuffer = NULL;
2496     NvU32 bufferSize, entrySize;
2497 
2498     rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer);
2499 
2500     entrySize = rcdbGetOcaRecordSizeWithHeader(pRcDB, type);
2501     if (entrySize == 0)
2502     {
2503         NV_ASSERT(entrySize != 0);
2504         return NULL;
2505     }
2506 
2507     // We need to store maxEntries number of entries. Check for overflow too
2508     if (portSafeMulU32(maxEntries, entrySize, &bufferSize) == NV_FALSE)
2509     {
2510         return NULL;
2511     }
2512 
2513     if (pRingBuffer != NULL)
2514     {
2515         NvU32 totalSize;
2516 
2517         if (portSafeAddU32(bufferSize, pRingBuffer->bufferSize, &totalSize) == NV_FALSE)
2518         {
2519             return NULL;
2520         }
2521 
2522         bufferSize = totalSize;
2523         pRingBuffer->refCount++;
2524 
2525         //
2526         // XXX The collect-all design of the ring buffers allows for
2527         // interleaved entries for different GPUs. This makes it
2528         // hard to dynamically shrink any given ring buffer as GPUs are
2529         // torn down, and requires that an upper bound be placed on
2530         // the buffer's size.
2531         //
2532         // The upper bound, as chosen, is somewhat arbitrary, but at
2533         // the time of this writing, consistent with the use of
2534         // this interface (i.e. the number of entries for each type is
2535         // the same for each GPU).
2536         //
2537         if (bufferSize > pRingBuffer->maxBufferSize)
2538              return NULL;
2539     }
2540     else
2541     {
2542         pRingBuffer = portMemAllocNonPaged(sizeof(RING_BUFFER_LOG));
2543         if (pRingBuffer == NULL)
2544         {
2545             status = NV_ERR_NO_MEMORY;
2546             NV_ASSERT(status == NV_OK);
2547             return NULL;
2548         }
2549 
2550         portMemSet(pRingBuffer, 0x00, sizeof(*pRingBuffer));
2551         pRingBuffer->refCount = 1;
2552     }
2553 
2554     pBuffer = portMemAllocNonPaged(bufferSize);
2555     if (pBuffer == NULL)
2556     {
2557         status = NV_ERR_NO_MEMORY;
2558         NV_ASSERT(status == NV_OK);
2559         pRingBuffer->refCount--;
2560         if (pRingBuffer->pBuffer == NULL)
2561         {
2562             portMemFree(pRingBuffer);
2563         }
2564         return NULL;
2565     }
2566 
2567     // Now, initialize the entries the RING_BUFFER structure.
2568     pRingBuffer->maxEntries += maxEntries;
2569 
2570     // Add the ring buffer to the beginning of the ring buffer collection.
2571     if (pRingBuffer->pBuffer == NULL)
2572     {
2573         if (portSafeMulU32(bufferSize, NV_MAX_DEVICES, &pRingBuffer->maxBufferSize) == NV_FALSE)
2574         {
2575             pRingBuffer->refCount--;
2576             if (pRingBuffer->pBuffer == NULL)
2577             {
2578                 portMemFree(pRingBuffer);
2579             }
2580 
2581             portMemFree(pBuffer);
2582             return NULL;
2583         }
2584 
2585         pRingBuffer->maxBufferSize = (bufferSize * NV_MAX_DEVICES);
2586         pRingBuffer->entryType = type;
2587         pRingBuffer->pNextRingBuffer = pRingBufferColl->pFirstEntry;
2588         pRingBufferColl->pFirstEntry = pRingBuffer;
2589         pRingBufferColl->NumRingBuffers++;
2590     }
2591     else
2592     {
2593         NvU32 copySize;
2594 
2595         if (portSafeSubU32(bufferSize, pRingBuffer->bufferSize, &copySize) == NV_FALSE)
2596         {
2597             pRingBuffer->refCount--;
2598             if (pRingBuffer->pBuffer == NULL)
2599             {
2600                 portMemFree(pRingBuffer);
2601             }
2602 
2603             portMemFree(pBuffer);
2604             return NULL;
2605         }
2606 
2607         portMemCopy(pBuffer, copySize, pRingBuffer->pBuffer, copySize);
2608         portMemFree(pRingBuffer->pBuffer);
2609     }
2610 
2611     pRingBuffer->bufferSize = bufferSize;
2612     pRingBuffer->pBuffer = pBuffer;
2613     return (NvU8 *)pRingBuffer;
2614 }
2615 
2616 void
2617 rcdbDestroyRingBuffer_IMPL
2618 (
2619     Journal *pRcDB,
2620     RMCD_RECORD_TYPE type
2621 )
2622 {
2623     RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
2624     RING_BUFFER_LOG *pRingBuffer, *pCurrentRingBuffer;
2625     NvU32 i;
2626 
2627     rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer);
2628     if (pRingBuffer == NULL)
2629         return;
2630 
2631     if (--pRingBuffer->refCount > 0)
2632         return;
2633 
2634     pCurrentRingBuffer = pRingBufferColl->pFirstEntry;
2635     if (pCurrentRingBuffer == pRingBuffer)
2636     {
2637         pRingBufferColl->pFirstEntry = pCurrentRingBuffer->pNextRingBuffer;
2638     }
2639     else
2640     {
2641         for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
2642         {
2643             if (pCurrentRingBuffer->pNextRingBuffer == pRingBuffer)
2644             {
2645                 pCurrentRingBuffer->pNextRingBuffer =
2646                     pRingBuffer->pNextRingBuffer;
2647                 break;
2648             }
2649             pCurrentRingBuffer = pCurrentRingBuffer->pNextRingBuffer;
2650         }
2651     }
2652 
2653     portMemFree(pRingBuffer->pBuffer);
2654     portMemFree(pRingBuffer);
2655 
2656     pRingBufferColl->NumRingBuffers--;
2657 }
2658 
2659 /*
2660 ** _rcdbAllocRecFromRingBuffer allocates a buffer entry from the
2661 **  specified ring buffer.
2662 **
2663 **  parameters:
2664 **      pGpu            a pointer to the GPU object associated with the entry.
2665 **      pRcdb           a pointer toe the Journal that contains the ring buffers
2666 **      type            the record type to locate a buffer for.
2667 **      recordSize      the size of the expected record
2668 **
2669 **  notes:
2670 **      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
2671 **      failure to do so can result in concurrency issues.
2672 */
2673 RmRCCommonJournal_RECORD *
2674 _rcdbAllocRecFromRingBuffer
2675 (
2676     OBJGPU             *pGpu,
2677     Journal            *pRcDB,
2678     RMCD_RECORD_TYPE    type
2679 )
2680 {
2681     RING_BUFFER_LOG    *pRingBuffer = NULL;
2682     NvU32               newItemIndex;
2683     RmRCCommonJournal_RECORD
2684                        *pCommon = NULL;
2685 
2686     // Find the ring buffer for this entry in the collection.
2687     rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer);
2688 
2689     if (pRingBuffer == NULL)
2690     {
2691         NV_ASSERT(0);
2692         //
2693         // There is no ring buffer allocated for this type.
2694         // Nothing we can do about it.
2695         //
2696         return NULL;
2697     }
2698 
2699     newItemIndex = (pRingBuffer->numEntries + pRingBuffer->headIndex) % pRingBuffer->maxEntries;
2700 
2701     // prepend the rmJournalCommon record to record.
2702     pCommon = (RmRCCommonJournal_RECORD*)(pRingBuffer->pBuffer + (rcdbGetOcaRecordSizeWithHeader(pRcDB, type) * newItemIndex));
2703     pCommon->Header.cRecordGroup = RmGroup;
2704     pCommon->Header.cRecordType = type;
2705     pCommon->Header.wRecordSize = (NvU16)rcdbGetOcaRecordSizeWithHeader(pRcDB, type);
2706     rcdbSetCommonJournalRecord(pGpu, pCommon);
2707 
2708     // Increment the number of entries or advance the head index.
2709     if (pRingBuffer->numEntries < pRingBuffer->maxEntries)
2710     {
2711         ++pRingBuffer->numEntries;
2712     }
2713     else
2714     {
2715         ++(pRingBuffer->headIndex);
2716         if (pRingBuffer->headIndex >= pRingBuffer->maxEntries)
2717         {
2718             pRingBuffer->headIndex = 0;
2719         }
2720     }
2721     return pCommon;
2722 }
2723 
2724 /*
2725 ** rcdbAddRecToRingBuffer_IMPL allocates a buffer entry from the
2726 **  specified ring buffer & copies the supplied data buffer into it.
2727 **
2728 **  parameters:
2729 **      pGpu            a pointer to the GPU object associated with the entry.
2730 **      pRcdb           a pointer toe the Journal that contains the ring buffers
2731 **      type            the record type to locate a buffer for.
2732 **      recordSize      the size of the expected record
2733 **      pRecord         a pointer to the data that will populate the new ring buffer entry.
2734 **
2735 **  notes:
2736 */
2737 RmRCCommonJournal_RECORD *
2738 rcdbAddRecToRingBuffer_IMPL
2739 (
2740     OBJGPU             *pGpu,
2741     Journal            *pRcDB,
2742     RMCD_RECORD_TYPE    type,
2743     NvU32               recordSize,
2744     NvU8               *pRecord
2745 )
2746 {
2747     RmRCCommonJournal_RECORD *pCommon = NULL;
2748 
2749     NV_ASSERT(recordSize == _rcdbGetOcaRecordSize(pRcDB, type));
2750 
2751     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
2752     {
2753         pCommon = _rcdbAllocRecFromRingBuffer(pGpu, pRcDB, type);
2754         if (pCommon != NULL)
2755         {
2756             // copy the record to follow the common header.
2757             portMemCopy(&(pCommon[1]), recordSize, pRecord, recordSize);
2758         }
2759     }
2760     portAtomicDecrementS32(&concurrentRingBufferAccess);
2761 
2762     return pCommon;
2763 }
2764 
2765 static NvU32 _rcdbGetOcaRecordSize(Journal *pRcDB, RMCD_RECORD_TYPE type)
2766 {
2767     switch(type)
2768     {
2769         case RmRcDiagReport:
2770             return sizeof(RmRcDiag_RECORD);
2771             break;
2772         case RmNocatReport:
2773             return sizeof(RM_NOCAT_JOURNAL_ENTRY);
2774             break;
2775         default:
2776             return 0;
2777     }
2778 }
2779 
2780 NvU32 rcdbGetOcaRecordSizeWithHeader_IMPL(Journal *pRcDB, RMCD_RECORD_TYPE type)
2781 {
2782     NvU32 recSz;
2783 
2784     recSz = _rcdbGetOcaRecordSize(pRcDB, type);
2785     if (0 < recSz)
2786     {
2787         recSz += sizeof(RmRCCommonJournal_RECORD);
2788     }
2789 
2790     //
2791     // On architecture like RISC-V, loads/stores need to be aligned to the
2792     // request size (1, 2, 4, 8-byte). Here, OCA record and header are stored
2793     // in a ring buffer, hence total recSz needs to be 8-byte aligned for both
2794     // producer (GSP RM) and consumer (CPU RM) of this data.
2795     //
2796     return NV_ALIGN_UP(recSz, 8);
2797 }
2798 
2799 NV_STATUS
2800 rcdbAddRmGpuDump
2801 (
2802     OBJGPU *pGpu
2803 )
2804 {
2805     NV_STATUS           status = NV_OK;
2806     OBJSYS             *pSys               = SYS_GET_INSTANCE();
2807     Journal            *pRcDB              = SYS_GET_RCDB(pSys);
2808     NvDebugDump        *pNvd               = GPU_GET_NVD(pGpu);
2809     NVD_STATE          *pNvDumpState       = &pRcDB->nvDumpState;
2810     SYS_ERROR_INFO     *pSysErrorInfo      = &pRcDB->ErrorInfo;
2811     RMPRBERRORELEMENT_V2 *pPrbErrorInfo    = NULL;
2812     RMPRBERRORELEMENT_V2 *pErrorList       = NULL;
2813     RMCD_ERROR_BLOCK   *pNewErrorBlock     = NULL;
2814     RMERRORHEADER      *pErrorHeader       = NULL;
2815     PRB_ENCODER         prbEnc;
2816     NvU32               bufferUsed;
2817     NvU8               *pBuf               = NULL;
2818 
2819     //
2820     // The deferred dump codepath will block out other dumps until the DPC can
2821     // be executed. If this is the deferred callback attempting to do the dump,
2822     // carry on.
2823     //
2824     if (pNvDumpState->bDumpInProcess &&
2825         !pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH))
2826     {
2827         return NV_ERR_STATE_IN_USE;
2828     }
2829 
2830     prbEnc.depth = 0;
2831     pNvDumpState->bDumpInProcess    = NV_TRUE;
2832     pNvDumpState->nvDumpType        = NVD_DUMP_TYPE_OCA;
2833     pNvDumpState->bRMLock           = rmapiLockIsOwner();
2834 
2835     rcdbDumpInitGpuAccessibleFlag(pGpu, pRcDB);
2836 
2837     //
2838     // General process:
2839     //  1. Start the protobuf encoder in ALLOCATE mode, and dump the data
2840     //  2. Allocate an error element to stick in the Journal list
2841     //  3. Add the protobuf dump to the error element
2842     //  4. Put the error element at the end of the error list on OBJRCDB
2843     //
2844     status = prbEncStartAlloc(&prbEnc, NVDEBUG_NVDUMP, NVDUMP_MAX_DUMP_SIZE,
2845                               NULL);
2846     if (status != NV_OK)
2847     {
2848         //
2849         // If we couldn't allocate the memory, it may be because we're at a
2850         // raised IRQL. It's not a great idea to be gathering a bunch of state
2851         // from the interrupt context anyway, so queue a work item to come back
2852         // later and try again.
2853         //
2854         NvU32 *pGpuInstance = NULL;
2855 
2856         //
2857         // If that's what we've already done and we're still failing, bail out
2858         // to avoid an infinite fail/queue-work-item loop.
2859         //
2860         if (pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH))
2861         {
2862             NV_PRINTF(LEVEL_ERROR,
2863                       "deferred GPU dump encoder init failed (status = 0x%x)\n",
2864                       status);
2865             goto done;
2866         }
2867 
2868         NV_PRINTF(LEVEL_INFO, "deferring GPU dump for normal context\n");
2869 
2870         //
2871         // This will be freed by the OS work item layer. We pass the GPU
2872         // instance as the data separately because if the GPU has fallen off
2873         // the bus, the OS layer may refuse to execute work items attached to
2874         // it. Instead, use the system work item interface and handle the GPU
2875         // ourselves.
2876         //
2877         pGpuInstance = portMemAllocNonPaged(sizeof(NvU32));
2878         if (pGpuInstance == NULL)
2879         {
2880             status = NV_ERR_NO_MEMORY;
2881             goto done;
2882         }
2883 
2884         *pGpuInstance = gpuGetInstance(pGpu);
2885         status = osQueueSystemWorkItem(_rcdbAddRmGpuDumpCallback,
2886                                        pGpuInstance);
2887         if (status != NV_OK)
2888         {
2889             portMemFree(pGpuInstance);
2890             goto done;
2891         }
2892 
2893         //
2894         // Since we've queued the work item, leave the dump state marked as in
2895         // use to prevent other interrupts and codepaths from attempting to
2896         // initiate the dump and/or queue a new work item.
2897         //
2898         return NV_WARN_MORE_PROCESSING_REQUIRED;
2899     }
2900 
2901     status = nvdDumpAllEngines(pGpu, pNvd, &prbEnc, pNvDumpState);
2902     if (status != NV_OK)
2903     {
2904         //
2905         // If the dump failed somewhere, unwind the encoder and then drop
2906         // through to finish it out so we can get the pointer to the
2907         // allocated buffer to free.
2908         //
2909         while (prbEnc.depth > 1)
2910         {
2911             prbEncNestedEnd(&prbEnc);
2912         }
2913     }
2914 
2915     bufferUsed = prbEncFinish(&prbEnc, (void **)&pBuf);
2916 
2917     if (status != NV_OK)
2918     {
2919         goto done;
2920     }
2921 
2922     // Allocate and initialize the error element
2923     pPrbErrorInfo = portMemAllocNonPaged(sizeof(RMPRBERRORELEMENT_V2));
2924     if (pPrbErrorInfo == NULL)
2925     {
2926         status = NV_ERR_NO_MEMORY;
2927         goto done;
2928     }
2929 
2930     portMemSet(pPrbErrorInfo, 0, sizeof(RMPRBERRORELEMENT_V2));
2931     pPrbErrorInfo->RmPrbErrorData.common.Header.cRecordGroup = RmGroup;
2932     pPrbErrorInfo->RmPrbErrorData.common.Header.cRecordType  = RmPrbFullDump_V2;
2933     pPrbErrorInfo->RmPrbErrorData.common.Header.wRecordSize  = sizeof(RMPRBERRORELEMENT_V2);
2934     rcdbSetCommonJournalRecord(pGpu, &(pPrbErrorInfo->RmPrbErrorData.common));
2935     pErrorHeader = &pPrbErrorInfo->ErrorHeader;
2936     pErrorHeader->pErrorBlock = NULL;
2937 
2938     //
2939     // Allocate and initialize the error "block" associated with this protobuf
2940     // dump
2941     //
2942     pNewErrorBlock = portMemAllocNonPaged(sizeof(RMCD_ERROR_BLOCK));
2943     if (pNewErrorBlock == NULL)
2944     {
2945         status = NV_ERR_NO_MEMORY;
2946         goto done;
2947     }
2948 
2949     portMemSet(pNewErrorBlock, 0, sizeof(RMCD_ERROR_BLOCK));
2950     pNewErrorBlock->pBlock = pBuf;
2951     pNewErrorBlock->blockSize = bufferUsed;
2952     pNewErrorBlock->pNext = NULL;
2953     pErrorHeader->pErrorBlock = pNewErrorBlock;
2954 
2955     // Add the error element to the Journal list
2956     if (pSysErrorInfo->pErrorList != NULL)
2957     {
2958         pErrorList = (RMPRBERRORELEMENT_V2*)pSysErrorInfo->pErrorList;
2959         while (pErrorList->ErrorHeader.pNextError != NULL)
2960         {
2961             pErrorList = (RMPRBERRORELEMENT_V2*)pErrorList->ErrorHeader.pNextError;
2962         }
2963 
2964         pErrorList->ErrorHeader.pNextError = (RMFIFOERRORELEMENT_V3*)pPrbErrorInfo;
2965     }
2966     else
2967     {
2968         pSysErrorInfo->pErrorList = pPrbErrorInfo;
2969     }
2970 
2971     pSysErrorInfo->ErrorCount++;
2972 
2973 done:
2974     if (status != NV_OK)
2975     {
2976         if (pBuf != NULL)
2977         {
2978             portMemFree(pPrbErrorInfo);
2979             portMemFree(pBuf);
2980         }
2981     }
2982 
2983     pNvDumpState->bDumpInProcess = NV_FALSE;
2984     return status;
2985 }
2986 
2987 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
2988 #if !defined(DEBUG) && !defined(QA_BUILD)
2989 /*
2990  */
2991 NvBool
2992 rcdProbeGpuPresent(
2993     OBJGPU  *pGpu,
2994     NvU64    ip
2995 )
2996 {
2997     NvU32       testValue;
2998     NvBool      bFoundLostGpu = NV_FALSE;
2999 
3000     // protect against recursion when probing the GPU.
3001     if (portAtomicIncrementS32(&probeGpuRecursion) == 1)
3002     {
3003         if (NULL != pGpu)
3004         {
3005             // is the GPU we are checking allready reported lost?
3006             if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH) &&
3007                 !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
3008             {
3009                 testValue = GPU_CHECK_REG_RD32(pGpu, NV_PMC_BOOT_0, (~(pGpu->chipId0)));
3010                 if (testValue == GPU_REG_VALUE_INVALID)
3011                 {
3012                     // there shouldn't be a need to make a journal entry,
3013                     // as that should have been done by GPU_CHECK_REG_RD32
3014 
3015                     // Add GPU lost detection to to NvLog.
3016                     // But skip when nvLog asserts to avoid stack overflow.
3017 #if defined(DEBUG) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS))
3018                     if (portAtomicIncrementS32(&nvLogRecursion) == 1)
3019 #endif
3020                     {
3021                         NV_PRINTF(LEVEL_ERROR,
3022                                   "found GPU %d (0x%p) inaccessible After assert\n",
3023                                   pGpu->gpuInstance, pGpu);
3024                     }
3025 #if defined(DEBUG) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS))
3026                     portAtomicDecrementS32(&nvLogRecursion);
3027 #endif
3028                     bFoundLostGpu = NV_TRUE;
3029                 }
3030             }
3031         }
3032     }
3033     portAtomicDecrementS32(&probeGpuRecursion);
3034     return bFoundLostGpu;
3035 }
3036 
3037 NvBool
3038 rcdProbeAllGpusPresent(
3039     NvU64   ip
3040 )
3041 {
3042     NvBool  bFoundLostGpu = NV_FALSE;
3043     OBJGPU *pGpu;
3044     NvU32   gpuMask;
3045     NvU32   gpuIndex = 0;
3046 
3047     gpumgrGetGpuAttachInfo(NULL, &gpuMask);
3048     pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
3049     while (pGpu)
3050     {
3051         bFoundLostGpu = bFoundLostGpu  || rcdProbeGpuPresent(pGpu, ip);
3052         pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
3053     }
3054     return bFoundLostGpu;
3055 }
3056 #endif // !defined(DEBUG) && !defined(QA_BUILD)
3057 #endif // (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
3058 
3059 void
3060 rcdbAddCrashedFalcon
3061 (
3062     Falcon *pFlcn
3063 )
3064 {
3065     OBJSYS *pSys = SYS_GET_INSTANCE();
3066     Journal *pRcDB = SYS_GET_RCDB(pSys);
3067 
3068     pRcDB->pCrashedFlcn = pFlcn;
3069 }
3070 
3071 
3072 /*
3073 ** _rcdbNocatCollectContext records the context of the GPU at the time the error is reported.
3074 **
3075 **  parameters:
3076 **      pGpu        pointer to GPU to be reported on.
3077 **      pContext    pointer to context structure to be filled in.
3078 **
3079 **   returns:
3080 **      NV_ERR_INVALID_ARGUMENT -- pContext is NULL
3081 */
3082 NV_STATUS
3083 _rcdbNocatCollectContext(OBJGPU *pGpu, Journal* pRcdb, NV2080_NOCAT_JOURNAL_GPU_STATE* pContext)
3084 {
3085     NV2080_NOCAT_JOURNAL_GPU_STATE* pContextCache = NULL;
3086     const char *pTag;
3087 
3088     if (pRcdb == NULL)
3089     {
3090         return NV_ERR_INVALID_ARGUMENT;
3091     }
3092 
3093     // determine which tag to use.
3094     if (pRcdb->nocatJournalDescriptor.tag[0] != '\0')
3095     {
3096         pTag = (char *)pRcdb->nocatJournalDescriptor.tag;
3097     }
3098     else
3099     {
3100         pTag = NOCAT_DEFAULT_TAG_VALUE_STR;
3101     }
3102     if (pGpu == NULL)
3103     {
3104         // w/o a GPU the only thing we can do is set the tag.
3105         if (pContext != NULL)
3106         {
3107             portMemSet(pContext, 0, sizeof(*pContext));
3108 
3109                 portStringCopy((char *)pContext->tag,
3110                     NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3111                     pTag,
3112                     portStringLength(pTag) + 1);
3113         }
3114         return NV_OK;
3115     }
3116 #if NOCAT_COLLECT_PERF
3117     pGpuCache = &(pGpu->nocatGpuCache);
3118 #endif
3119     pContextCache = &(pRcdb->nocatJournalDescriptor.nocatGpuState);
3120 
3121     // insert tag if we have one.
3122     portStringCopy((char *)pContextCache->tag,
3123         NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3124         pTag,
3125         portStringLength(pTag) + 1);
3126 
3127     if (!pContextCache->bValid)
3128     {
3129         pContextCache->deviceId = (NvU16)(DRF_VAL(_PCI, _DEVID, _DEVICE, pGpu->idInfo.PCIDeviceID));
3130         pContextCache->vendorId = (NvU16)(DRF_VAL(_PCI, _SUBID, _VENDOR, pGpu->idInfo.PCIDeviceID));
3131         pContextCache->subsystemVendor = (NvU16)(DRF_VAL(_PCI, _SUBID, _VENDOR, pGpu->idInfo.PCISubDeviceID));
3132         pContextCache->subsystemId = (NvU16)(DRF_VAL(_PCI, _SUBID, _DEVICE, pGpu->idInfo.PCISubDeviceID));
3133         pContextCache->revision = pGpu->idInfo.PCIRevisionID;
3134         pContextCache->type = pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_MOBILE);
3135         pContextCache->bMsHybrid = FLD_TEST_DRF(_JT_FUNC, _CAPS, _MSHYB_ENABLED, _TRUE,
3136             pGpu->acpiMethodData.jtMethodData.jtCaps);
3137 
3138         portStringCopy((char *)pContextCache->vbiosProject, NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3139             NOCAT_UNKNOWN_STR, portStringLength(NOCAT_UNKNOWN_STR) + 1);
3140 
3141         if (!osIsRaisedIRQL())
3142         {
3143             NV_STATUS status = pGpu->acpiMethodData.capsMethodData.status;
3144             if (status == NV_OK)
3145             {
3146                 pContextCache->bOptimus =
3147                     FLD_TEST_DRF(OP_FUNC, _OPTIMUSCAPS, _OPTIMUS_CAPABILITIES,
3148                         _DYNAMIC_POWER_CONTROL, pGpu->acpiMethodData.capsMethodData.optimusCaps);
3149             }
3150 
3151             pContextCache->bValid = NV_TRUE;
3152         }
3153     }
3154     if (pContext != NULL)
3155     {
3156         portMemSet(pContext, 0, sizeof(*pContext));
3157 
3158         *pContext = *pContextCache;
3159 
3160         pContext->bFullPower = gpuIsGpuFullPower(pGpu);
3161         pContext->bInGc6Reset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET);
3162         pContext->bInFullchipReset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET);
3163         pContext->bInSecBusReset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET);
3164     }
3165     return NV_OK;
3166 }
3167 
3168 /*
3169 **  _rcdbSetTdrReason translates the reason code to a string & puts that string
3170 **  in the provided buffer.
3171 **
3172 **  parameters:
3173 **      tdrReason       the reason code for the TDR
3174 **      pTdrReasonStr   pointer to the place to copy the reason string to
3175 **      maxLen          the size of the buffer pointed to in pTdrReasonStr.
3176 **
3177 */
3178 void _rcdbSetTdrReason
3179 (
3180     Journal            *pRcdb,
3181     NvU32               tdrReason,
3182     char               *pTdrReasonStr,
3183     NvU32               maxLen
3184 )
3185 {
3186     const char *pTmpStr;
3187 
3188     // validate inputs.
3189     if (pRcdb == NULL)
3190     {
3191         return;
3192     }
3193 
3194     // is there a string buffer & is it large enough to hold more than a NULL string
3195     if ((pTdrReasonStr == NULL) || (maxLen < 2))
3196     {
3197         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++;
3198         return;
3199     }
3200     switch (tdrReason)
3201     {
3202     case NV2080_CTRL_NOCAT_TDR_TYPE_NONE:
3203         pTmpStr = NOCAT_NA_STR;
3204         break;
3205     case NV2080_CTRL_NOCAT_TDR_TYPE_LEGACY:
3206         pTmpStr = NOCAT_LEGACY_STR;
3207         break;
3208     case NV2080_CTRL_NOCAT_TDR_TYPE_FULLCHIP:
3209         pTmpStr = NOCAT_FULLCHIP_TDR_STR;
3210         break;
3211     case NV2080_CTRL_NOCAT_TDR_TYPE_BUSRESET:
3212         pTmpStr = NOCAT_BUS_RESET_TDR_STR;
3213         break;
3214     case NV2080_CTRL_NOCAT_TDR_TYPE_GC6_RESET:
3215         pTmpStr = NOCAT_GC6_RESET_TDR_STR;
3216         break;
3217     case NV2080_CTRL_NOCAT_TDR_TYPE_SURPRISE_REMOVAL:
3218         pTmpStr = NOCAT_SURPRISE_REMOVAL_TDR_STR;
3219         break;
3220     case NV2080_CTRL_NOCAT_TDR_TYPE_UCODE_RESET:
3221         pTmpStr = NOCAT_UCODE_RESET_TDR_STR;
3222         break;
3223     default:
3224         pTmpStr = NOCAT_UNKNOWN_STR;
3225         break;
3226     }
3227     portStringCopy(pTdrReasonStr, maxLen,
3228         pTmpStr, portStringLength(pTmpStr) + 1);
3229 }
3230 
3231 /*
3232 ** _rcdbAllocNocatJournalRecord allocates a buffer entry from the Journal ring buffer
3233 **  for the specified type
3234 **
3235 **  parameters:
3236 **      pGpu            a pointer to the GPU object associated with the entry.
3237 **      pRcdb           a pointer toe the Journal that contains the ring buffers
3238 **      type            the record type to locate a buffer for.
3239 **
3240 **  returns a pointer to a record in the ring buffer, or NULL if a record could not be allocated.
3241 **
3242 **  notes:
3243 **      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3244 **      the lock should be held until access the buffer is completed.
3245 **      failure to do so can result in concurrency issues.
3246 **
3247 **      if successful, the buffer that is returned is cleared & an id assigned.
3248 */
3249 RM_NOCAT_JOURNAL_ENTRY* _rcdbAllocNocatJournalRecord
3250 (
3251     OBJGPU     *pGpu,
3252     OBJRCDB    *pRcdb,
3253     RmRCCommonJournal_RECORD   **ppCommon
3254 )
3255 {
3256     nocatQueueDescriptor   *pDesc = NULL;
3257     RmRCCommonJournal_RECORD* pCommon;
3258     RM_NOCAT_JOURNAL_ENTRY * pNocatEntry = NULL;
3259 
3260     // make sure someone has the lock.
3261     if (concurrentRingBufferAccess == 0)
3262     {
3263         return NULL;
3264     }
3265 
3266     pDesc = &pRcdb->nocatJournalDescriptor;
3267 
3268     // Get the next record from the appropriate nocat ring buffer.
3269     pCommon = _rcdbAllocRecFromRingBuffer(
3270         pGpu,
3271         pRcdb,
3272         RmNocatReport);
3273 
3274     if (pCommon != NULL)
3275     {
3276         // advance the pointer past the common header.
3277         pNocatEntry = (RM_NOCAT_JOURNAL_ENTRY*)(((NvU8*)pCommon) + sizeof(RmRCCommonJournal_RECORD));
3278 
3279         // clear the record & assign an id.
3280         portMemSet(pNocatEntry, 0, sizeof(*pNocatEntry));
3281         pNocatEntry->id = pDesc->nextRecordId++;
3282         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_ALLOCATED_IDX]++;
3283         portAtomicIncrementS32(&pNocatEntry->inUse);
3284     }
3285     else
3286     {
3287         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_ALLOC_FAILED_IDX]++;
3288     }
3289     if (ppCommon != NULL)
3290     {
3291         *ppCommon = pCommon;
3292     }
3293     return pNocatEntry;
3294 }
3295 
3296 /*
3297 ** _rcdbGetNocatJournalRecord returns a pointer to the requested record,
3298 **      or optionally the oldest record if the requested one is not available.
3299 **
3300 **  parameters:
3301 **      pRcdb           a pointer toe the Journal that contains the ring buffers
3302 **      id              id of the record we are looking for
3303 **      bExactMatch     indicates if we want an exact match, or the closest record.
3304 **      ppCommon        a pointer to a pointer that will hold the pointer to
3305 **                      the common part of the record.
3306 **                      this can be NULL
3307 **      ppReturnedNocatEntry
3308 **                      a pointer to a pointer that will hold the pointer to
3309 **                      the nocat part of the record
3310 **                      this can be NULL
3311 **
3312 **  notes:
3313 **      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3314 **      the lock should be held until access the buffer is completed.
3315 **      failure to do so can result in concurrency issues.
3316 */
3317 NV_STATUS
3318 _rcdbGetNocatJournalRecord
3319 (
3320     OBJRCDB            *pRcdb,
3321     NvU32               reqId,
3322     NvBool              bExactMatch,
3323     RmRCCommonJournal_RECORD
3324                       **ppReturnedCommon,
3325     RM_NOCAT_JOURNAL_ENTRY
3326                       **ppReturnedNocatEntry
3327 )
3328 {
3329     nocatQueueDescriptor     *pDesc;
3330     RmRCCommonJournal_RECORD *pCommon = NULL;
3331     RM_NOCAT_JOURNAL_ENTRY   *pNocatEntry = NULL;
3332     RING_BUFFER_LOG          *pRingBuffer = NULL;
3333     NvS32                     offset;
3334     NvS32                     idx;
3335 
3336     // make sure someone has the lock.
3337     if (concurrentRingBufferAccess == 0)
3338     {
3339         return NV_ERR_BUSY_RETRY;
3340     }
3341 
3342     // is there anything to do
3343     if ((ppReturnedCommon == NULL) && (ppReturnedNocatEntry == NULL))
3344     {
3345         return NV_OK;
3346     }
3347 
3348     // validate inputs.
3349     if (pRcdb == NULL)
3350     {
3351         return NV_ERR_INVALID_ARGUMENT;
3352     }
3353     pDesc = &pRcdb->nocatJournalDescriptor;
3354 
3355     // assume we will fail
3356     if (ppReturnedCommon != NULL)
3357     {
3358         *ppReturnedCommon = NULL;
3359     }
3360     if (ppReturnedNocatEntry != NULL)
3361     {
3362         *ppReturnedNocatEntry = NULL;
3363     }
3364 
3365     // if there is nothing in the buffer,
3366     // we can't return a record.
3367     if ((pDesc->nextRecordId - pDesc->nextReportedId) == 0)
3368     {
3369         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NO_RECORDS_IDX]++;
3370         return NV_ERR_OBJECT_NOT_FOUND;
3371     }
3372 
3373     // Find the ring buffer for the diag reports
3374     rcdbFindRingBufferForType(pRcdb, RmNocatReport, &pRingBuffer);
3375     if (pRingBuffer == NULL)
3376     {
3377         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_BUFFER_IDX]++;
3378         return NV_ERR_OBJECT_NOT_FOUND;
3379     }
3380     // determine how far back from the head our record should be.
3381     offset = pDesc->nextRecordId - reqId;
3382 
3383     // start of from the next record we will replace.
3384     // this will be the oldest buffer in the record,
3385     // or the next empty record, either way, we will wrap to the right one
3386     idx = pRingBuffer->headIndex;
3387 
3388     // is the requested record in the buffer?
3389     if ((0 <= offset) && ((NvU16)offset <= pRingBuffer->numEntries))
3390     {
3391         // back out the offset from the newest/empty record.
3392         idx += pRingBuffer->numEntries - offset;
3393         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_MATCH_FOUND_IDX]++;
3394     }
3395     else if (bExactMatch)
3396     {
3397         // the record is not in the buffer, & we weren't asked for the closest match.
3398         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NO_MATCH_IDX]++;
3399         return NV_ERR_OBJECT_NOT_FOUND;
3400     }
3401     else
3402     {
3403         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_CLOSEST_FOUND_IDX]++;
3404     }
3405     // wrap the idx to the current size of the buffer.
3406     idx %= pRingBuffer->numEntries;
3407 
3408     // get a pointer to the common record & the record from the buffer.
3409     pCommon = (RmRCCommonJournal_RECORD*)(((NvU8*)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcdb, RmNocatReport) * idx));
3410 
3411     // get a pointer to the data that follows the common header, that is the record data.
3412     pNocatEntry = (RM_NOCAT_JOURNAL_ENTRY*)(((NvU8*)pCommon) + sizeof(RmRCCommonJournal_RECORD));
3413     portAtomicIncrementS32(&pNocatEntry->inUse);
3414 
3415     // pass the record along
3416     if (ppReturnedCommon != NULL)
3417     {
3418         *ppReturnedCommon = pCommon;
3419     }
3420     if (ppReturnedNocatEntry != NULL)
3421     {
3422         *ppReturnedNocatEntry = pNocatEntry;
3423     }
3424     return NV_OK;
3425 }
3426 /*
3427 ** _rcdbGetNocatJournalRecord returns a pointer to the requested record,
3428 **      or optionally the oldest record if the requested one is not available.
3429 **
3430 **  parameters:
3431 **      pRcdb           a pointer toe the Journal that contains the ring buffers
3432 **      id              id of the record we are looking for
3433 **      bExactMatch     indicates if we want an exact match, or the closest record.
3434 **      ppCommon        a pointer to a pointer that will hold the pointer to
3435 **                      the common part of the record.
3436 **                      this can be NULL
3437 **      ppReturnedNocatEntry
3438 **                      a pointer to a pointer that will hold the pointer to
3439 **                      the nocat part of the record
3440 **                      this can be NULL
3441 **
3442 **  notes:
3443 **      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3444 **      the lock should be held until access the buffer is completed.
3445 **      failure to do so can result in concurrency issues.
3446 */
3447 NV_STATUS
3448 _rcdbReleaseNocatJournalRecord
3449 (
3450     RM_NOCAT_JOURNAL_ENTRY  *pNocatEntry
3451 )
3452 {
3453     if (pNocatEntry == NULL)
3454     {
3455         return NV_ERR_INVALID_ARGUMENT;
3456     }
3457     if (portAtomicDecrementS32(&pNocatEntry->inUse) != 0)
3458     {
3459         return NV_ERR_BUSY_RETRY;
3460     }
3461     return NV_OK;
3462 }
3463 
3464 /*
3465 ** _rcdbGetNewestNocatJournalRecordForType returns a pointer to the newest record for the
3466 **  specified type if there is one.
3467 **
3468 **  parameters:
3469 **      pRcdb           a pointer toe the Journal that contains the ring buffers
3470 **      type            type of record we want.
3471 **      ppCommon        a pointer to a pointer that will hold the pointer to
3472 **                      the common part of the record.
3473 **                      this can be NULL
3474 **      ppCommon        a pointer to a pointer that will hold the pointer to
3475 **                      the nocat part of the record
3476 **                      this can be NULL
3477 **
3478 **  notes:
3479 **      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3480 **      the lock should be held until access the buffer is completed.
3481 **      failure to do so can result in concurrency issues.
3482 */
3483 NV_STATUS
3484 _rcdbGetNewestNocatJournalRecordForType
3485 (
3486     OBJRCDB            *pRcdb,
3487     NvU32               type,
3488     RmRCCommonJournal_RECORD
3489                       **ppReturnedCommon,
3490     RM_NOCAT_JOURNAL_ENTRY
3491                       **ppReturnedNocatEntry
3492 )
3493 {
3494     if (type >= NV2080_NOCAT_JOURNAL_REC_TYPE_COUNT)
3495     {
3496         // we failed
3497         if (ppReturnedCommon != NULL)
3498         {
3499             *ppReturnedCommon = NULL;
3500         }
3501         if (ppReturnedNocatEntry != NULL)
3502         {
3503             *ppReturnedNocatEntry = NULL;
3504         }
3505         return NV_ERR_OBJECT_NOT_FOUND;
3506     }
3507     return _rcdbGetNocatJournalRecord(pRcdb, pRcdb->nocatJournalDescriptor.lastRecordId[type], NV_TRUE,
3508         ppReturnedCommon, ppReturnedNocatEntry);
3509 }
3510 
3511 /*
3512 ** rcdbReportNextNocatJournalEntry fills in the provided Nocat Journal record with the next record
3513 ** to be reported, then updates the last reported id.
3514 **
3515 **  parameters:
3516 **      pReturnedNocatEntry a pointer to the buffer where the journal record will be transferred to
3517 **
3518 **  returns:
3519 **      NV_OK -- the record was successfully updated with the next record to report.
3520 **      NV_ERR_INVALID_ARGUMENT -- the provided pointer is NULL
3521 **      NV_ERR_OBJECT_NOT_FOUND -- we could not locate a record to report.
3522 **
3523 **  notes:
3524 **      we are transferring the record to the target location here instead of passing a pointer
3525 **      to insure the data is transferred while we hold the concurrentRingBufferAccess lock.
3526 **      failure to do so can result in concurrency issues.
3527 **
3528 **      priority is determined by the record journal queue values. the lower value has
3529 **      higher priority.
3530 **
3531 **      now that we have moved from a single entry, to a queue, we need to
3532 **      consume the entry once we report it
3533 **
3534 */
3535 NV_STATUS
3536 rcdbReportNextNocatJournalEntry
3537 (
3538     NV2080_NOCAT_JOURNAL_RECORD
3539                        *pReturnedNocatEntry
3540 )
3541 {
3542     OBJSYS                   *pSys = SYS_GET_INSTANCE();
3543     Journal                  *pRcdb = SYS_GET_RCDB(pSys);
3544     NV_STATUS                 status = NV_ERR_OBJECT_NOT_FOUND;
3545     nocatQueueDescriptor     *pDesc;
3546     RmRCCommonJournal_RECORD *pCommon = NULL;
3547     RM_NOCAT_JOURNAL_ENTRY   *pNocatEntry = NULL;
3548 
3549     // validate inputs.
3550     if (pRcdb == NULL)
3551     {
3552         return NV_ERR_INVALID_ARGUMENT;
3553     }
3554     pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_REQUESTED_IDX]++;
3555 
3556     if (pReturnedNocatEntry == NULL)
3557     {
3558         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++;
3559         return NV_ERR_INVALID_ARGUMENT;
3560     }
3561     portMemSet(pReturnedNocatEntry, 0, sizeof(*pReturnedNocatEntry));
3562 
3563     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3564     {
3565         pDesc = &pRcdb->nocatJournalDescriptor;
3566         _rcdbGetNocatJournalRecord(pRcdb,
3567             pDesc->nextReportedId, NV_FALSE,
3568             &pCommon, &pNocatEntry);
3569         if ((pCommon != NULL) && (pNocatEntry != NULL))
3570         {
3571             // we have a record, push it into the return buffer
3572             pReturnedNocatEntry->GPUTag = pCommon->GPUTag;
3573 
3574             // copy over the data into the supplied buffer.
3575             pReturnedNocatEntry->loadAddress = pDesc->loadAddress;
3576             pReturnedNocatEntry->timeStamp = pCommon->timeStamp;
3577             pReturnedNocatEntry->stateMask = pCommon->stateMask;
3578             pReturnedNocatEntry->nocatGpuState = pNocatEntry->nocatGpuState;
3579             pReturnedNocatEntry->nocatJournalEntry = pNocatEntry->nocatJournalEntry;
3580 
3581             // check if we lost any records.
3582             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_DROPPED_IDX] +=
3583                 pNocatEntry->id - pDesc->nextReportedId;
3584 
3585             // update the NocatJournalNextReportedId
3586             pDesc->nextReportedId = pNocatEntry->id + 1;
3587             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_REPORTED_IDX]++;
3588 
3589             _rcdbReleaseNocatJournalRecord(pNocatEntry);
3590             status = NV_OK;
3591 
3592         }
3593     }
3594     else
3595     {
3596         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
3597         status = NV_ERR_BUSY_RETRY;
3598     }
3599     portAtomicDecrementS32(&concurrentRingBufferAccess);
3600     if ((pRcdb->nocatJournalDescriptor.lockTimestamp != 0) && (rcdbGetNocatOutstandingCount(pRcdb) == 0))
3601     {
3602         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_UNLOCKED_IDX]++;
3603         pRcdb->nocatJournalDescriptor.lockTimestamp = 0;
3604     }
3605     return status;
3606 }
3607 
3608 /*
3609 ** rcdbGetNocatOutstandingCount returns the number of NOCAT events that have
3610 ** been recorded since the last reported record.
3611 **
3612 **  parameters:
3613 **      pRcdb -- a pointer to the Journal object.
3614 **
3615 **  returns:
3616 **      number of NOCAT events that have been recorded since the last reported record.
3617 **      or NV_U32_MAX if a NULL journal object pointer is provided.
3618 **
3619 **  notes:
3620 **      the returned count includes records that have been dropped due to wrapping.
3621 **
3622 */
3623 NvU32
3624 rcdbGetNocatOutstandingCount(Journal *pRcdb)
3625 {
3626     NvU32 count = NV_U32_MAX;
3627     if (pRcdb != NULL)
3628     {
3629         if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3630         {
3631             count = pRcdb->nocatJournalDescriptor.nextRecordId
3632                 - pRcdb->nocatJournalDescriptor.nextReportedId;
3633         }
3634         portAtomicDecrementS32(&concurrentRingBufferAccess);
3635     }
3636     return count;
3637 }
3638 
3639 /*
3640 ** _rcdbSendNocatJournalNotification sends an ETW Notification that a NOCAT Journal record has been posted.
3641 **
3642 **  parameters:
3643 **      pGpu -- a pointer to the GPU object associated with teh new entry
3644 **              (may be NULL)
3645 **      pRcdb -- a pointer to the Journal object NOCAT is using.
3646 **      posted -- the number of records posted since the last record that was retrieved.
3647 **      pCommon -- a pointer to the common record header associated with the record.
3648 **      type -- the record type
3649 **
3650 **  returns:
3651 **      NV_OK -- the call to post the record was made.
3652 **          note that the call to post the record does not return a status,
3653 **          so we do not know if the call was successful.
3654 **      NV_ERR_INVALID_ARGUMENT -- one of the required pointers is NULL
3655 **
3656 */
3657 NV_STATUS
3658 _rcdbSendNocatJournalNotification
3659 (
3660     OBJGPU *pGpu,
3661     Journal *pRcdb,
3662     NvU32    posted,
3663     RmRCCommonJournal_RECORD *pCommon,      // todo: pass in timestamp instead of common.
3664     NvU32 type
3665 )
3666 {
3667     if ((pCommon == NULL) || (pRcdb == NULL))
3668     {
3669         return NV_ERR_INVALID_ARGUMENT;
3670     }
3671     RMTRACE_NOCAT(_REPORT_PENDING, (pGpu ? pGpu->gpuId : RMTRACE_UNKNOWN_GPUID),
3672         RmNocatReport,
3673         posted,
3674         type,
3675         rcdbGetNocatOutstandingCount(pRcdb),
3676         pCommon->timeStamp);
3677 
3678     // count the number of notifications.
3679     pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NOTIFICATIONS_IDX]++;
3680     return NV_OK;
3681 }
3682 
3683 /*
3684 ** rcdbInitNocatGpuCache_IMPL initializes a per GPU cache held in the GPU object to be used by NOCAT
3685 **
3686 **  parameters:
3687 **      pGpu -- a pointer to the GPU Object the containing the cache
3688 **
3689 **  notes:
3690 **      this function:
3691 **      * caches the driver load address
3692 **      * allocates a small block of memory in the frame buffer for testing
3693 **      * initializes the GPU context cache
3694 **
3695 */
3696 void rcdbInitNocatGpuCache_IMPL(OBJGPU *pGpu)
3697 {
3698     OS_DRIVER_BLOCK         driverBlock;
3699     OBJSYS                 *pSys = SYS_GET_INSTANCE();
3700     Journal                *pRcdb = SYS_GET_RCDB(pSys);
3701 #if NOCAT_PROBE_FB_MEMORY
3702     NvU8                   *pCpuPtr;
3703     NV_STATUS              status;
3704 #endif
3705 
3706     if (pGpu == NULL)
3707     {
3708         return;
3709     }
3710     portMemSet(&driverBlock, 0x00, sizeof(driverBlock));
3711     if (osGetDriverBlock(pGpu->pOsGpuInfo, &driverBlock) == NV_OK)
3712     {
3713         pRcdb->nocatJournalDescriptor.loadAddress = (NvU64)driverBlock.driverStart;
3714     }
3715 
3716 #if NOCAT_PROBE_FB_MEMORY
3717     // Allocate some memory for virtual BAR2 testing
3718     if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_ALL_INST_IN_SYSMEM) && !IsAMODEL(pGpu))
3719     {
3720         memdescCreateExisting(&pGpu->nocatGpuCache.fbTestMemDesc,
3721             pGpu, NOCAT_FBSIZETESTED, ADDR_FBMEM, NV_MEMORY_UNCACHED, MEMDESC_FLAGS_NONE);
3722         memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_102,
3723                         (&pGpu->nocatGpuCache.fbTestMemDesc));
3724         if (status != NV_OK)
3725         {
3726             NV_PRINTF(LEVEL_ERROR, "Could not allocate vidmem for NOCAT bar2 testing\n");
3727             return;
3728         }
3729         pCpuPtr = kbusMapRmAperture_HAL(pGpu, &pGpu->nocatGpuCache.fbTestMemDesc);
3730         if (pCpuPtr == NULL)
3731         {
3732             memdescFree(&pGpu->nocatGpuCache.fbTestMemDesc);
3733             memdescDestroy(&pGpu->nocatGpuCache.fbTestMemDesc);
3734             pGpu->nocatGpuCache.pCpuPtr = NULL;
3735             return;
3736         }
3737         pGpu->nocatGpuCache.pCpuPtr = pCpuPtr;
3738     }
3739 #endif
3740     // initialize the context cache
3741     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3742     {
3743         _rcdbNocatCollectContext(pGpu, pRcdb, NULL);
3744     }
3745     portAtomicDecrementS32(&concurrentRingBufferAccess);
3746 
3747     return;
3748 }
3749 
3750 /*
3751 ** rcdbCleanupNocatGpuCache_IMPL returns per GPU resources used by NOCAT.
3752 **
3753 **  parameters:
3754 **      pGpu -- a pointer to the GPU Object the containing the cache
3755 **
3756 **  notes:
3757 **      This will free up the FB test window if allocated, and clear out the cache
3758 **
3759 */
3760 void rcdbCleanupNocatGpuCache_IMPL(OBJGPU *pGpu)
3761 {
3762 #if NOCAT_PROBE_FB_MEMORY
3763     if (pGpu == NULL)
3764     {
3765         return;
3766     }
3767     if (pGpu->nocatGpuCache.pCpuPtr != NULL)
3768     {
3769         kbusUnmapRmApertureWithFlags_HAL(pGpu, &pGpu->nocatGpuCache.fbTestMemDesc,
3770             &pGpu->nocatGpuCache.pCpuPtr, TRANSFER_FLAGS_NONE);
3771         memdescFree(&pGpu->nocatGpuCache.fbTestMemDesc);
3772         memdescDestroy(&pGpu->nocatGpuCache.fbTestMemDesc);
3773     }
3774     portMemSet(&pGpu->nocatGpuCache, 0, sizeof(pGpu->nocatGpuCache));
3775 #endif
3776 
3777     return;
3778 }
3779 
3780 
3781 
3782 /*
3783 ** rcdbNocatInsertNocatError records a reported NOCAT error
3784 **
3785 **  parameters:
3786 **      pGpu        Pointer to GPU associated with the error
3787 **                  may be NULL if there is no GPU associated with the error
3788 **                  if NULL the primary GPU is used
3789 **      pNewEntry   A pointer to a structure that contains all the available data for the report
3790 */
3791 NvU32
3792 rcdbNocatInsertNocatError(
3793     OBJGPU *pGpu,
3794     NOCAT_JOURNAL_PARAMS *pNewEntry
3795 )
3796 {
3797     OBJSYS                     *pSys = SYS_GET_INSTANCE();
3798     Journal                    *pRcdb = SYS_GET_RCDB(pSys);
3799 #if(NOCAT_PROBE_FB_MEMORY)
3800     NvBool                      bCheckFBState = NV_FALSE;
3801 #endif
3802     RmRCCommonJournal_RECORD   *pCommon = NULL;
3803     RM_NOCAT_JOURNAL_ENTRY     *pNocatEntry = NULL;
3804     NvU32                       id = INVALID_RCDB_NOCAT_ID;
3805     const char                 *pSource = NULL;
3806     NvU32                       diagBufferLen = 0;
3807     const char                 *pFaultingEngine = NULL;
3808     NvBool                      postRecord;
3809     // validate inputs.
3810     if (pRcdb == NULL)
3811     {
3812         return NV_ERR_INVALID_ARGUMENT;
3813     }
3814     pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_REQ_IDX]++;
3815     if (pNewEntry == NULL)
3816     {
3817         pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++;
3818         return 0;
3819     }
3820     // assign a timestamp if none was provided
3821     if (pNewEntry->timestamp == 0)
3822     {
3823         pNewEntry->timestamp = osGetTimestamp();
3824     }
3825 
3826     // initially set postRecord based on the current state of the lock;
3827     postRecord = pRcdb->nocatJournalDescriptor.lockTimestamp == 0;
3828 
3829     // perform any record type specific setup
3830     switch (pNewEntry->recType)
3831     {
3832     case NV2080_NOCAT_JOURNAL_REC_TYPE_BUGCHECK:
3833 #if(NOCAT_PROBE_FB_MEMORY)
3834         bCheckFBState = NV_TRUE;
3835 #endif
3836         // fall thru
3837 
3838     case NV2080_NOCAT_JOURNAL_REC_TYPE_TDR:
3839         // lock the journal so we don't wrap over the record we are inserting.
3840         if (pRcdb->nocatJournalDescriptor.lockTimestamp == 0)
3841         {
3842             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_LOCKED_IDX]++;
3843         }
3844         else
3845         {
3846             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_LOCK_UPDATED_IDX]++;
3847         }
3848 
3849         pRcdb->nocatJournalDescriptor.lockTimestamp = pNewEntry->timestamp;
3850         postRecord = NV_TRUE;
3851         break;
3852 
3853     case NV2080_NOCAT_JOURNAL_REC_TYPE_RC:
3854 #if(NOCAT_PROBE_FB_MEMORY)
3855         bCheckFBState = NV_TRUE;
3856 #endif
3857         // set the source
3858         pSource = "RC Error";
3859         break;
3860 
3861     case NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT:
3862         // set the source
3863         pSource = "ASSERT";
3864         break;
3865 
3866     case NV2080_NOCAT_JOURNAL_REC_TYPE_ENGINE:
3867         break;
3868 
3869     case NV2080_NOCAT_JOURNAL_REC_TYPE_UNKNOWN:
3870     default:
3871         return 0;
3872         break;
3873     }
3874     // check if we should post the record when locked.
3875     if (!postRecord)
3876     {
3877         if ((NvS64)(pNewEntry->timestamp - pRcdb->nocatJournalDescriptor.lockTimestamp) < 0)
3878         {
3879             // the record predates the lock, so it's Grandfathered in.
3880             postRecord = NV_TRUE;
3881             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_GRANDFATHERED_RECORD_IDX]++;
3882         }
3883         else
3884         {
3885             // we are dropping the record, count that.
3886             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_LOCKED_OUT_IDX]++;
3887         }
3888     }
3889     if (postRecord)
3890     {
3891         // is the buffer available?
3892         if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3893         {
3894             // start recording this new record by allocating a record from the buffer.
3895             pNocatEntry = _rcdbAllocNocatJournalRecord(pGpu, pRcdb, &pCommon);
3896             if (pNocatEntry != NULL)
3897             {
3898                 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECTED_IDX]++;
3899 
3900                 // update the time stamp to the one supplied.
3901                 pCommon->timeStamp = pNewEntry->timestamp;
3902 
3903                 // save the record Id for the type.
3904                 pRcdb->nocatJournalDescriptor.lastRecordId[pNewEntry->recType] =
3905                     pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ANY] =
3906                     pRcdb->nocatJournalDescriptor.nextRecordId - 1;
3907 
3908                 // set the type.
3909                 pNocatEntry->nocatJournalEntry.recType = pNewEntry->recType;
3910 
3911                 // set bugcheck
3912                 pNocatEntry->nocatJournalEntry.bugcheck = pNewEntry->bugcheck;
3913 
3914                 // get context
3915                 _rcdbNocatCollectContext(pGpu, pRcdb, &(pNocatEntry->nocatGpuState));
3916 
3917 #if(NOCAT_PROBE_FB_MEMORY)
3918                 if ((bCheckFBState)
3919                     && (pGpu != NULL)
3920                     && (pGpu->nocatGpuCache.pCpuPtr != NULL)
3921                     // If using Coherent CPU mapping instead of BAR2 do not call VerifyBar2
3922                     && !pGpu->getProperty(pGpu, PDB_PROP_GPU_COHERENT_CPU_MAPPING))
3923                 {
3924                     switch (kbusVerifyBar2_HAL(pGpu, GPU_GET_KERNEL_BUS(pGpu),
3925                         &pGpu->nocatGpuCache.fbTestMemDesc, pGpu->nocatGpuCache.pCpuPtr, 0, NOCAT_FBSIZETESTED))
3926                     {
3927                     case NV_OK:                     // everything passed
3928                         break;
3929 
3930                     case NV_ERR_MEMORY_ERROR:       // BAR 0 failed & BAR 2 was not checked, or BAR 2 failed
3931                         // for now we don't know which BAR failed, so mark both.
3932                         // but only one BAR failed.
3933                         // (if BAR 0 Failed, BAR 2 was not checked)
3934                         pCommon->stateMask |=
3935                             NV_RM_JOURNAL_STATE_MASK_VIDMEM_FAILED_BAR0
3936                             | NV_RM_JOURNAL_STATE_MASK_VIDMEM_FAILED_BAR2;
3937                         break;
3938 
3939                     default:                        // some other processing error cause us to not test the BAR
3940                         break;
3941                     }
3942                 }
3943 #endif
3944                 // is there a valid string for source?
3945                 // (non NULL ptr & more than just a termination)
3946                 if ((pNewEntry->pSource != NULL) && (pNewEntry->pSource[0] != '\0'))
3947                 {
3948                     // yes, use that.
3949                     pSource = pNewEntry->pSource;
3950                 }
3951                 // the caller did not supply a source,
3952                 // did we set a default source based on record type?
3953                 else if (pSource == NULL)
3954                 {
3955                     // no, supply the unknown string for source.
3956                     pSource = NOCAT_UNKNOWN_STR;
3957                 }
3958                 portStringCopy((char*)pNocatEntry->nocatJournalEntry.source,
3959                     NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3960                     pSource,
3961                     portStringLength(pSource) + 1);
3962 
3963                 pNocatEntry->nocatJournalEntry.subsystem = pNewEntry->subsystem;
3964                 pNocatEntry->nocatJournalEntry.errorCode = pNewEntry->errorCode;
3965 
3966                 if ((pNewEntry->pDiagBuffer != NULL) && (pNewEntry->diagBufferLen != 0))
3967                 {
3968                     // checking length here as we don't want portMemCopy to assert
3969                     if (pNewEntry->diagBufferLen < NV_ARRAY_ELEMENTS(pNocatEntry->nocatJournalEntry.diagBuffer))
3970                     {
3971                         diagBufferLen = pNewEntry->diagBufferLen;
3972                     }
3973                     else
3974                     {
3975                         // make best effort
3976                         diagBufferLen = NV_ARRAY_ELEMENTS(pNocatEntry->nocatJournalEntry.diagBuffer);
3977                     }
3978                     portMemCopy(pNocatEntry->nocatJournalEntry.diagBuffer,
3979                         sizeof(pNocatEntry->nocatJournalEntry.diagBuffer),
3980                         pNewEntry->pDiagBuffer, diagBufferLen);
3981                 }
3982                 pNocatEntry->nocatJournalEntry.diagBufferLen = diagBufferLen;
3983 
3984                 pFaultingEngine = pNewEntry->pFaultingEngine != NULL ?
3985                     pNewEntry->pFaultingEngine : NOCAT_UNKNOWN_STR;
3986 
3987                 portStringCopy((char*)pNocatEntry->nocatJournalEntry.faultingEngine,
3988                     NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3989                     pFaultingEngine, portStringLength(pFaultingEngine) + 1);
3990 
3991                 _rcdbSetTdrReason(pRcdb, pNewEntry->tdrReason,
3992                     (char*)pNocatEntry->nocatJournalEntry.tdrReason,
3993                     sizeof(pNocatEntry->nocatJournalEntry.tdrReason));
3994 
3995                 _rcdbReleaseNocatJournalRecord(pNocatEntry);
3996             }
3997             else
3998             {
3999                 // record was not allocated, bail.
4000                 postRecord = NV_FALSE;
4001                 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_FAILED_IDX]++;
4002             }
4003         }
4004         else
4005         {
4006             // we are busy, so we can't insert the record, count the record as dropped & count the busy.
4007             postRecord = NV_FALSE;
4008             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
4009         }
4010         portAtomicDecrementS32(&concurrentRingBufferAccess);
4011     }
4012 
4013     // no matter what happened, trigger the event to indicate a record was processed.
4014     _rcdbSendNocatJournalNotification(pGpu, pRcdb, postRecord, pCommon, pNewEntry->recType);
4015 
4016     return id;
4017 }
4018 /*
4019 ** rcdbNocatInsertBugcheck is the interface to record a bugcheck NOCAT report
4020 **
4021 **  parameters:
4022 **      deviceInstance  The instance of the GPU associated with the bugcheck.
4023 **      bugcheck        The bugcheck number
4024 */
4025 NvU32
4026 rcdbNocatInsertBugcheck
4027 (
4028     NvU32               deviceInstance,
4029     NvU32               bugCheckCode)
4030 {
4031     NOCAT_JOURNAL_PARAMS newEntry;
4032 
4033     portMemSet(&newEntry, 0, sizeof(newEntry));
4034     newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_BUGCHECK;
4035     newEntry.bugcheck = bugCheckCode;
4036     newEntry.pSource = "OS";
4037     newEntry.errorCode = bugCheckCode;
4038     return rcdbNocatInsertNocatError(gpumgrGetGpu(deviceInstance), &newEntry);
4039 }
4040 
4041 /*
4042 ** rcdbNocatInitEngineErrorEvent initializes a parameter structure for an engine error event
4043 **
4044 **  parameters:
4045 **      pNewEntry       Pointer to event parameter structure to be initialized
4046 */
4047 NV_STATUS
4048 rcdbNocatInitEngineErrorEvent
4049 (
4050     NOCAT_JOURNAL_PARAMS *pNewEntry
4051 )
4052 {
4053     if (pNewEntry == NULL)
4054     {
4055         return NV_ERR_INVALID_ARGUMENT;
4056     }
4057     portMemSet(pNewEntry, 0, sizeof(*pNewEntry));
4058     pNewEntry->recType = NV2080_NOCAT_JOURNAL_REC_TYPE_ENGINE;
4059     return NV_OK;
4060 }
4061 
4062 /*
4063 ** rcdbNocatInsertEngineError records a reported NOCAT error from an engine,
4064 **
4065 **  parameters:
4066 **      pGpu            Pointer to GPU associated with the error
4067 **                      may be NULL if there is no GPU associated with the error
4068 **                      if NULL the primary GPU is used
4069 **      pSource         A string indicating the reporting source of the error.
4070 **                      if NULL, a default values will be used
4071 **      subsystem       The optional subsystem ID used by the source to identify the error
4072 **      errorCode       The error code
4073 **      pDiagBuffer     A pointer to the diagnostic buffer associated with the error
4074 **                      may be NULL
4075 **      diagBufferLen   The size of the diagnostic buffer
4076 **                      if the size exceeds the supported diagBuffer size, the buffer contents will be truncated to fit.
4077 */
4078 NvU32
4079 rcdbNocatInsertEngineError(
4080     OBJGPU             *pGpu,
4081     const char         *pSource,
4082     NvU32               subsystem,
4083     NvU64               errorCode,
4084     NvU8               *pDiagBuffer,
4085     NvU32               diagBufferLen
4086 )
4087 {
4088     NOCAT_JOURNAL_PARAMS newEntry;
4089 
4090     rcdbNocatInitEngineErrorEvent(&newEntry);
4091     newEntry.pSource = pSource;
4092     newEntry.subsystem = subsystem;
4093     newEntry.errorCode = errorCode;
4094     newEntry.pDiagBuffer = pDiagBuffer;
4095     newEntry.diagBufferLen = diagBufferLen;
4096     return rcdbNocatInsertNocatError(pGpu, &newEntry);
4097 }
4098 
4099 /*
4100 ** rcdbNocatInsertTDRError records an TDR error,
4101 **
4102 **  parameters:
4103 **      pGpu            Pointer to GPU associated with the error
4104 **                      may be NULL if there is no GPU associated with the error
4105 **                      if NULL the primary GPU is used
4106 **      pSource         A string indicating the reporting source of the error.
4107 **                      if NULL, a default values will be used
4108 **      subsystem       The optional subsystem ID used by the source to identify the error
4109 **      errorCode       The error code
4110 **      TDRBucket       The TDR bucket
4111 **      pDiagBuffer     A pointer to the diagnostic buffer associated with the error
4112 **                      may be NULL
4113 **      diagBufferLen   The size of the diagnostic buffer
4114 **                      if the size exceeds the supported diagBuffer size,
4115 **                      the buffer contents will be truncated to fit.
4116 **      tdrReason       A reason code for the TDR
4117 **      pFaultingApp    A pointer to the faulting app name if known
4118 */
4119 NvU32
4120 rcdbNocatInsertTDRError
4121 (
4122     OBJGPU             *pGpu,
4123     const char         *pSource,
4124     NvU32               subsystem,
4125     NvU64               errorCode,
4126     NvU32               TdrBucket,
4127     NvU8               *pDiagBuffer,
4128     NvU32               diagBufferLen,
4129     NvU32               tdrReason,
4130     const char         *pFaultingEngine
4131 )
4132 {
4133     NOCAT_JOURNAL_PARAMS newEntry;
4134 
4135     portMemSet(&newEntry, 0, sizeof(newEntry));
4136     newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_TDR;
4137     newEntry.pSource = pSource;
4138     newEntry.subsystem = subsystem;
4139     newEntry.errorCode = errorCode;
4140     newEntry.pDiagBuffer = pDiagBuffer;
4141     newEntry.diagBufferLen = diagBufferLen;
4142     newEntry.pFaultingEngine = pFaultingEngine;
4143     return rcdbNocatInsertNocatError(pGpu, &newEntry);
4144 }
4145 NV_STATUS
4146 rcdbNocatInitRCErrorEvent
4147 (
4148     NOCAT_JOURNAL_PARAMS *pNewEntry
4149 )
4150 {
4151     if (pNewEntry == NULL)
4152     {
4153         return NV_ERR_INVALID_ARGUMENT;
4154     }
4155     portMemSet(pNewEntry, 0, sizeof(*pNewEntry));
4156     pNewEntry->recType = NV2080_NOCAT_JOURNAL_REC_TYPE_RC;
4157     pNewEntry->pSource = "RC ERROR";
4158     return NV_OK;
4159 }
4160 
4161 /*
4162 ** _rcdbNocatReportAssert adds an assert record.
4163 **
4164 **  parameters:
4165 **      pGpu        Pointer to GPU associated with the error
4166 **                  may be NULL
4167 **      pAssertRec  A pointer to the assert to report
4168 */
4169 NV_STATUS
4170 _rcdbNocatReportAssert
4171 (
4172     OBJGPU                  *pGpu,
4173     RmRCCommonAssert_RECORD *pAssertRec
4174 )
4175 {
4176     OBJSYS                 *pSys = SYS_GET_INSTANCE();
4177     Journal                *pRcdb = SYS_GET_RCDB(pSys);
4178     NOCAT_JOURNAL_PARAMS    newEntry;
4179     RM_NOCAT_ASSERT_DIAG_BUFFER    diagBuffer;
4180     RM_NOCAT_ASSERT_DIAG_BUFFER   *pDiagData;
4181     NvU32                   idx;
4182     RM_NOCAT_JOURNAL_ENTRY *pNocatEntry = NULL;
4183     NvU32                   gpuCnt= 0;
4184     OBJGPU                  *pTmpGpu = gpumgrGetGpu(0);
4185 
4186     // validate inputs.
4187     if ((pRcdb == NULL) || (pAssertRec == NULL))
4188     {
4189         return NV_ERR_INVALID_ARGUMENT;
4190     }
4191     if (pGpu == NULL)
4192     {
4193         // we don't have a GPU, if there is only 1 GPU,
4194         // we can safely use it for logging this assert
4195         gpumgrGetGpuAttachInfo(&gpuCnt, NULL);
4196         if (gpuCnt == 1)
4197         {
4198             pGpu = pTmpGpu;
4199         }
4200     }
4201 
4202     // start off assuming we will be recording a report
4203     portMemSet(&newEntry, 0, sizeof(newEntry));
4204     newEntry.timestamp = pAssertRec->common.timeStamp;
4205     newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT;
4206     newEntry.pSource = "ASSERT";
4207 
4208     // save the assert point as the error code.
4209     newEntry.errorCode =
4210         (NvU32)((pAssertRec->breakpointAddrHint - pRcdb->nocatJournalDescriptor.loadAddress)
4211             & 0xffffffff);
4212 
4213     // put the line number in the upper 32 bits.
4214     newEntry.errorCode |= ((NvU64)pAssertRec->lineNum) << 32;
4215 
4216     for (idx = 0; idx < NV_ARRAY_ELEMENTS(pAssertRec->callStack); idx++)
4217     {
4218         diagBuffer.callStack[idx] =
4219             (NvU32)((pAssertRec->callStack[idx] - pRcdb->nocatJournalDescriptor.loadAddress)
4220                 & 0xffffffff);
4221     }
4222     // initialize count
4223     diagBuffer.count = 1;
4224 
4225     // setup the pointer to our diag buffer & its length
4226     newEntry.pDiagBuffer = (NvU8 *)&diagBuffer;
4227 
4228     newEntry.diagBufferLen = sizeof(diagBuffer);
4229 
4230     // is the last thing we logged an assert, & is this the same assert?
4231     if ((pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT]
4232         == pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ANY])
4233         && (0 == portMemCmp(&pRcdb->nocatJournalDescriptor.lastAssertData,
4234             diagBuffer.callStack,                                       // same stack
4235             sizeof(diagBuffer.callStack))))
4236     {
4237         // it is the same as the last assert we logged. so don't log it again.
4238         // but see if we can increment the counter in an unreported assert.
4239         // check if the last record is also an assert
4240         if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
4241         {
4242             // get the last record from the buffer
4243             _rcdbGetNewestNocatJournalRecordForType(pRcdb,
4244                 NV2080_NOCAT_JOURNAL_REC_TYPE_ANY,
4245                 NULL, &pNocatEntry);
4246             if (pNocatEntry != NULL)
4247             {
4248                 // is it an assert?
4249                 if (pNocatEntry->nocatJournalEntry.recType == (NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT))
4250                 {
4251                     // increment the count
4252                     pDiagData = (RM_NOCAT_ASSERT_DIAG_BUFFER*)&pNocatEntry->nocatJournalEntry.diagBuffer;
4253                     pDiagData->count++;
4254                 }
4255                 _rcdbReleaseNocatJournalRecord(pNocatEntry);
4256 
4257             }
4258         }
4259         else
4260         {
4261             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
4262         }
4263         portAtomicDecrementS32(&concurrentRingBufferAccess);
4264     }
4265     else
4266     {
4267         // we are logging this assert, save off the stack so we can use it to
4268         // compare against future asserts.
4269         portMemCopy(&pRcdb->nocatJournalDescriptor.lastAssertData,
4270             sizeof(pRcdb->nocatJournalDescriptor.lastAssertData),
4271             &diagBuffer, sizeof(diagBuffer));
4272         rcdbNocatInsertNocatError(pGpu, &newEntry);
4273     }
4274 
4275     return NV_OK;
4276 }
4277 
4278 /*
4279 ** rcdbNocatInsertRMCDErrorEvent creates an event from an RMCD error block
4280 **
4281 **  parameters:
4282 **  pGpu        pointer to GPU object associated with the error
4283 **  recType     the type of event to create
4284 **  pSource     a pointer to the source string
4285 **  subsystem   the subsystem associated with the event.
4286 **  errorCode   error code for the event
4287 **  pFault      pointer to a faulting component identifier associated with the error
4288 */
4289 NvU32 rcdbNocatInsertRMCDErrorEvent(OBJGPU *pGpu, NvU32 recType,
4290     const char *pSource, NvU32 subsystem, NvU64 errorCode, const char *pFault,
4291     RMCD_ERROR_BLOCK *pRcdError)
4292 {
4293     NOCAT_JOURNAL_PARAMS    newEntry;
4294 
4295     portMemSet(&newEntry, 0, sizeof(newEntry));
4296     newEntry.recType = recType;
4297     newEntry.pSource = pSource;
4298     newEntry.subsystem = subsystem;
4299     newEntry.errorCode = errorCode;
4300     newEntry.pFaultingEngine = pFault;
4301     if (pRcdError != NULL)
4302     {
4303         newEntry.pDiagBuffer = (NvU8 * )pRcdError->pBlock;
4304         newEntry.diagBufferLen = pRcdError->blockSize;
4305     }
4306     return rcdbNocatInsertNocatError(pGpu, &newEntry);
4307 }
4308 
4309 /*
4310 ** rcdbSetNocatTdrReason sets the TDR reason code in the most recent TDR record if there is one,
4311 **  otherwise, it creates one with the reason code.
4312 **
4313 **  parameters:
4314 **      pReasonData     the data supplied with including the reason code.
4315 **                      if a TDR record exists, the reason will be added to the existing record,
4316 **                      otherwise the rest of the data will be used to create a new TDR event.
4317 */
4318 NV_STATUS rcdbSetNocatTdrReason
4319 (
4320     NV2080CtrlNocatJournalDataTdrReason *pReasonData
4321 )
4322 {
4323     OBJSYS             *pSys = SYS_GET_INSTANCE();
4324     Journal            *pRcdb = SYS_GET_RCDB(pSys);
4325     RM_NOCAT_JOURNAL_ENTRY* pNocatEntry = NULL;
4326 
4327     // validate inputs.
4328     if ((pRcdb == NULL) || (pReasonData == NULL))
4329     {
4330         return NV_ERR_INVALID_ARGUMENT;
4331     }
4332     pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_UPDATE_REQ_IDX]++;
4333 
4334     if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
4335     {
4336         // see if there is a TDR record.
4337         _rcdbGetNewestNocatJournalRecordForType(pRcdb,
4338             NV2080_NOCAT_JOURNAL_REC_TYPE_TDR,
4339             NULL, &pNocatEntry);
4340         if (pNocatEntry != NULL)
4341         {
4342             // there is, set the reason.
4343             _rcdbSetTdrReason(pRcdb, pReasonData->reasonCode,
4344                 (char *)pNocatEntry->nocatJournalEntry.tdrReason,
4345                 sizeof(pNocatEntry->nocatJournalEntry.tdrReason));
4346             pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_UPDATED_IDX]++;
4347             _rcdbReleaseNocatJournalRecord(pNocatEntry);
4348         }
4349     }
4350     portAtomicDecrementS32(&concurrentRingBufferAccess);
4351 
4352     // if we did not get a TDR record, create one.
4353     // we need to do it after the ring buffers are released.
4354     if (pNocatEntry == NULL)
4355     {
4356         NOCAT_JOURNAL_PARAMS newEntry;
4357 
4358         portMemSet(&newEntry, 0, sizeof(newEntry));
4359         newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_TDR;
4360         newEntry.pSource = (char *)pReasonData->source;
4361         newEntry.subsystem = pReasonData->subsystem;
4362         newEntry.errorCode = pReasonData->errorCode;
4363         newEntry.tdrReason = pReasonData->reasonCode;
4364         return rcdbNocatInsertNocatError(NULL, &newEntry);
4365     }
4366     return NV_OK;
4367 }
4368