1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 * SPDX-License-Identifier: MIT
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /***************************** HW State Routines ***************************\
25 * *
26 * RM robust error journal (formerly RCDB) *
27 * *
28 \***************************************************************************/
29
30 #include "gpu_mgr/gpu_mgr.h"
31 #include "nvRmReg.h"
32 #include "nvBldVer.h"
33 #include "nvVer.h"
34 #include "os/os.h"
35 #include "core/system.h"
36 #include "gpu/gpu.h"
37 #include "diagnostics/journal.h"
38 #include "platform/chipset/chipset.h"
39 #include "diagnostics/nv_debug_dump.h"
40 #include "diagnostics/tracer.h"
41 #include "core/locks.h"
42 #include "rmapi/rs_utils.h"
43 #include "vgpu/rpc.h"
44 #include "gpu/bus/kern_bus.h"
45 #include "gpu/mem_mgr/mem_mgr.h"
46 #include "nvdevid.h"
47 #include "nvop.h"
48 #include "jt.h"
49
50
51
52 #include "ctrl/ctrl0000/ctrl0000nvd.h"
53
54 #include "nvlimits.h"
55 #include "Nvcm.h"
56
57 #include "lib/protobuf/prb_util.h"
58 #include "g_all_dcl_pb.h"
59 #include "g_nvdebug_pb.h"
60 #include "nv_ref.h"
61
//
// Fixed strings used when populating NOCAT journal report fields.
//
#define NOCAT_UNKNOWN_STR                   "*** unknown ***"
#define NOCAT_NA_STR                        "N/A"
#define NOCAT_LEGACY_STR                    "LEGACY"
#define NOCAT_FULLCHIP_TDR_STR              "FULL CHIP RESET"
#define NOCAT_BUS_RESET_TDR_STR             "BUS RESET"
#define NOCAT_GC6_RESET_TDR_STR             "GC6 RESET"
#define NOCAT_NORMAL_TDR_STR                "NORMAL TDR"
#define NOCAT_UCODE_RESET_TDR_STR           "UCODE RESET"
#define NOCAT_SURPRISE_REMOVAL_TDR_STR      "SURPRISE REMOVAL"
#define NOCAT_DEFAULT_TAG_VALUE_STR         "prod"
#define NOCAT_DEFAULT_TDR_REASON_SRC_STR    "KMD"
// NOTE(review): flag bit consumed elsewhere; meaning not visible in this file.
#define NOCAT_FBSIZETESTED                  0x10

// Reducing size to 4K for reducing non-paged allocations on win8
#define NVDUMP_DEBUGGER_BUFFER_SIZE (4 * 1024)

#define JOURNAL_BUFFER_SIZE_DEFAULT (4 * 1024)

// Number of call-stack entries compared when coalescing duplicate assert records.
#define JOURNAL_ASSERT_RECORD_QUALIFYING_STACK_ENTRIES 5

// Entry point handed to external kernel debuggers via nvDumpConfig.
static void nvdDebuggerControlFunc(void);

#if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
#if !defined(DEBUG) && !defined(QA_BUILD)
// Bus-presence probing helpers, compiled only for release desktop builds.
static NvBool rcdbProbeGpuPresent(OBJGPU *pGpu, NvU64 ip);
static NvBool rcdbProbeAllGpusPresent(NvU64 ip);
static volatile NvS32 probeGpuRecursion = 0;
#endif
#endif
static NvU32 _rcdbGetOcaRecordSize(Journal *pRcDB, RMCD_RECORD_TYPE type);
// Lock-free guards: incremented/decremented around ring-buffer and
// assert-list access; only the thread that increments to 1 proceeds.
static volatile NvS32 concurrentRingBufferAccess = 0;
static volatile NvS32 assertListRecursion = 0;
static void rcdbFindRingBufferForType(Journal *pRcDB, RMCD_RECORD_TYPE recType, RING_BUFFER_LOG **ppRingBuffer);
static NV_STATUS _rcdbGetNocatJournalRecord(OBJRCDB* pRcdb,
    NvU32 id, NvBool bExactMatch,
    RmRCCommonJournal_RECORD** ppReturnedCommon, RM_NOCAT_JOURNAL_ENTRY** ppReturnedJournal);
static NV_STATUS _rcdbReleaseNocatJournalRecord(RM_NOCAT_JOURNAL_ENTRY* pReturnedJournal);
static NV_STATUS _rcdbNocatReportAssert(OBJGPU* pGpu, RmRCCommonAssert_RECORD* pAssert);

// Global flag to make sure we never re-enter the nvLog code.
#if defined(DEBUG) || defined(ASSERT_BUILD) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS))
static volatile NvS32 nvLogRecursion = 0;
#endif
105
// NvDump interface config - communicates with external kernel debuggers.
// sigHead/sigTail carry the same signature value, presumably so an external
// tool can locate and sanity-check the structure in memory — NOTE(review):
// confirm against the debugger-side consumer.
// buffer.address and debuggerControlFuncAddr are filled in later by
// rcdbConstruct_IMPL.
NVDUMP_EXPORT volatile NV_DECLARE_ALIGNED(NVDUMP_CONFIG, 8) nvDumpConfig =
{
    NVDUMP_CONFIG_SIGNATURE,                        // sigHead
    NvP64_NULL,                                     // debuggerControlFuncAddr
    { NvP64_NULL, NVDUMP_DEBUGGER_BUFFER_SIZE, 0 }, // buffer
    0,                                              // gpuSelect
    NVDUMP_COMPONENT_SYS_ALL,                       // component
    NVDUMP_STATUS_IDLE,                             // dumpStatus
    NV_OK,                                          // rmStatus

    NVDUMP_CONFIG_SIGNATURE                         // sigTail
};
119
120 void
rcdbDestruct_IMPL(Journal * pRcDB)121 rcdbDestruct_IMPL(Journal *pRcDB)
122 {
123 EVENT_JOURNAL *pJournal = &pRcDB->Journal;
124
125 // Deallocate NvDebug debugger dump buffer.
126 if (nvDumpConfig.buffer.address != NvP64_NULL)
127 {
128 portMemFree(NvP64_VALUE(nvDumpConfig.buffer.address));
129 nvDumpConfig.buffer.address = NvP64_NULL;
130 }
131
132 // Delete Journal and Btree
133 if (pJournal->pBuffer != NULL)
134 {
135 portMemFree(pJournal->pBuffer);
136 portMemFree(pJournal->AssertList.ppList);
137
138 // clear journal of anything
139 portMemSet(pJournal, 0, sizeof(EVENT_JOURNAL));
140 }
141
142 rcdbClearErrorHistory(pRcDB);
143
144 rcdbDestroyRingBufferCollection(pRcDB);
145
146 portMemFree(pRcDB->previousDriverVersion);
147 pRcDB->previousDriverVersion = NULL;
148
149 portMemFree(pRcDB->previousDriverBranch);
150 pRcDB->previousDriverBranch = NULL;
151 }
152
153 static void
_initJournal(EVENT_JOURNAL * pJournal,NvU32 size)154 _initJournal(EVENT_JOURNAL *pJournal, NvU32 size)
155 {
156 // verify we are not abandoning any memory allocations.
157 NV_ASSERT(NULL == pJournal->pBuffer);
158 NV_ASSERT(NULL == (NvU8*) pJournal->AssertList.ppList);
159
160 // init the Journal to an empty buffer.
161 pJournal->pBuffer = NULL;
162 pJournal->BufferSize = 0;
163 pJournal->pFree = pJournal->pBuffer;
164 pJournal->BufferRemaining = pJournal->BufferSize;
165 pJournal->pCurrCollection = NULL;
166 pJournal->RecordCount = 0;
167
168 // init the assert list to an empty buffer.
169 pJournal->AssertList.ppList = NULL;
170 pJournal->AssertList.Size = 0;
171 pJournal->AssertList.Count = 0;
172 pJournal->AssertList.QualifyingStackSize = JOURNAL_ASSERT_RECORD_QUALIFYING_STACK_ENTRIES;
173
174 // allocate and initialize journal buffer memory
175 pJournal->pBuffer = portMemAllocNonPaged(size);
176 if (pJournal->pBuffer != NULL )
177 {
178 pJournal->BufferSize = size;
179 pJournal->pFree = pJournal->pBuffer;
180 pJournal->BufferRemaining = pJournal->BufferSize;
181
182 // if the journal is large enough to hold at least one assert record,
183 // init the assert list as well.
184 if (sizeof(RmRCCommonAssert_RECORD) <= pJournal->BufferSize)
185 {
186 pJournal->AssertList.Size = pJournal->BufferSize / sizeof(RmRCCommonAssert_RECORD);
187 pJournal->AssertList.ppList = portMemAllocNonPaged(pJournal->AssertList.Size * sizeof(pJournal->AssertList.ppList[0]));
188 if (pJournal->AssertList.ppList == NULL )
189 {
190 NV_PRINTF(LEVEL_ERROR,
191 "Failure to allocate RC assert tracking buffer \n");
192 pJournal->AssertList.Size = 0;
193 }
194 }
195 }
196 else
197 {
198 NV_PRINTF(LEVEL_ERROR, "Failure to allocate RC journal buffer \n");
199 }
200 }
201
//
// Construct the journal object: allocate the event journal, the NvDebug
// debugger dump buffer, the RC-diag and NOCAT ring buffers, and capture
// the parameters needed to convert raw timestamps to wall-clock time.
// Always returns NV_OK; individual allocation failures are logged and
// construction continues without that feature.
//
NV_STATUS
rcdbConstruct_IMPL(Journal *pRcDB)
{
    EVENT_JOURNAL *pJournal = &pRcDB->Journal;
    RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
    NvU32 i;
    void *pBuffer;

    // Time parameters
    NvU32 sec, usec;
    NvU64 timeStamp;
    NvU64 systemTime;
    NvU64 timeStampFreq;

    // Allocate the event journal buffer and its assert-tracking list.
    _initJournal(pJournal, JOURNAL_BUFFER_SIZE_DEFAULT);

    portMemSet(pRingBufferColl, 0x00, sizeof(pRcDB->RingBufferColl));

    pRcDB->BugcheckCount = 0;

    // Allocate NvDebug debugger dump buffer.
    pBuffer = portMemAllocNonPaged(nvDumpConfig.buffer.size);
    if (pBuffer != NULL)
    {
        nvDumpConfig.buffer.address = NV_SIGN_EXT_PTR_TO_NvP64(pBuffer);
    }
    else
    {
        // Non-fatal: continue without a debugger dump buffer.
        NV_PRINTF(LEVEL_ERROR,
                  "failed to allocate NVD debugger dump buffer\n");
    }

    // Initialize NvDebug debugger function address.
    nvDumpConfig.debuggerControlFuncAddr = NV_SIGN_EXT_PTR_TO_NvP64(nvdDebuggerControlFunc);

    //
    // Create RC Diagnostic report Wrap Buffer
    //
    if (NULL == rcdbCreateRingBuffer(pRcDB, RmRcDiagReport, MAX_RCDB_RCDIAG_WRAP_BUFF))
    {
        NV_PRINTF(LEVEL_ERROR, "failed to allocate RC Diagnostic Ring Buffer\n");
    }
    // init the RC error report data
    pRcDB->RcErrRptNextIdx = 0;
    pRcDB->RcErrRptRecordsDropped = NV_FALSE;

    // Initialize RC Error Counters.
    for ( i = 0 ; i < MAX_RC_ERROR_COUNTER ; i++)
    {
        pRcDB->rcErrorCounterArray[i].rcErrorType = RC_ERROR_COUNTER_TYPE_INVALID;
        pRcDB->rcErrorCounterArray[i].rcErrorCount = 0;
        pRcDB->rcErrorCounterArray[i].rcLastCHID = INVALID_CHID;
        pRcDB->rcErrorCounterArray[i].rcLastTime = 0;
    }
    // Reserve the designated slot as the catch-all "other" error counter.
    pRcDB->rcErrorCounterArray[RC_ERROR_COUNTER_OTHER_INDEX].rcErrorType = RC_ERROR_COUNTER_OTHER_TYPE;

    // clear the Nocat Queue descriptors & counters
    portMemSet(&pRcDB->nocatJournalDescriptor, 0x00, sizeof(pRcDB->nocatJournalDescriptor));
    // lastRecordId entries start as all-FF — presumably "no record seen yet";
    // NOTE(review): confirm against the NOCAT record-id consumers.
    portMemSet(pRcDB->nocatJournalDescriptor.lastRecordId, 0xff, sizeof(pRcDB->nocatJournalDescriptor.lastRecordId));
    pRcDB->nocatJournalDescriptor.nocatLastRecordType = NV2080_NOCAT_JOURNAL_REC_TYPE_UNKNOWN;
    // Convert the cache freshness period from milliseconds to timestamp ticks.
    pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks = NOCAT_CACHE_FRESHNESS_PERIOD_MS;
    pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks *= osGetTimestampFreq();
    pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks /= 1000ULL;

    //
    // Create NOCAT report Wrap Buffer
    //
    if (NULL == rcdbCreateRingBuffer(pRcDB, RmNocatReport, MAX_RCDB_NOCAT_WRAP_BUFF))
    {
        NV_PRINTF(LEVEL_ERROR, "failed to allocate NOCAT Ring Buffer\n");
    }

    // Save params for timestamp conversion
    timeStampFreq = osGetTimestampFreq();
    timeStamp = osGetTimestamp();
    osGetCurrentTime(&sec, &usec);
    systemTime = ((NvU64)sec * 1000000) + (NvU64)usec;

    // System time (us) corresponding to timestamp zero; later timestamps
    // can be converted to wall clock via this reference and timeStampFreq.
    pRcDB->systemTimeReference = systemTime - ((timeStamp * 1000000) / timeStampFreq);
    pRcDB->timeStampFreq = timeStampFreq;

    return NV_OK;
}
285
//
// Retrieve the previous driver version from volatile registry entries
// and then save the current driver version for next time.
//
// Reads previous version/branch/changelist/load-count from the volatile
// registry into pRcDB, then unconditionally (see the label below) writes
// the current driver's values back. Runs at most once per driver load.
// Returns NV_OK on success, NV_OK early if the platform has no volatile
// registry, or the first read/alloc failure status otherwise.
//
NV_STATUS rcdbSavePreviousDriverVersion_IMPL
(
    OBJGPU  *pGpu,
    Journal *pRcDB
)
{
    NV_STATUS nvStatus = NV_OK;

    NvU32 regEntrySize = 0;
    NvU32 changeListNum = NV_LAST_OFFICIAL_CHANGELIST_NUM;

    // Only run this code only once each time the driver is loaded.
    if (pRcDB->bPrevDriverCodeExecuted)
        return NV_OK;

    pRcDB->bPrevDriverCodeExecuted = NV_TRUE;

    //
    // Get the previous driver version information
    // from volatile registry settings.
    //
    nvStatus = osReadRegistryVolatileSize(pGpu,
        NV_REG_STR_RM_RC_PREV_DRIVER_VERSION, &regEntrySize);

    // Early exit if this platform does not support volatile registry.
    if (nvStatus == NV_ERR_NOT_SUPPORTED)
        return NV_OK;

    if ((NV_OK == nvStatus) && (0 != regEntrySize))
    {
        //
        // Previous driver version is there, so assume all previous driver
        // information is there as well.
        //
        // +1 leaves room for the NUL terminator appended below.
        pRcDB->previousDriverVersion = portMemAllocNonPaged(regEntrySize + 1);
        if (pRcDB->previousDriverVersion == NULL)
        {
            nvStatus = NV_ERR_NO_MEMORY;
            DBG_BREAKPOINT();
            goto rcdbSavePreviousDriverVersion_writeRegistry;
        }

        nvStatus = osReadRegistryVolatile(pGpu,
                                          NV_REG_STR_RM_RC_PREV_DRIVER_VERSION,
                                          (NvU8 *)pRcDB->previousDriverVersion,
                                          regEntrySize);
        if (nvStatus != NV_OK)
        {
            DBG_BREAKPOINT();
            goto rcdbSavePreviousDriverVersion_writeRegistry;
        }
        pRcDB->previousDriverVersion[regEntrySize] = 0;

        // regEntrySize is reused here for the branch string's size.
        nvStatus = osReadRegistryVolatileSize(pGpu,
            NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH, &regEntrySize);
        if ((nvStatus != NV_OK) || (0 == regEntrySize))
        {
            DBG_BREAKPOINT();
            goto rcdbSavePreviousDriverVersion_writeRegistry;
        }

        pRcDB->previousDriverBranch = portMemAllocNonPaged(regEntrySize + 1);
        if (pRcDB->previousDriverBranch == NULL)
        {
            nvStatus = NV_ERR_NO_MEMORY;
            DBG_BREAKPOINT();
            goto rcdbSavePreviousDriverVersion_writeRegistry;
        }

        nvStatus = osReadRegistryVolatile(pGpu,
                                          NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH,
                                          (NvU8 *)pRcDB->previousDriverBranch,
                                          regEntrySize);
        if (nvStatus != NV_OK)
        {
            DBG_BREAKPOINT();
            goto rcdbSavePreviousDriverVersion_writeRegistry;
        }
        pRcDB->previousDriverBranch[regEntrySize] = 0;

        nvStatus = osReadRegistryVolatile(pGpu,
                                          NV_REG_STR_RM_RC_PREV_DRIVER_CHANGELIST,
                                          (NvU8 *)&pRcDB->prevDriverChangelist,
                                          sizeof(pRcDB->prevDriverChangelist));
        if (nvStatus != NV_OK)
        {
            DBG_BREAKPOINT();
            goto rcdbSavePreviousDriverVersion_writeRegistry;
        }

        nvStatus = osReadRegistryVolatile(pGpu,
                                          NV_REG_STR_RM_RC_PREV_DRIVER_LOAD_COUNT,
                                          (NvU8 *)&pRcDB->driverLoadCount,
                                          sizeof(pRcDB->driverLoadCount));
        if (nvStatus != NV_OK)
        {
            DBG_BREAKPOINT();
            goto rcdbSavePreviousDriverVersion_writeRegistry;
        }
    }

    // Always write out the driver info, even if there was an error reading it.
rcdbSavePreviousDriverVersion_writeRegistry:
    pRcDB->driverLoadCount++;

    osWriteRegistryVolatile(pGpu,
                            NV_REG_STR_RM_RC_PREV_DRIVER_VERSION,
                            (NvU8 *)NV_VERSION_STRING,
                            sizeof(NV_VERSION_STRING));

    osWriteRegistryVolatile(pGpu,
                            NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH,
                            (NvU8 *)NV_BUILD_BRANCH_VERSION,
                            sizeof(NV_BUILD_BRANCH_VERSION));

    osWriteRegistryVolatile(pGpu,
                            NV_REG_STR_RM_RC_PREV_DRIVER_CHANGELIST,
                            (NvU8 *)&changeListNum,
                            sizeof(changeListNum));

    osWriteRegistryVolatile(pGpu,
                            NV_REG_STR_RM_RC_PREV_DRIVER_LOAD_COUNT,
                            (NvU8 *)&pRcDB->driverLoadCount,
                            sizeof(pRcDB->driverLoadCount));

    return nvStatus;
}
417
//
// Record an assert/breakpoint hit in the journal.
//
// When PDB_PROP_RCDB_COMPRESS is set, an assert matching an existing record
// (same breakpoint address hint and same qualifying call stack) is coalesced
// into that record (count/lastTimeStamp updated) rather than allocating a
// new journal record. The assert is always reported to NOCAT, even when no
// journal record could be allocated.
//
// Parameters:
//   pVoidGpu - OBJGPU pointer; may be NULL (e.g. KMD callers have no pGpu).
//   lineNum  - source line of the assert (NV_RM_ASSERT_UNKNOWN_LINE_NUM if unknown).
//   ppRec    - out: the journal record used/updated; NULL on failure.
//   jGroup, type, size - journal record header values (size includes header).
//   level    - severity; only used for tracing here.
//   key      - breakpoint address hint, used to match duplicate asserts.
//
// Returns NV_OK if the assert was recorded (coalesced or newly allocated).
//
NV_STATUS rcdbAddAssertJournalRecWithLine(void *pVoidGpu, NvU32 lineNum, void** ppRec, NvU8 jGroup, NvU8 type, NvU16 size, NvU32 level, NvU64 key)
{
    OBJSYS *pSys;
    Journal *pRcDB;
    OBJGPU *pPossibleNULLGpu;
    JOURNAL_ASSERT_LIST *pAssertList;
    RmRCCommonAssert_RECORD newAssertRec;
    RmRCCommonAssert_RECORD *pAssertRec;
    NV_STATUS rmStatus = NV_ERR_GENERIC;
    NvU32 i;

    //
    // Note: we allow NULL pGpu here, as many clients (such as KMD)
    // do not have access to pGpu. And much of the RM does not provide this either.
    //
    pPossibleNULLGpu = reinterpretCast(pVoidGpu, OBJGPU *);

    pSys = SYS_GET_INSTANCE();
    if (!pSys)
    {
        return NV_ERR_INVALID_STATE;
    }

    pRcDB = SYS_GET_RCDB(pSys);
    if (!pRcDB)
    {
        return NV_ERR_INVALID_STATE;
    }

    pAssertList = &pRcDB->Journal.AssertList;

    *ppRec = NULL;

    RMTRACE_PROBE4_PRIMTYPE(rcjournal, assertlog, NvU32, (pPossibleNULLGpu ? pPossibleNULLGpu->gpuId : 0), NvU8, type, NvU32, level, NvU64, key);

    // create a local instance of the Assert record.
    portMemSet(&newAssertRec, 0x00, sizeof(newAssertRec));
    rcdbSetCommonJournalRecord(pPossibleNULLGpu, &newAssertRec.common);
    newAssertRec.count = 1;
    newAssertRec.breakpointAddrHint = key;
    newAssertRec.lineNum = lineNum;

    if (pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_COMPRESS))
    {
        // search for a pre-existing assert record with the same stack
        for (i = 0; i < pAssertList->Count; ++i)
        {
            pAssertRec = pAssertList->ppList[i];
            if ((newAssertRec.breakpointAddrHint == pAssertRec->breakpointAddrHint) &&
                (0 == portMemCmp(newAssertRec.callStack, pAssertRec->callStack,
                    sizeof(newAssertRec.callStack[0]) * pAssertList->QualifyingStackSize)))
            {
                // Duplicate assert: bump the count & timestamp instead of
                // consuming more journal space.
                pAssertRec->count++;
                pAssertRec->lastTimeStamp = newAssertRec.common.timeStamp;

                rmStatus = NV_OK;
                break;
            }
        }
    }

    if (rmStatus != NV_OK)
    {
        // Discard to avoid reentry from messing up record array.
        // Only the thread that takes the counter from 0 to 1 may allocate.
        if (portAtomicIncrementS32(&assertListRecursion) == 1)
        {
            rmStatus = rcdbAllocNextJournalRec(pRcDB, (NVCD_RECORD **)&pAssertRec, jGroup, type, size);
            if (NV_OK == rmStatus)
            {
                // the Header is filled in when the record is allocated, so update the local instance header.
                newAssertRec.common.Header = pAssertRec->common.Header;
                *pAssertRec = newAssertRec;
                if (pAssertList->Count < pAssertList->Size)
                {
                    pAssertList->ppList[pAssertList->Count] = pAssertRec;
                    ++(pAssertList->Count);
                }
                else
                {
                    // based on the way the assert list size is calculated this should never happen....
                    NV_PRINTF(LEVEL_ERROR,
                              "failed to insert tracking for assert record\n");
                }
            }
        }
        portAtomicDecrementS32(&assertListRecursion);
    }

    if (rmStatus == NV_OK)
    {
        RMTRACE_RMJOURNAL(_ASSERTLOG, (pPossibleNULLGpu ? pPossibleNULLGpu->gpuId : RMTRACE_UNKNOWN_GPUID),
                          type,
                          jGroup,
                          key,
                          pAssertRec->count,
                          pAssertRec->common.timeStamp,
                          pAssertRec->lastTimeStamp);
        *ppRec = pAssertRec;

        _rcdbNocatReportAssert(pPossibleNULLGpu, pAssertRec);
    }
    else
    {
        // No journal record available; still report the assert to NOCAT
        // using the stack-local copy.
        _rcdbNocatReportAssert(pPossibleNULLGpu, &newAssertRec);
    }

    return rmStatus;
}
526
// Record an assert without a known source line number; thin wrapper over
// rcdbAddAssertJournalRecWithLine (see it for parameter details).
NV_STATUS rcdbAddAssertJournalRec(void *pVoidGpu, void** ppRec, NvU8 jGroup, NvU8 type, NvU16 size, NvU32 level, NvU64 key)
{
    return rcdbAddAssertJournalRecWithLine(pVoidGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, ppRec, jGroup, type, size, level, key);
}
531 // Populate stateMask with flags that represent the power state and other useful things.
_getCommonJournalStateMask(OBJGPU * pGpu)532 static NvU64 _getCommonJournalStateMask(OBJGPU *pGpu)
533 {
534 NvU64 stateMask = REF_NUM(NV_RM_JOURNAL_STATE_MASK_GC6_STATE,
535 pGpu->gc6State.currentState);
536
537 if (!gpuIsGpuFullPower(pGpu))
538 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_NOT_FULL_POWER;
539
540 if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_CONNECTED))
541 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_NOT_CONNECTED;
542
543 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_STANDBY))
544 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_STANDBY;
545
546 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_HIBERNATE))
547 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_HIBERNATE;
548
549 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH))
550 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_PM_CODEPATH;
551
552 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET))
553 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_GC6_RESET;
554
555 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET))
556 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_FULLCHIP_RESET;
557
558 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET))
559 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_SEC_BUS_RESET;
560
561 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_TIMEOUT_RECOVERY))
562 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_TIMEOUT_RECOVERY;
563
564 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
565 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_LOST;
566
567 return stateMask;
568 }
569
570 // Fill in the common portion of the journal structure.
571 void
rcdbSetCommonJournalRecord(OBJGPU * pGpu,RmRCCommonJournal_RECORD * pRec)572 rcdbSetCommonJournalRecord
573 (
574 OBJGPU *pGpu,
575 RmRCCommonJournal_RECORD *pRec
576 )
577 {
578 OS_THREAD_HANDLE threadId;
579
580 pRec->timeStamp = osGetTimestamp();
581 pRec->GPUTag = 0;
582 pRec->CPUTag = 0;
583 pRec->stateMask = 0;
584
585 if (pGpu)
586 {
587 pRec->GPUTag = pGpu->gpuId;
588 pRec->stateMask = _getCommonJournalStateMask(pGpu);
589 }
590
591 if (NV_OK == osGetCurrentThread(&threadId))
592 {
593 pRec->CPUTag = (NvU64)threadId;
594 }
595 }
596
597 NV_STATUS
rcdbAddBugCheckRec_IMPL(OBJGPU * pGpu,Journal * pRcDB,NvU32 bugCheckCode)598 rcdbAddBugCheckRec_IMPL
599 (
600 OBJGPU *pGpu,
601 Journal *pRcDB,
602 NvU32 bugCheckCode
603 )
604 {
605 RmJournalBugcheck_RECORD *pRec;
606 NV_STATUS rmStatus;
607
608 rmStatus = rcdbAllocNextJournalRec(pRcDB,
609 (NVCD_RECORD **)&pRec,
610 RmGroup,
611 RmJournalBugCheck,
612 sizeof(*pRec));
613 if (NV_OK == rmStatus)
614 {
615 rcdbSetCommonJournalRecord(pGpu, &pRec->common);
616 pRec->bugCheckCode = bugCheckCode;
617 }
618
619 pRcDB->BugcheckCount++;
620
621 return rmStatus;
622 }
623
624 NV_STATUS
rcdbAddPowerStateRec_IMPL(OBJGPU * pGpu,Journal * pRcDB,NvU32 powerEvent,NvU32 state,NvU32 fastBootPowerState)625 rcdbAddPowerStateRec_IMPL
626 (
627 OBJGPU *pGpu,
628 Journal *pRcDB,
629 NvU32 powerEvent,
630 NvU32 state,
631 NvU32 fastBootPowerState
632 )
633 {
634 RmPowerState_RECORD newRmDiagWrapBuffRec;
635
636 // Create Records, then write it.
637 newRmDiagWrapBuffRec.powerState = state;
638 newRmDiagWrapBuffRec.powerEvent = powerEvent;
639 newRmDiagWrapBuffRec.fastBootPowerState = fastBootPowerState;
640 rcdbAddRecToRingBuffer(pGpu, pRcDB, RmPowerState,
641 sizeof(RmPowerState_RECORD), (NvU8 *)&newRmDiagWrapBuffRec);
642 return NV_OK;
643 }
644
//
// Scan the RmRcDiagReport ring buffer for records matching the owner /
// processId filter and return the report indices (RmRcDiag_RECORD::idx) of
// the first complete qualifying report (*pStart) and the last one (*pEnd).
//
// Returns:
//   NV_OK                      - a complete range was found; *pStart/*pEnd set
//                                (each only if non-NULL).
//   NV_ERR_MISSING_TABLE_ENTRY - no complete qualifying range exists.
//   NV_ERR_BUSY_RETRY          - ring buffers are in use by another thread.
//
NV_STATUS
rcdbGetRcDiagRecBoundaries_IMPL
(
    Journal *pRcDB,
    NvU16 *pStart,
    NvU16 *pEnd,
    NvU32 owner,
    NvU32 processId
)
{
    NV_STATUS status = NV_ERR_MISSING_TABLE_ENTRY;
    RmRCCommonJournal_RECORD *pCommon;
    RmRcDiag_RECORD *pRecord = NULL;
    RING_BUFFER_LOG *pRingBuffer = NULL;
    NvU32 i;
    NvU16 logicalStartIdx;
    NvU16 start = 0;
    NvU16 end = 0;
    NvBool foundStart = NV_FALSE;
    NvBool foundEnd = NV_FALSE;

    // scan the buffer to find all the qualified records & return the
    // first & last indicies of the qualified records found.

    // Get the Diag Report Ring buffer.
    rcdbFindRingBufferForType(pRcDB, RmRcDiagReport, &pRingBuffer);

    // attempt to claim ownership
    if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
    {
        // get the logical start of the buffer.
        logicalStartIdx = pRingBuffer->headIndex;

        // run thru all the entries in the buffer, start to end, until we find the start & end of the range we are looking for.
        for (i = 0; i < pRingBuffer->numEntries; ++i)
        {
            // get a pointer to the record from the buffer.
            // (entries are fixed-size: common header immediately followed by the diag record)
            pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport) * ((logicalStartIdx + i) % pRingBuffer->maxEntries)));
            pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);

            // check to see if the record qualifies
            if (((RCDB_RCDIAG_DEFAULT_OWNER != owner) && (pRecord->owner != owner) && (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID != owner))
                || ((NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID != processId) && (pRecord->processId != processId)))
            {
                continue;
            }
            switch (foundStart)
            {
                case NV_FALSE:
                    // check if this is a start record.
                    // we want the first record to be a start record to insure that all the reports that are in the range are complete
                    // (I.E. we didn't wrap over the first record of a report)
                    if (0 != (pRecord->flags & NV0000_CTRL_CMD_NVD_RCERR_RPT_FLAGS_POS_FIRST))
                    {
                        // yes save the idx as the first Idx, & note that we found the start of the range.
                        start = pRecord->idx;
                        foundStart = NV_TRUE;
                    }
                    // fall thru to check if the start of the report is also the end of the report.

                case NV_TRUE:
                    // check if this is an end record.
                    // we want the last record in the range to be an end record to insure that all the reports that are in the range are complete
                    // (Note -- in the case of end records, this should only be an issue if we are interrupting the collection of a report)
                    if (0 != (pRecord->flags & NV0000_CTRL_CMD_NVD_RCERR_RPT_FLAGS_POS_LAST))
                    {
                        // save the idx as the last idx & continue scanning until we have checked all the records.
                        // the last idx saved will be the last idx.
                        end = pRecord->idx;
                        foundEnd = foundStart;
                    }
                    break;
            }
        }
        // checking end is sufficient, because end can't be set w/o start being set first.
        if (foundEnd)
        {
            // we found a complete range, mark us as succeeding.
            status = NV_OK;

            // pass up the results.
            if (NULL != pEnd)
            {
                *pEnd = end;
            }
            if (NULL != pStart)
            {
                *pStart = start;
            }
        }
    }
    else
    {
        // the buffer is currently busy.
        status = NV_ERR_BUSY_RETRY;
    }
    portAtomicDecrementS32(&concurrentRingBufferAccess);
    return status;
}
744
745 RmRCCommonJournal_RECORD *
rcdbAddRcDiagRec_IMPL(OBJGPU * pGpu,Journal * pRcDB,RmRcDiag_RECORD * pRmDiagWrapBuffRec)746 rcdbAddRcDiagRec_IMPL
747 (
748 OBJGPU *pGpu,
749 Journal *pRcDB,
750 RmRcDiag_RECORD *pRmDiagWrapBuffRec
751 )
752 {
753 RmRCCommonJournal_RECORD *pCommon;
754 NvU32 usec;
755
756 // Create Records, then write it.
757 pRmDiagWrapBuffRec->idx = (pRcDB->RcErrRptNextIdx)++;
758 if (MAX_RCDB_RCDIAG_ENTRIES < pRmDiagWrapBuffRec->count)
759 {
760 NV_ASSERT_FAILED("Diag report to large for buffer");
761 pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].offset = 0;
762 pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].tag = NV0000_CTRL_CMD_NVD_RCERR_RPT_REG_OVERFLOWED;
763 pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].value = pRmDiagWrapBuffRec->count - MAX_RCDB_RCDIAG_ENTRIES + 1;
764 pRmDiagWrapBuffRec->count = MAX_RCDB_RCDIAG_ENTRIES;
765 }
766 osGetCurrentTime(&(pRmDiagWrapBuffRec->timeStamp), &usec);
767
768 pCommon = rcdbAddRecToRingBuffer(pGpu, pRcDB, RmRcDiagReport,
769 sizeof(RmRcDiag_RECORD), (NvU8 *)pRmDiagWrapBuffRec);
770
771 pRcDB->RcErrRptRecordsDropped |= pRcDB->RcErrRptNextIdx >= MAX_RCDB_RCDIAG_WRAP_BUFF;
772 return pCommon;
773 }
774
775 RmRCCommonJournal_RECORD *
rcdbAddRcDiagRecFromGsp_IMPL(OBJGPU * pGpu,Journal * pRcDB,RmRCCommonJournal_RECORD * pCommonGsp,RmRcDiag_RECORD * pRmDiagGsp)776 rcdbAddRcDiagRecFromGsp_IMPL
777 (
778 OBJGPU *pGpu,
779 Journal *pRcDB,
780 RmRCCommonJournal_RECORD *pCommonGsp,
781 RmRcDiag_RECORD *pRmDiagGsp
782 )
783 {
784 RmRCCommonJournal_RECORD *pCommonCpu;
785
786 pCommonCpu = rcdbAddRcDiagRec(pGpu, pRcDB, pRmDiagGsp);
787 if (pCommonCpu)
788 {
789 NV_ASSERT(pCommonCpu->GPUTag == pCommonGsp->GPUTag);
790 pCommonCpu->stateMask |= pCommonGsp->stateMask;
791 }
792
793 return pCommonCpu;
794 }
795
//
// Look up the diag report record whose RmRcDiag_RECORD::idx equals reqIdx
// in the RmRcDiagReport ring buffer, returning its common header if it
// passes the owner/processId filter.
//
// NOTE(review): this walks the ring buffer without taking the
// concurrentRingBufferAccess claim itself — callers (see
// rcdbGetRcDiagRec_IMPL) are expected to hold it.
//
// Returns:
//   NV_OK                           - record found and filter passed.
//   NV_ERR_INVALID_INDEX            - reqIdx not in the buffer (or stale).
//   NV_ERR_INSUFFICIENT_PERMISSIONS - record exists but fails the filter,
//                                     or the ANY_OWNER+ANY_PROCESS combo.
//
NV_STATUS
_rcdbInternalGetRcDiagRec
(
    Journal *pRcDB,
    NvU16 reqIdx,
    RmRCCommonJournal_RECORD **ppRmDiagWrapBuffRec,
    NvU32 owner,
    NvU32 processId
)
{
    RmRCCommonJournal_RECORD *pCommon;
    RmRcDiag_RECORD* pRecord = NULL;
    NV_STATUS status = NV_ERR_INVALID_INDEX;
    RING_BUFFER_LOG *pRingBuffer = NULL;

    NvU32 i;

    // assume we will fail.
    *ppRmDiagWrapBuffRec = NULL;

    // Find the ring buffer for the diag reports
    rcdbFindRingBufferForType(pRcDB, RmRcDiagReport, &pRingBuffer);

    // is the requested record in the buffer?
    // (distance from the next index back to reqIdx, computed with 16-bit wrap)
    if ((NvU16)(pRcDB->RcErrRptNextIdx - reqIdx) <= pRingBuffer->numEntries)
    {
        // calculate the location of the record.
        // find the record just past the last record in the buffer. to use as the initial offset.
        i = pRingBuffer->headIndex + pRingBuffer->numEntries;

        // subtract off the diff between the next idx to be used & the requested idx.
        i -= pRcDB->RcErrRptNextIdx - reqIdx;

        // wrap the offset to the size of the buffer.
        i %= pRingBuffer->maxEntries;

        // get a pointer to the record from the buffer.
        pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport) * i));
        pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);

        // verify we have the record that was requested.
        NV_ASSERT_OR_RETURN(pRecord->idx == reqIdx, NV_ERR_INVALID_INDEX);

        // we found the requested Index,
        // check to see if the record qualifies
        if (((RCDB_RCDIAG_DEFAULT_OWNER == owner) || (pRecord->owner == owner) || (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID == owner))
            && ((NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID == processId) || (pRecord->processId == processId)))
        {
            // combination of ANY_OWNER_ID && ANY_PROCESS_ID is not valid
            if (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID == owner && NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID == processId)
            {
                status = NV_ERR_INSUFFICIENT_PERMISSIONS;
                goto exit;
            }
            // we found a record that fully qualifies
            *ppRmDiagWrapBuffRec = pCommon;
            status = NV_OK;
        }
        else
        {
            // we found the record, but it does not pass the filter.
            status = NV_ERR_INSUFFICIENT_PERMISSIONS;
        }
    }
exit:
    return status;
}
863
864 NV_STATUS
rcdbGetRcDiagRec_IMPL(Journal * pRcDB,NvU16 reqIdx,RmRCCommonJournal_RECORD ** ppRmDiagWrapBuffRec,NvU32 owner,NvU32 processId)865 rcdbGetRcDiagRec_IMPL
866 (
867 Journal *pRcDB,
868 NvU16 reqIdx,
869 RmRCCommonJournal_RECORD **ppRmDiagWrapBuffRec,
870 NvU32 owner,
871 NvU32 processId
872 )
873 {
874 NV_STATUS status;
875
876 if (ppRmDiagWrapBuffRec == NULL)
877 {
878 return NV_ERR_INVALID_ARGUMENT;
879 }
880
881 *ppRmDiagWrapBuffRec = NULL;
882
883 if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
884 {
885 status = _rcdbInternalGetRcDiagRec(pRcDB, reqIdx, ppRmDiagWrapBuffRec, owner, processId);
886 }
887 else
888 {
889 status = NV_ERR_BUSY_RETRY;
890 }
891 portAtomicDecrementS32(&concurrentRingBufferAccess);
892 return status;
893 }
894
895 //
896 // The function to set context data for all the RmRcDiag_RECORDs in a specified range
897 //
898 NV_STATUS
rcdbUpdateRcDiagRecContext_IMPL(Journal * pRcDB,NvU16 rangeStartIdx,NvU16 rangeEndIdx,NvU32 processId,NvU32 owner)899 rcdbUpdateRcDiagRecContext_IMPL
900 (
901 Journal *pRcDB,
902 NvU16 rangeStartIdx,
903 NvU16 rangeEndIdx,
904 NvU32 processId,
905 NvU32 owner
906 )
907 {
908 RmRCCommonJournal_RECORD *pCommon = NULL;
909 RmRcDiag_RECORD* pRecord = NULL;
910 NV_STATUS status = NV_OK;
911 NV_STATUS recStatus = NV_ERR_OUT_OF_RANGE;
912
913 NvU16 i;
914
915 // go from the start index thru the end index.
916 // note we use != because the indicies will wrap.
917 for (i = rangeStartIdx; i != (NvU16)(rangeEndIdx + 1U); i++)
918 {
919 recStatus = rcdbGetRcDiagRec(pRcDB, i, &pCommon, RCDB_RCDIAG_DEFAULT_OWNER, NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID);
920 if (NV_OK != recStatus)
921 {
922 // something went wrong,
923 // record the status & skip this record.
924 status = recStatus;
925 continue;
926 }
927 // get the pointer to the diag record.
928 pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);
929
930 pRecord->owner = owner;
931 pRecord->processId = processId;
932 }
933 return status;
934 }
935
936 //
937 // size must include NVCD_RECORD size too
938 //
rcdbAllocNextJournalRec_IMPL(Journal * pRcDB,NVCD_RECORD ** ppRec,NvU8 jGroup,NvU8 type,NvU16 size)939 NV_STATUS rcdbAllocNextJournalRec_IMPL(Journal *pRcDB, NVCD_RECORD** ppRec, NvU8 jGroup, NvU8 type, NvU16 size)
940 {
941 EVENT_JOURNAL *pJournal = &pRcDB->Journal;
942
943 if ( ppRec == NULL )
944 return NV_ERR_GENERIC;
945
946 if ( pJournal->pBuffer == NULL || pJournal->BufferSize == 0 )
947 return NV_ERR_GENERIC;
948
949 if ( size == 0 || pJournal->BufferRemaining < size )
950 {
951 return NV_ERR_GENERIC;
952 }
953
954 *ppRec = (NVCD_RECORD*)(pJournal->pFree);
955
956 (*ppRec)->cRecordGroup = jGroup;
957 (*ppRec)->cRecordType = type;
958 (*ppRec)->wRecordSize = size;
959
960 if ( pJournal->pCurrCollection )
961 {
962 pJournal->pCurrCollection->NumRecords++;
963 pJournal->pCurrCollection->Header.wRecordSize += size;
964 }
965 else
966 {
967 // standalone record (not part of collection) - increase total count
968 pJournal->RecordCount++;
969 }
970
971 pJournal->pFree += size;
972 pJournal->BufferRemaining -= size;
973
974 return NV_OK;
975 }
976
/*!
 * @brief Free every element on the system error list and reset the
 *        error/log counters.
 *
 * Spins until no other thread is reporting an error (ErrorInfo.InUse is
 * clear), so it may only be called where sleeping - and therefore
 * spinning - is safe.
 *
 * @param[in] pRcDB  RM journal object owning the error list.
 *
 * @return NV_OK on success; NV_ERR_INVALID_STATE if it is not safe to spin.
 */
NV_STATUS rcdbClearErrorHistory_IMPL(Journal *pRcDB)
{
    SYS_ERROR_INFO *pSysErrorInfo = &pRcDB->ErrorInfo;
    RMFIFOERRORELEMENT_V3* pFifoErrorInfo;
    RMFIFOERRORELEMENT_V3* pFreeErrorInfo;

    // Wait until any errors currently being reported are complete
    while (!portAtomicCompareAndSwapU32(&pSysErrorInfo->InUse, 1, 0))
    {
        // We're not going to sleep, but safe to sleep also means safe to spin..
        NV_ASSERT_OR_RETURN(portSyncExSafeToSleep(), NV_ERR_INVALID_STATE);
        portUtilSpin();
    }

    // Walk the singly-linked error list, freeing each element (and its
    // attached error blocks) via rcdbDeleteErrorElement.
    pFifoErrorInfo = (RMFIFOERRORELEMENT_V3*) pSysErrorInfo->pErrorList;
    while (NULL != pFifoErrorInfo)
    {
        pFreeErrorInfo = pFifoErrorInfo;
        pFifoErrorInfo = pFifoErrorInfo->ErrorHeader.pNextError;
        rcdbDeleteErrorElement(pRcDB, pFreeErrorInfo);
    }

    // Reset the list to its empty state.
    pSysErrorInfo->ErrorCount = 0x0;
    pSysErrorInfo->LogCount = 0x0;
    pSysErrorInfo->pErrorList = NULL;

    // Release the in-use flag acquired by the CAS loop above.
    portAtomicSetU32(&pSysErrorInfo->InUse, 0);
    return NV_OK;
}
1006
1007
rcdbDeleteErrorElement_IMPL(Journal * pRcDB,void * pDelete)1008 NV_STATUS rcdbDeleteErrorElement_IMPL(Journal *pRcDB, void *pDelete)
1009 {
1010 RMFIFOERRORELEMENT_V3* pFifoDelete = (RMFIFOERRORELEMENT_V3*)pDelete;
1011 RMCD_ERROR_BLOCK* pErrorBlock;
1012 RMCD_ERROR_BLOCK* pOldErrorBlock;
1013
1014 // Free Additional Error Block
1015 for (pErrorBlock = pFifoDelete->ErrorHeader.pErrorBlock; pErrorBlock != NULL;)
1016 {
1017 pOldErrorBlock = pErrorBlock;
1018 pErrorBlock = pErrorBlock->pNext;
1019 portMemFree(pOldErrorBlock->pBlock);
1020 portMemFree(pOldErrorBlock);
1021 }
1022
1023 // Free Error Collector
1024 portMemFree(pFifoDelete);
1025
1026 return NV_OK;
1027 }
1028
// Frees all of the ring buffers
void rcdbDestroyRingBufferCollection_IMPL(Journal *pRcDB)
{
    RING_BUFFER_LOG_COLLECTION *pColl  = &pRcDB->RingBufferColl;
    RING_BUFFER_LOG            *pEntry = pColl->pFirstEntry;
    NvU32                       count;

    // Walk the singly-linked list of ring buffers, releasing each
    // buffer's data area and then the list entry itself.
    for (count = 0; count < pColl->NumRingBuffers; count++)
    {
        RING_BUFFER_LOG *pNextEntry;

        NV_ASSERT(pEntry != NULL);
        NV_ASSERT(pEntry->pBuffer != NULL);

        portMemFree(pEntry->pBuffer);

        pNextEntry = pEntry->pNextRingBuffer;

        // Free the current ring buffer entry.
        portMemFree(pEntry);
        pEntry = pNextEntry;
    }

    // If NumRingBuffers is accurate, we must have reached the end of the list.
    NV_ASSERT(pEntry == NULL);

    // Reset the collection to its empty state.
    portMemSet(pColl, 0x00, sizeof(*pColl));
}
1056
1057
1058 static NvU32 _rcdbInsertJournalRecordToList (RmRCCommonJournal_RECORD *pList, RmRCCommonJournal_RECORD *pRecord);
1059 static void _rcdbDumpCommonJournalRecord(PRB_ENCODER *pPrbEnc,const PRB_FIELD_DESC *pFieldDesc,PRmRCCommonJournal_RECORD pRec);
1060
1061 /*!
1062 * @brief Initialize the GPU accessible flag
1063 *
 * @param[in] pGpu
1065 * @param[in] pRcDB
1066 *
1067 * @return NV_OK
1068 */
1069 NV_STATUS
rcdbDumpInitGpuAccessibleFlag_IMPL(OBJGPU * pGpu,Journal * pRcDB)1070 rcdbDumpInitGpuAccessibleFlag_IMPL
1071 (
1072 OBJGPU *pGpu,
1073 Journal *pRcDB
1074 )
1075 {
1076 pRcDB->nvDumpState.bGpuAccessible =
1077 pRcDB->nvDumpState.bRMLock &&
1078 !pGpu->bIsSOC &&
1079 !IS_VIRTUAL(pGpu) &&
1080 gpuIsGpuFullPower(pGpu) &&
1081 !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET) &&
1082 !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET) &&
1083 !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET) &&
1084 !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH) &&
1085 !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST);
1086
1087 // The GPU should be there... but make sure.
1088 if (pRcDB->nvDumpState.bGpuAccessible)
1089 {
1090 if (GPU_REG_RD32(pGpu, NV_PMC_BOOT_0) != pGpu->chipId0)
1091 {
1092 pRcDB->nvDumpState.bGpuAccessible = NV_FALSE;
1093 }
1094 }
1095
1096 return NV_OK;
1097 }
1098
1099 /*!
1100 * @brief Performs a dump of the specified system component into the given buffer.
1101 *
1102 * @param[in] pSys The system object
1103 * @param[in] component NVDUMP_IS_SYS_COMPONENT(component) must be true.
1104 * @param[in, out] pBuffer Buffer to populate with dump results
1105 * @param[in] policy Policy for buffer allocation: use this one, allocate one or count
1106 * @param[in, out] pBufferCallback Callback function for use with fixed-sized buffer encoding.
1107 * If this is NULL then pBuffer->size is assumed to be large
1108 * enough for the whole dump. Otherwise pBufferCallback is called
1109 * when the buffer is full or when a message ends, allowing the
1110 * the callback to construct the whole buffer piece by piece.
1111 *
1112 * @return NV_OK on success and specific error status on failure
1113 */
1114 NV_STATUS
rcdbDumpComponent_IMPL(OBJRCDB * pRcDB,NvU32 component,NVDUMP_BUFFER * pBuffer,NVDUMP_BUFFER_POLICY policy,PrbBufferCallback * pBufferCallback)1115 rcdbDumpComponent_IMPL
1116 (
1117 OBJRCDB *pRcDB,
1118 NvU32 component,
1119 NVDUMP_BUFFER *pBuffer,
1120 NVDUMP_BUFFER_POLICY policy,
1121 PrbBufferCallback *pBufferCallback
1122 )
1123 {
1124 NVD_STATE *pNvDumpState = &pRcDB->nvDumpState;
1125 void *pBuff;
1126 PRB_ENCODER encoder;
1127 NV_STATUS status = NV_OK;
1128 NvU8 startingDepth;
1129
1130 // Validate arguments.
1131 NV_ASSERT_OR_RETURN(pBuffer != NULL, NV_ERR_INVALID_ARGUMENT);
1132
1133 // Make sure we were not reentered.
1134 if (pNvDumpState->bDumpInProcess)
1135 return NV_ERR_STATE_IN_USE;
1136
1137 // Initialize dump state.
1138 pNvDumpState->bDumpInProcess = NV_TRUE;
1139 pNvDumpState->bugCheckCode = 0;
1140 pNvDumpState->internalCode = NVD_ERROR_CODE(NVD_EXTERNALLY_GENERATED, 0);
1141 pNvDumpState->bRMLock = rmapiLockIsOwner();
1142 pNvDumpState->bGpuAccessible = NV_FALSE;
1143 pNvDumpState->initialbufferSize = pBuffer->size;
1144 pNvDumpState->nvDumpType = NVD_DUMP_TYPE_API;
1145
1146 // Clear dump buffer.
1147 pBuffer->curNumBytes = 0;
1148
1149 // Start encoding protobuf dump message.
1150 switch (policy)
1151 {
1152 case NVDUMP_BUFFER_PROVIDED:
1153 prbEncStart(&encoder, NVDEBUG_NVDUMP, NvP64_VALUE(pBuffer->address),
1154 pBuffer->size, pBufferCallback);
1155 break;
1156 case NVDUMP_BUFFER_ALLOCATE:
1157 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
1158 prbEncStartAlloc(&encoder, NVDEBUG_NVDUMP,
1159 pBuffer->size, pBufferCallback));
1160 break;
1161 case NVDUMP_BUFFER_COUNT:
1162 prbEncStartCount(&encoder, NVDEBUG_NVDUMP, NVDUMP_MAX_DUMP_SIZE);
1163 break;
1164 default:
1165 return NV_ERR_INVALID_ARGUMENT;
1166 }
1167
1168 startingDepth = prbEncNestingLevel(&encoder);
1169
1170 switch (component)
1171 {
1172 case NVDUMP_COMPONENT_SYS_RCDB:
1173 {
1174 NV_CHECK_OK(status, LEVEL_ERROR,
1175 rcdbDumpSystemFunc(pRcDB, &encoder, pNvDumpState));
1176 break;
1177 }
1178 case NVDUMP_COMPONENT_SYS_SYSINFO:
1179 {
1180 NV_CHECK_OK(status, LEVEL_ERROR,
1181 rcdbDumpSystemInfo(pRcDB, &encoder, pNvDumpState));
1182 break;
1183 }
1184 case NVDUMP_COMPONENT_SYS_ALL:
1185 {
1186 NV_CHECK_OK(status, LEVEL_ERROR,
1187 rcdbDumpSystemInfo(pRcDB, &encoder, pNvDumpState));
1188 NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
1189 rcdbDumpSystemFunc(pRcDB, &encoder, pNvDumpState));
1190 break;
1191 }
1192 default:
1193 {
1194 NV_PRINTF(LEVEL_ERROR,
1195 "called with invalid component %u selected.\n",
1196 component);
1197 status = NV_ERR_INVALID_ARGUMENT;
1198 break;
1199 }
1200 }
1201
1202 NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
1203 prbEncUnwindNesting(&encoder, startingDepth));
1204
1205 {
1206 NvU32 gpu;
1207 OBJGPU *pGpu;
1208
1209 for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++)
1210 {
1211 pGpu = gpumgrGetGpu(gpu);
1212
1213 if ((pGpu != NULL) && IS_GSP_CLIENT(pGpu))
1214 {
1215 NV_RM_RPC_DUMP_PROTOBUF_COMPONENT(pGpu, status, &encoder,
1216 pNvDumpState, component);
1217
1218 NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR,
1219 prbEncUnwindNesting(&encoder, startingDepth));
1220 }
1221 }
1222 }
1223
1224 // Finish encoding protobuf dump message.
1225 pBuffer->curNumBytes = prbEncFinish(&encoder, &pBuff);
1226 pBuffer->address = NV_SIGN_EXT_PTR_TO_NvP64(pBuff);
1227 pNvDumpState->bDumpInProcess = NV_FALSE;
1228
1229 return status;
1230 }
1231
/*!
 * @brief Encode a TimeInfo message: timestamp frequency, current
 *        timestamp, wall-clock time and seconds since boot.
 *
 * @param[in,out] pPrbEnc       Protobuf encoder to append to.
 * @param[in]     pNvDumpState  Dump state (not read by this routine).
 * @param[in]     pFieldDesc    Field descriptor of the nested TimeInfo message.
 *
 * @return NV_OK on success or the first encoder error.
 */
static NV_STATUS
_rcdbGetTimeInfo
(
    PRB_ENCODER *pPrbEnc,
    NVD_STATE *pNvDumpState,
    const PRB_FIELD_DESC *pFieldDesc
)
{
    NvU64 timeSinceBoot;
    NvU32 sec;
    NvU32 usec;
    NV_STATUS nvStatus = NV_OK;
    NvU8 startingDepth = prbEncNestingLevel(pPrbEnc);

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, pFieldDesc));

    // Timestamp counter frequency, so raw timestamps can be converted to time.
    prbEncAddUInt64(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_TIMEINFO_TIMESTAMP_FREQ,
                    osGetTimestampFreq());

    // Add Timestamp
    prbEncAddUInt64(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_TIMEINFO_TIMESTAMP_DUMP,
                    osGetTimestamp());
    // Wall-clock time of the dump, expressed in microseconds.
    osGetCurrentTime(&sec, &usec);
    prbEncAddUInt64(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_TIMEINFO_SYSTEM_TIME_DUMP,
                    (NvU64)sec * 1000000 + usec);

    // Add time since boot in seconds.
    // NOTE(review): the divide by 1e9 implies osGetCurrentTick returns
    // nanoseconds - confirm against its definition.
    osGetCurrentTick(&timeSinceBoot);
    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_TIMEINFO_TIME_SINCE_BOOT_SEC,
                    (NvU32)(timeSinceBoot / 1000000000ULL));

    // Unwind the protobuf to the correct depth.
    NV_CHECK_OK(nvStatus, LEVEL_ERROR,
        prbEncUnwindNesting(pPrbEnc, startingDepth));

    return nvStatus;
}
1274
1275 static const char * GPU_NA_UUID = "N/A";
1276
/*!
 * @brief Encode system-wide information into the protobuf dump: time info,
 *        northbridge and CPU data, per-GPU identity, OS and driver version,
 *        GPU topology, and the current error state.
 *
 * Designed to run safely without the RM lock.
 *
 * @param[in]     pRcDB         RM journal object.
 * @param[in,out] pPrbEnc       Protobuf encoder to append to.
 * @param[in]     pNvDumpState  Dump state (internal code, bugcheck code, lock state).
 *
 * @return NV_OK on success or the first encoder/OS error encountered.
 */
NV_STATUS
rcdbDumpSystemInfo_IMPL
(
    OBJRCDB *pRcDB,
    PRB_ENCODER *pPrbEnc,
    NVD_STATE *pNvDumpState
)
{
    OBJGPU *pGpu;
    NvU8 *pGidString;
    NvU32 gpu;
    NvU32 numGpus;
    NvU32 gidStrlen;
    NvU32 sizeStr;
    NV_STATUS nvStatus = NV_OK;
    NvBool bRelease;
    NvU8 startingDepth = prbEncNestingLevel(pPrbEnc);

    OBJSYS *pSys = SYS_GET_INSTANCE();
    OBJCL *pCl = SYS_GET_CL(pSys);
    OBJGPU *pParent;
    NvU32 gpuIndex;
    NvU32 gpuMask;
    NvBool bGpuDone[NV_MAX_DEVICES];

    // All of this stuff should run OK even without the RM lock.
    // No need to check pRcDB->nvDumpState.bNoRMLock;

    switch (DRF_VAL(_NVD, _ERROR_CODE, _MAJOR, pNvDumpState->internalCode))
    {
        case NVD_GPU_GENERATED:
        case NVD_SKIP_ZERO:
            // don't report on these internal codes.
            return NV_OK;
            break;
    }

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_NVDUMP_SYSTEM_INFO));

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        _rcdbGetTimeInfo(pPrbEnc, pNvDumpState, NVDEBUG_SYSTEMINFO_TIME_INFO),
        External_Cleanup);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_BUGCHECK_COUNT,
                    pRcDB->BugcheckCount);

    // Add NorthBridge Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_NORTHBRIDGE_INFO),
        External_Cleanup);

    // Vendor/device packed into one 32-bit value (device in the high word).
    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_NORTHBRIDGEINFO_ID,
                    pCl->FHBBusInfo.vendorID |
                    (pCl->FHBBusInfo.deviceID << 16));

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_NORTHBRIDGEINFO_SSID,
                    pCl->FHBBusInfo.subvendorID |
                    (pCl->FHBBusInfo.subdeviceID << 16));

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_NORTHBRIDGE_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    //CPU Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_CPU_INFO),
        External_Cleanup);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_CPUINFO_CPU_TYPE,
                    pSys->cpuInfo.type);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_CPUINFO_CPU_CAPS,
                    pSys->cpuInfo.caps);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_CPUINFO_NUM_CPU_CORES,
                    pSys->cpuInfo.numPhysicalCpus);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_CPUINFO_NUM_LOGICAL_CPUS,
                    pSys->cpuInfo.numLogicalCpus);

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_CPU_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    //GPU Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_GPU_INFO),
        External_Cleanup);

    // Count the number of GPUs and List the gpuIds
    numGpus = 0;
    for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++)
    {
        const NvU32 gidFlags =
            DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _FORMAT, _BINARY) |
            DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _TYPE, _SHA1);

        pGpu = gpumgrGetGpu(gpu);

        if (pGpu)
        {
            numGpus++;

            prbEncAddUInt32(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_GPUINFO_GPU_ID,
                            pGpu->gpuId);

            // Prefer a freshly queried SHA1 GID; fall back to the cached
            // UUID, then to a literal "N/A" placeholder.
            nvStatus = gpuGetGidInfo(pGpu, &pGidString,
                                     &gidStrlen, gidFlags);
            if (NV_OK == nvStatus)
            {
                prbEncAddBytes(pPrbEnc,
                               NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
                               pGidString, gidStrlen);
                portMemFree(pGidString);
            }
            else if (pGpu->gpuUuid.isInitialized)
            {
                prbEncAddBytes(pPrbEnc,
                               NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
                               pGpu->gpuUuid.uuid, sizeof(pGpu->gpuUuid.uuid));
            }
            else
            {
                prbEncAddString(pPrbEnc,
                                NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
                                GPU_NA_UUID);
            }

            prbEncAddUInt32(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_GPUINFO_DEVICE_ID,
                            pGpu->idInfo.PCIDeviceID);

            prbEncAddUInt32(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_GPUINFO_PMCBOOT0,
                            pGpu->chipId0);

            prbEncAddUInt32(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_GPUINFO_SUBDEV_ID,
                            pGpu->idInfo.PCISubDeviceID);
        }
    }

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_GPUINFO_NUM_GPUS,
                    numGpus);

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_GPU_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    //OS Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_OS_INFO),
        External_Cleanup);

    nvStatus = osGetVersionDump(pPrbEnc);
    if (nvStatus != NV_OK)
        goto External_Cleanup;

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_OS_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    // Driver Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_DRIVER_INFO),
        External_Cleanup);

    // Compare against the shorter of the two strings (including NUL).
    sizeStr = (sizeof("RELEASE") < sizeof(NV_DISPLAY_DRIVER_TITLE) ?
               sizeof("RELEASE") :
               sizeof(NV_DISPLAY_DRIVER_TITLE));

    if (portMemCmp(NV_DISPLAY_DRIVER_TITLE, "RELEASE", sizeStr) == 0)
        bRelease = NV_TRUE;
    else
        bRelease = NV_FALSE;

    prbEncAddBool(pPrbEnc,
                  NVDEBUG_SYSTEMINFO_DRIVERINFO_IS_RELEASE,
                  bRelease);

    prbEncAddString(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_DRIVERINFO_VERSION,
                    NV_VERSION_STRING);

    prbEncAddString(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_DRIVERINFO_BRANCH,
                    NV_BUILD_BRANCH_VERSION);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_DRIVERINFO_CHANGELIST,
                    NV_LAST_OFFICIAL_CHANGELIST_NUM);

    // Only write previous driver version if loaded more than once.
    if (pRcDB->driverLoadCount > 1)
    {
        if (pRcDB->previousDriverVersion != NULL)
        {
            prbEncAddString(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_VERSION,
                            pRcDB->previousDriverVersion);
        }

        if (pRcDB->previousDriverBranch != NULL)
        {
            prbEncAddString(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_BRANCH,
                            pRcDB->previousDriverBranch);
        }

        prbEncAddUInt32(pPrbEnc,
                        NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_CHANGELIST,
                        pRcDB->prevDriverChangelist);
    }

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_DRIVERINFO_LOAD_COUNT,
                    pRcDB->driverLoadCount);

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_DRIVER_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    // Dump a table of
    //   Master GPU -- gpuId
    //   List all gpus involved by gpuIds
    portMemSet(bGpuDone, NV_FALSE, sizeof(bGpuDone));
    for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++)
    {
        pGpu = gpumgrGetGpu(gpu);

        if ((pGpu) && (bGpuDone[gpu] == NV_FALSE))
        {
            pParent = gpumgrGetParentGPU(pGpu);

            NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
                prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_GPU_CONFIG),
                External_Cleanup);

            prbEncAddUInt32(pPrbEnc, NVDEBUG_SYSTEMINFO_CONFIG_MASTER_ID, pParent->gpuId);
            gpuMask = gpumgrGetGpuMask(pGpu);
            gpuIndex = 0;
            pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
            while (pGpu)
            {
                prbEncAddUInt32(pPrbEnc, NVDEBUG_SYSTEMINFO_CONFIG_GPU_ID, pGpu->gpuId);

                // gpuIndex is either the next or the MAX
                // (assumes gpumgrGetNextGpu advanced gpuIndex one past the
                // GPU it just returned - TODO confirm)
                bGpuDone[gpuIndex - 1] = NV_TRUE;
                pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
            }

            NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_GPU_CONFIG
                prbEncNestedEnd(pPrbEnc),
                External_Cleanup);
        }
    }

    // Error state
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_ERROR_STATE),
        External_Cleanup);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_ERRORSTATE_BUGCHECK_CODE,
                    pNvDumpState->bugCheckCode);

    prbEncAddBool(pPrbEnc,
                  NVDEBUG_SYSTEMINFO_ERRORSTATE_GOT_RM_LOCK,
                  pNvDumpState->bRMLock);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_ERRORSTATE_DUMP_BUFFER_SIZE,
                    pNvDumpState->initialbufferSize);

    //
    // prbEncNestedEnd for NVDEBUG_SYSTEMINFO_ERROR_STATE and
    // NVDEBUG_NVDUMP_SYSTEM_INFO are handled by prbEncUnwindNesting.
    //

 External_Cleanup:
    // Unwind the protobuf to the correct depth.
    NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(nvStatus, LEVEL_ERROR,
        prbEncUnwindNesting(pPrbEnc, startingDepth));

    return nvStatus;
}
1573
1574 //
1575 // Routine to dump RcDB Debug Info
1576 //
1577 NV_STATUS
rcdbDumpSystemFunc_IMPL(OBJRCDB * pRcDB,PRB_ENCODER * pPrbEnc,NVD_STATE * pNvDumpState)1578 rcdbDumpSystemFunc_IMPL
1579 (
1580 OBJRCDB *pRcDB,
1581 PRB_ENCODER *pPrbEnc,
1582 NVD_STATE *pNvDumpState
1583 )
1584 {
1585 OBJGPU *pGpu = gpumgrGetSomeGpu();
1586
1587 switch (DRF_VAL(_NVD, _ERROR_CODE, _MAJOR, pNvDumpState->internalCode))
1588 {
1589 case NVD_GPU_GENERATED:
1590 case NVD_SKIP_ZERO:
1591 // don't report on these internal codes.
1592 return NV_OK;
1593 break;
1594 }
1595
1596 rcdbDumpJournal(pRcDB, pGpu, pPrbEnc, pNvDumpState, NVDEBUG_NVDUMP_DCL_MSG);
1597 if (pGpu != NULL)
1598 {
1599 rcdbDumpErrorCounters(pRcDB, pGpu, pPrbEnc);
1600 }
1601 else
1602 {
1603 NV_PRINTF(LEVEL_WARNING,
1604 "no GPU - won't dump ring buffers or journal\n");
1605 }
1606
1607 return NV_OK;
1608 }
1609
//
// Insert the records from the persistent system error list into the
// time-ordered list used to assemble the OCA dump. Only protobuf-formatted
// elements (RmPrbErrorInfo_V2 / RmPrbFullDump_V2) are handled; any other
// type is logged and skipped. Requires the RM lock to be held.
//
static NvU32
_rcdbInsertErrorHistoryToList(RmRCCommonJournal_RECORD *pList, NVD_STATE *pNvDumpState)
{
    OBJSYS *pSys = SYS_GET_INSTANCE();
    Journal *pRcDB = SYS_GET_RCDB(pSys);
    SYS_ERROR_INFO *pSysErrorInfo = &pRcDB->ErrorInfo;
    RMPRBERRORELEMENT_V2* pPrbErrorElement;
    RMCD_ERROR_BLOCK* pErrorBlock;
    NV_STATUS status = NV_OK;

    //
    // If we are called from the OCA dump, make sure we have the rm lock.
    // TO DO: Try to dump as much as possible without the lock.
    //
    if (!pNvDumpState->bRMLock)
        return NV_OK;

    // Get Past Exceptions
    pPrbErrorElement = (RMPRBERRORELEMENT_V2*)pSysErrorInfo->pErrorList;
    while (NULL != pPrbErrorElement)
    {
        pErrorBlock = pPrbErrorElement->ErrorHeader.pErrorBlock;
        switch (pPrbErrorElement->RmPrbErrorData.common.Header.cRecordType)
        {
            case RmPrbErrorInfo_V2:
                // Regular error record: insert into the list by time stamp.
                _rcdbInsertJournalRecordToList (pList, &(pPrbErrorElement->RmPrbErrorData.common));
                break;

            case RmPrbFullDump_V2:
                //
                // Full crash dumps are a single NvDebug.NvDump message, and
                // should be contained in a single block.
                //
                if (pErrorBlock != NULL)
                {
                    if (pErrorBlock->pNext != NULL)
                    {
                        NV_PRINTF(LEVEL_WARNING,
                                  "only one error block expected!\n");
                    }
                    _rcdbInsertJournalRecordToList (pList, &(pPrbErrorElement->RmPrbErrorData.common));
                }
                break;
            default:
                // Can only handle protobuf formatted messages
                NV_PRINTF(LEVEL_ERROR, "unknown error element type: %d\n",
                          pPrbErrorElement->RmPrbErrorData.common.Header.cRecordType);
                break;
        }
        pPrbErrorElement = (RMPRBERRORELEMENT_V2*)pPrbErrorElement->ErrorHeader.pNextError;
    }
    // status is never modified above, so this is always NV_OK (0).
    return status;
}
1663
1664 static void
_rcdbDumpCommonJournalRecord(PRB_ENCODER * pPrbEnc,const PRB_FIELD_DESC * pFieldDesc,RmRCCommonJournal_RECORD * pRec)1665 _rcdbDumpCommonJournalRecord
1666 (
1667 PRB_ENCODER *pPrbEnc,
1668 const PRB_FIELD_DESC *pFieldDesc,
1669 RmRCCommonJournal_RECORD *pRec
1670 )
1671 {
1672 NV_STATUS nvStatus = NV_OK;
1673
1674 NV_CHECK_OK(nvStatus, LEVEL_ERROR,
1675 prbEncNestedStart(pPrbEnc, pFieldDesc));
1676
1677 if (nvStatus == NV_OK)
1678 {
1679 if (pRec->timeStamp != 0)
1680 prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_TIME_STAMP, pRec->timeStamp);
1681 if (pRec->GPUTag != 0)
1682 prbEncAddUInt32(pPrbEnc, JOURNAL_COMMON_GPU_TAG, pRec->GPUTag);
1683 if (pRec->CPUTag != 0)
1684 prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_CPU_TAG, pRec->CPUTag);
1685 if (pRec->stateMask != 0)
1686 prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_STATE_MASK, pRec->stateMask);
1687 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
1688 }
1689 }
1690
1691 static void
rcdbDumpCommonAssertRecord(PRB_ENCODER * pPrbEnc,NVD_STATE * pNvDumpState,RmRCCommonAssert_RECORD * pRec,NvU32 type)1692 rcdbDumpCommonAssertRecord
1693 (
1694 PRB_ENCODER *pPrbEnc,
1695 NVD_STATE *pNvDumpState,
1696 RmRCCommonAssert_RECORD *pRec,
1697 NvU32 type
1698 )
1699 {
1700 NvU32 i;
1701
1702 prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_TYPE, type);
1703
1704 if (pRec->lastTimeStamp != 0)
1705 prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_LAST_TIME_STAMP, pRec->lastTimeStamp);
1706
1707 prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_BREAKPOINT_ADDR_HINT, pRec->breakpointAddrHint);
1708
1709 // if there is a line number, add it to the message.
1710 if (pRec->lineNum != NV_RM_ASSERT_UNKNOWN_LINE_NUM)
1711 prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_SOURCE_LINE, pRec->lineNum);
1712
1713 if (pRec->count != 1)
1714 prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_COUNT, pRec->count);
1715
1716 for (i = 0; i < NV_ARRAY_ELEMENTS(pRec->callStack); i++)
1717 {
1718 if (pRec->callStack[i] == 0)
1719 break;
1720
1721 prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_CALL_STACK, pRec->callStack[i]);
1722 }
1723 }
1724
//
// Encode a single DCL journal record into the protobuf stream, dispatching
// on the record's type. Encoder failures are logged but not propagated:
// the function always returns 0 (== NV_OK) unless the outer nested-start
// fails.
//
static NV_STATUS
_rcdbDumpDclMsgRecord(
    PRB_ENCODER *pPrbEnc,
    NVD_STATE *pNvDumpState,
    const PRB_FIELD_DESC *pFieldDesc,
    RmRCCommonJournal_RECORD *pDclRecord
)
{
    NV_STATUS nvStatus = NV_OK;

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, pFieldDesc));

    // Emit the common header fields shared by every record type.
    _rcdbDumpCommonJournalRecord(pPrbEnc, DCL_DCLMSG_COMMON, pDclRecord);

    switch (pDclRecord->Header.cRecordType)
    {
        case RmRC2SwDbgBreakpoint_V3:
        case RmRC2SwRmAssert_V3:
        {
            RmRC2SwRmAssert3_RECORD* pRecord = (RmRC2SwRmAssert3_RECORD*)pDclRecord;

            NV_CHECK_OK(nvStatus, LEVEL_ERROR,
                prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_ASSERT));
            if (nvStatus == NV_OK)
            {
                rcdbDumpCommonAssertRecord(pPrbEnc, pNvDumpState,
                    &pRecord->commonAssert, pDclRecord->Header.cRecordType);

                prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_LEVEL, pRecord->level);
                NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
            }
            break;
        }
        case RmRC2GpuTimeout_V3:
        {
            RmRC2GpuTimeout3_RECORD* pRecord = (RmRC2GpuTimeout3_RECORD*)pDclRecord;

            NV_CHECK_OK(nvStatus, LEVEL_ERROR,
                prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_ASSERT));
            if (nvStatus == NV_OK)
            {
                //
                // NOTE(review): pRecord is passed where a
                // RmRCCommonAssert_RECORD* is expected - presumably
                // RmRC2GpuTimeout3_RECORD aliases (or begins with) the
                // common assert record; confirm against its typedef.
                //
                rcdbDumpCommonAssertRecord(pPrbEnc, pNvDumpState, pRecord, pDclRecord->Header.cRecordType);
                NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
            }
            break;
        }
        case RmBadRead_V2:
        {
            RmRC2BadRead2_RECORD* pRecord = (RmRC2BadRead2_RECORD*)pDclRecord;

            NV_CHECK_OK(nvStatus, LEVEL_ERROR,
                prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_BADREAD));
            if (nvStatus == NV_OK)
            {
                prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_MEMORY_SPACE, pRecord->MemorySpace);
                prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_OFFSET, pRecord->Offset);
                prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_MASK, pRecord->Mask);
                prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_VALUE, pRecord->Value);
                prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_REASON, pRecord->Reason);
                NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
            }
            break;
        }
        case RmDclMsg:
        {
            RM_DATA_COLLECTION_RECORD *pRecord = (RM_DATA_COLLECTION_RECORD*) pDclRecord;
            // Add the bytes after RM_DATA_COLLECTION_RECORD
            // (payload length = total record size minus the header struct).
            prbEncAddBytes(pPrbEnc, pRecord->fieldDesc, (void *) (pRecord + 1),
                           pRecord->common.Header.wRecordSize - sizeof(*pRecord));
            break;
        }
        case RmJournalEngDump:
        {
            RM_DATA_COLLECTION_RECORD *pRecord = (RM_DATA_COLLECTION_RECORD*) pDclRecord;
            // Add the bytes after RM_DATA_COLLECTION_RECORD
            // (already-encoded message; concatenated rather than re-encoded).
            prbEncCatMsg(pPrbEnc, (void *)(pRecord + 1),
                         pRecord->common.Header.wRecordSize - sizeof(*pRecord));
            break;
        }
        case RmJournalBugCheck:
        {
            RmJournalBugcheck_RECORD* pRecord = (RmJournalBugcheck_RECORD*)pDclRecord;
            NV_CHECK_OK(nvStatus, LEVEL_ERROR,
                prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_BUGCHECK));
            if (nvStatus == NV_OK)
            {
                prbEncAddUInt32(pPrbEnc, JOURNAL_BUGCHECK_CODE, pRecord->bugCheckCode);
                NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
            }
            break;
        }
        case RmPrbErrorInfo_V2:
        case RmPrbFullDump_V2:
        {
            // Recover the enclosing element by backing up from the embedded
            // common record to the start of RMPRBERRORELEMENT_V2.
            RMPRBERRORELEMENT_V2* pRecord = (RMPRBERRORELEMENT_V2*)((NvU8 *)pDclRecord
                - NV_OFFSETOF(RMPRBERRORELEMENT_V2, RmPrbErrorData));
            RMCD_ERROR_BLOCK* pErrorBlock;

            // Concatenate every pre-encoded error block into the stream.
            for (pErrorBlock = pRecord->ErrorHeader.pErrorBlock;
                 (pErrorBlock != NULL); pErrorBlock = pErrorBlock->pNext)
            {
                prbEncCatMsg(pPrbEnc, (void *)pErrorBlock->pBlock,
                             pErrorBlock->blockSize);
            }
            break;
        }
        case RmNocatReport:
        {
            // currently not added to the OCA dump
            break;
        }

        default:
            // These are the only ones we know about
            NV_PRINTF(LEVEL_ERROR,
                      "unknown Dcl Record entry type: %d\n",
                      pDclRecord->Header.cRecordType);
            break;
    }

    NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
    return 0;
}
1849
1850 static NvU32
_rcdbInsertJournalRecordToList(RmRCCommonJournal_RECORD * pList,RmRCCommonJournal_RECORD * pRecord)1851 _rcdbInsertJournalRecordToList (RmRCCommonJournal_RECORD *pList, RmRCCommonJournal_RECORD *pRecord)
1852 {
1853 RmRCCommonJournal_RECORD *pCurrentRecord = pList;
1854 RmRCCommonJournal_RECORD *pNextRecord;
1855
1856 if ((NULL != pList) && (NULL != pRecord))
1857 {
1858 for (pNextRecord = (RmRCCommonJournal_RECORD *)pList->pNext; pNextRecord != pList; pNextRecord = (RmRCCommonJournal_RECORD *)pNextRecord->pNext)
1859 {
1860 if (pRecord->timeStamp < pNextRecord->timeStamp)
1861 {
1862 break;
1863 }
1864 pCurrentRecord = pNextRecord;
1865 }
1866 pRecord->pNext = pCurrentRecord->pNext;
1867 pCurrentRecord->pNext = (NvU8 *)pRecord;
1868 }
1869 return 0;
1870 }
1871
1872 // Todo: format the records into a protobuf DCL record at the source
1873 static NvU32
rcdbInsertRingBufferToList(Journal * pRcDB,RmRCCommonJournal_RECORD * pList,RING_BUFFER_LOG * pRingBuffer)1874 rcdbInsertRingBufferToList(
1875 Journal *pRcDB,
1876 RmRCCommonJournal_RECORD *pList,
1877 RING_BUFFER_LOG *pRingBuffer
1878 )
1879 {
1880 RmRCCommonJournal_RECORD *pCommon;
1881 NvU32 recordSize;
1882 NvU32 i;
1883
1884 recordSize = rcdbGetOcaRecordSizeWithHeader(pRcDB, pRingBuffer->entryType);
1885
1886 //
1887 // Order does not matter here because the record will be inserted into the
1888 // list based on the time of the record, not its postion in the buffer.
1889 //
1890 for (i = 0; i < pRingBuffer->numEntries; i++)
1891 {
1892 pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (recordSize * i));
1893
1894 _rcdbInsertJournalRecordToList (pList, pCommon);
1895 }
1896
1897 return 0; // return value should be discarded
1898 }
1899
1900 static NvU32
rcdbInsertRingBufferCollectionToList(Journal * pRcDB,RmRCCommonJournal_RECORD * pList)1901 rcdbInsertRingBufferCollectionToList(
1902 Journal *pRcDB,
1903 RmRCCommonJournal_RECORD *pList)
1904 {
1905 RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
1906 RING_BUFFER_LOG *pCurrentBuffer;
1907 NvU32 i;
1908
1909
1910 pCurrentBuffer = pRingBufferColl->pFirstEntry;
1911 for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
1912 {
1913 NvU32 recSize = pCurrentBuffer->bufferSize;
1914
1915 NV_ASSERT(pCurrentBuffer->maxEntries *
1916 rcdbGetOcaRecordSizeWithHeader(pRcDB, pCurrentBuffer->entryType) ==
1917 pCurrentBuffer->bufferSize);
1918
1919 if (recSize > 0)
1920 {
1921 rcdbInsertRingBufferToList (pRcDB, pList, pCurrentBuffer);
1922 }
1923 pCurrentBuffer = pCurrentBuffer->pNextRingBuffer;
1924 }
1925
1926 // Assert that we traversed through the entire list.
1927 NV_ASSERT(pCurrentBuffer == NULL);
1928
1929 // return value should be ignored
1930 return 0;
1931 }
1932
/*!
 * @brief Dump the RM journal (RVA header, ring buffers, error history and
 *        raw journal records) through the supplied protobuf encoder.
 *
 * @param[in] pRcDB         Journal object owning the records to dump.
 * @param[in] pGpu          GPU used for the RVA header and for timeout-bounded
 *                          waits on the ring-buffer guard; may be NULL
 *                          (RVA header is then skipped).
 * @param[in] pPrbEnc       Protobuf encoder the records are written into.
 * @param[in] pNvDumpState  Current dump state, forwarded to the record helpers.
 * @param[in] pFieldDesc    Protobuf field descriptor used for each record.
 *
 * @return Always 0; the return value is meant to be ignored by callers.
 */
NvU32
rcdbDumpJournal_IMPL
(
    OBJRCDB *pRcDB,
    OBJGPU *pGpu,
    PRB_ENCODER *pPrbEnc,
    NVD_STATE *pNvDumpState,
    const PRB_FIELD_DESC *pFieldDesc
)
{
    OS_DRIVER_BLOCK DriverBlock;
    EVENT_JOURNAL *pJournal = &pRcDB->Journal;
    NvU8 *pJournalBuff = pJournal->pBuffer;
    RmRCCommonJournal_RECORD *pRecord;
    NvU32 recSize;
    NV_STATUS nvStatus = NV_OK;
    RmRCCommonJournal_RECORD List;

    // It is OK to dump the journal entries without the RM lock.
    // No need to check pRcDB->nvDumpState.bNoRMLock;

    // Number of journal-buffer bytes actually in use.
    recSize = pJournal->BufferSize - pJournal->BufferRemaining;

    if (NULL != pGpu)
    {
        //
        // Add RVA Header, even when there are no journal records.
        // This header is required to resolve code addresses using the PDB file.
        // We can log code addresses outside of the journal entries.
        //
        NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedStart(pPrbEnc, pFieldDesc));
        if (nvStatus == NV_OK)
        {
            NV_CHECK_OK(nvStatus, LEVEL_ERROR,
                prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_RVAHEADER));
            if (nvStatus == NV_OK)
            {
                portMemSet(&DriverBlock, 0x00, sizeof(DriverBlock));
                osGetDriverBlock(pGpu->pOsGpuInfo, &DriverBlock);
                prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_DRIVER_START, (NvU64)DriverBlock.driverStart);
                prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_OFFSET, DriverBlock.offset);
                // Pointer width tells the decoder how to interpret logged addresses.
                prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_POINTER_SIZE, sizeof(pJournal));
                prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_UNIQUE_ID_HIGH, *((NvU64*) DriverBlock.unique_id));
                prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_UNIQUE_ID_LOW, *((NvU64*) (DriverBlock.unique_id + 8)));
                prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_AGE, DriverBlock.age);
                NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
            }
            NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
        }
    }

    // init the list to an empty state (circular list headed by List itself)
    portMemSet(&List, 0x00, sizeof(List));
    List.pNext = (NvU8 *)&List;

    //
    // Don't dump the ring buffers if something is adding to them.
    // If we can dump the ring buffers, hold the lock for them until the
    // dump is complete to ensure that a record is not changed mid-dump.
    //
    if (portAtomicIncrementS32(&concurrentRingBufferAccess) != 1)
    {
        //
        // If IRQL is low, spin until it gets available
        //
        if (!osIsRaisedIRQL() && (NULL != pGpu))
        {
            RMTIMEOUT timeout;
            NV_STATUS status = NV_OK;
            gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
            do {
                // Drop our claim before retrying so the concurrent writer can finish.
                portAtomicDecrementS32(&concurrentRingBufferAccess);

                // status holds the result of the previous iteration's timeout check.
                if (NV_ERR_TIMEOUT == status)
                {
                    NV_PRINTF(LEVEL_ERROR,
                              "timed out waiting for Rm journal ring buffer to be available\n");
                    DBG_BREAKPOINT();
                    return 0;
                }
                status = gpuCheckTimeout(pGpu, &timeout);
                osSpinLoop();
            } while (portAtomicIncrementS32(&concurrentRingBufferAccess) != 1);
        }
        else
        {
            //
            // NOTE(review): on this path the dump still proceeds below while a
            // writer may be mid-update in the ring buffers -- confirm this
            // best-effort behavior is intended at high IRQL.
            //
            NV_ASSERT_FAILED("Ring Buffer unavailable for dump at high irql.");
        }
    }

    // Gather ring-buffer records, then the error history, into the local list.
    rcdbInsertRingBufferCollectionToList (pRcDB, &List);

    _rcdbInsertErrorHistoryToList(&List, pNvDumpState);

    // Skip if size is smaller than a header
    while (recSize > sizeof(RmRCCommonJournal_RECORD))
    {
        pRecord = (RmRCCommonJournal_RECORD *)pJournalBuff;

        if (pRecord->Header.cRecordGroup != RmGroup)
        {
            // We only log RM related data
            NV_ASSERT(pRecord->Header.cRecordGroup == RmGroup);
            break;
        }

        // Just a safety net...
        if (pRecord->Header.wRecordSize > recSize)
        {
            break;
        }
        _rcdbInsertJournalRecordToList (&List, pRecord);

        // Advance past this record to the next header in the journal buffer.
        recSize -= pRecord->Header.wRecordSize;
        pJournalBuff += pRecord->Header.wRecordSize;
    }


    // dump out the records that have been added to the list.
    for (pRecord = (RmRCCommonJournal_RECORD *)List.pNext; pRecord != &List; pRecord = (RmRCCommonJournal_RECORD *)pRecord->pNext)
    {
        _rcdbDumpDclMsgRecord(pPrbEnc, pNvDumpState, pFieldDesc, pRecord);
    }
    // Release the ring-buffer guard taken above.
    portAtomicDecrementS32(&concurrentRingBufferAccess);

    // return value should be ignored
    return 0;
}
2061
2062 NvU32
rcdbDumpErrorCounters_IMPL(Journal * pRcDB,OBJGPU * pGpu,PRB_ENCODER * pPrbEnc)2063 rcdbDumpErrorCounters_IMPL(Journal *pRcDB, OBJGPU *pGpu, PRB_ENCODER *pPrbEnc)
2064 {
2065 NvU32 i;
2066 NvU32 rcErrTyp = RC_ERROR_COUNTER_TYPE_INVALID;
2067 NV_STATUS nvStatus = NV_OK;
2068 NvU8 startingDepth = prbEncNestingLevel(pPrbEnc);
2069
2070 // Opens NVDEBUG_NVDUMP_DCL_MSG
2071 NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
2072 prbEncNestedStart(pPrbEnc, NVDEBUG_NVDUMP_DCL_MSG),
2073 cleanupAndExit);
2074
2075 for (i = 0; i <= RC_ERROR_COUNTER_OTHER_INDEX; i++)
2076 {
2077 // For Counters
2078 rcErrTyp = pRcDB->rcErrorCounterArray[i].rcErrorType;
2079 if (rcErrTyp != RC_ERROR_COUNTER_TYPE_INVALID)
2080 {
2081 NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
2082 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_RCCOUNTER),
2083 cleanupAndExit);
2084
2085 // Write Power Event
2086 prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_RCERRORTYPE, rcErrTyp);
2087
2088 // Write Power State
2089 prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_COUNT, pRcDB->rcErrorCounterArray[i].rcErrorCount);
2090
2091 // Dump the channel ID and the last time when this error occurred on this channel ID
2092 prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_RCLASTCHID, pRcDB->rcErrorCounterArray[i].rcLastCHID);
2093 prbEncAddUInt64(pPrbEnc, RC_RCCOUNTER_RCLASTTIME, pRcDB->rcErrorCounterArray[i].rcLastTime);
2094
2095 NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
2096 prbEncNestedEnd(pPrbEnc),
2097 cleanupAndExit);
2098 }
2099 } // For Counters
2100
2101 // Close NVDEBUG_NVDUMP_DCL_MSG handled by prbEncUnwindNesting.
2102
2103 cleanupAndExit:
2104 // Unwind the protobuff to inital depth
2105 NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(nvStatus, LEVEL_ERROR,
2106 prbEncUnwindNesting(pPrbEnc, startingDepth));
2107
2108 return 0;
2109 }
2110
2111 static void
_rcdbAddRmGpuDumpCallback(void * pData)2112 _rcdbAddRmGpuDumpCallback
2113 (
2114 void *pData
2115 )
2116 {
2117 OBJSYS *pSys = SYS_GET_INSTANCE();
2118 NV_STATUS status;
2119
2120 NvU32 gpuInstance = *((NvU32 *)pData);
2121 status = osAcquireRmSema(pSys->pSema);
2122 if (status == NV_OK)
2123 {
2124 // LOCK: acquire API lock
2125 status = rmapiLockAcquire(API_LOCK_FLAGS_NONE, RM_LOCK_MODULES_DIAG);
2126 if (status == NV_OK)
2127 {
2128 // LOCK: acquire GPUs lock
2129 status = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE,
2130 RM_LOCK_MODULES_DIAG);
2131 if (status == NV_OK)
2132 {
2133 Journal *pRcDB = SYS_GET_RCDB(pSys);
2134 OBJGPU *pGpu = gpumgrGetGpu(gpuInstance);
2135
2136 //
2137 // Mark the Journal object as in the deferred dump path so we won't
2138 // re-attempt again.
2139 //
2140 pRcDB->setProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH, NV_TRUE);
2141
2142 status = rcdbAddRmGpuDump(pGpu);
2143 NV_ASSERT(status == NV_OK);
2144
2145 pRcDB->setProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH, NV_FALSE);
2146
2147 // UNLOCK: release GPUs lock
2148 rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL);
2149 }
2150 else
2151 {
2152 NV_PRINTF(LEVEL_ERROR, "failed to acquire the GPU locks!\n");
2153 }
2154 // UNLOCK: release API lock
2155 rmapiLockRelease();
2156 }
2157 else
2158 {
2159 NV_PRINTF(LEVEL_ERROR, "failed to acquire the API lock!\n");
2160 }
2161 osReleaseRmSema(pSys->pSema, NULL);
2162 }
2163 else
2164 {
2165 NV_PRINTF(LEVEL_ERROR, "failed to acquire the OS semaphore!\n");
2166 }
2167 }
2168
2169 static NV_STATUS
nvdDebuggerBufferCallback(void * pEncoder,NvBool bBufferFull)2170 nvdDebuggerBufferCallback(void *pEncoder, NvBool bBufferFull)
2171 {
2172 if (bBufferFull)
2173 {
2174 nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_BUFFER_FULL;
2175 }
2176 else
2177 {
2178 nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_END_OF_MSG;
2179 }
2180
2181 return NV_OK;
2182 }
2183
2184 /*!
2185 * @brief NvDebug kernel debugger dump control
2186 *
2187 * Allows external kernel debuggers to control the RM's dump interface
2188 * without assuming anything about the current system state.
2189 *
2190 * WARNING! This function should never be called directly!
2191 *
2192 * If correctly setup, a kernel debugger will place a processor
2193 * hardware watchpoint on the nvDumpConfig.handshake variable.
2194 * Each time this is written to, the debugger will break and get a chance
2195 * to examine the rest of the nvDumpConfig state.
2196 *
2197 * @return This function should never return! External debugger should abort it!
2198 */
static void
nvdDebuggerControlFunc(void)
{
    OBJSYS *pSys = SYS_GET_INSTANCE();
    Journal *pRcDB = SYS_GET_RCDB(pSys);
    OBJGPU *pGpu = NULL;
    NvDebugDump *pNvd = NULL;
    NVDUMP_BUFFER *pBuffer = (NVDUMP_BUFFER *)&nvDumpConfig.buffer; // discard volatile

    //
    // Process actions while debugger provides work to do.
    // Protocol: the external debugger writes a *_REQUESTED value into
    // nvDumpConfig.dumpStatus; this loop services the request and answers
    // by writing the matching *_COMPLETE (or ERROR) value back.
    //
    while (nvDumpConfig.dumpStatus != NVDUMP_STATUS_IDLE)
    {
        nvDumpConfig.rmStatus = NV_OK;

        NV_PRINTF(LEVEL_INFO,
                  "Dump triggered: gpuSelect=%u, component=%u, dumpStatus=%u\n",
                  nvDumpConfig.gpuSelect, nvDumpConfig.component,
                  nvDumpConfig.dumpStatus);

        if (NVDUMP_IS_GPU_COMPONENT(nvDumpConfig.component))
        {
            // GPU-scoped components are serviced by the selected GPU's NVD object.
            pGpu = gpumgrGetGpu(nvDumpConfig.gpuSelect);
            pNvd = GPU_GET_NVD(pGpu);

            switch (nvDumpConfig.dumpStatus)
            {
                case NVDUMP_STATUS_COUNT_REQUESTED:
                    // Sizing pass only: report how large the dump would be.
                    nvDumpConfig.rmStatus = nvdDumpComponent(
                        pGpu, pNvd, nvDumpConfig.component, pBuffer,
                        NVDUMP_BUFFER_COUNT, NULL);
                    nvDumpConfig.dumpStatus = NVDUMP_STATUS_COUNT_COMPLETE;
                    break;
                case NVDUMP_STATUS_DUMP_REQUESTED:
                    // Real dump into the debugger-provided buffer.
                    nvDumpConfig.rmStatus = nvdDumpComponent(
                        pGpu, pNvd, nvDumpConfig.component, pBuffer,
                        NVDUMP_BUFFER_PROVIDED, &nvdDebuggerBufferCallback);
                    nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_COMPLETE;
                    break;
                default:
                    NV_PRINTF(LEVEL_ERROR, "Invalid dumpStatus %u\n",
                              nvDumpConfig.dumpStatus);
                    nvDumpConfig.rmStatus = NV_ERR_INVALID_STATE;
                    nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR;
                    break;
            }
        }
        else if (NVDUMP_IS_SYS_COMPONENT(nvDumpConfig.component))
        {
            // System-scoped components are serviced by the journal object.
            switch (nvDumpConfig.dumpStatus)
            {
                case NVDUMP_STATUS_COUNT_REQUESTED:
                    nvDumpConfig.rmStatus = rcdbDumpComponent(pRcDB,
                        nvDumpConfig.component, pBuffer,
                        NVDUMP_BUFFER_COUNT, NULL);
                    nvDumpConfig.dumpStatus = NVDUMP_STATUS_COUNT_COMPLETE;
                    break;
                case NVDUMP_STATUS_DUMP_REQUESTED:
                    nvDumpConfig.rmStatus = rcdbDumpComponent(pRcDB,
                        nvDumpConfig.component, pBuffer,
                        NVDUMP_BUFFER_PROVIDED, &nvdDebuggerBufferCallback);
                    nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_COMPLETE;
                    break;
                default:
                    NV_PRINTF(LEVEL_ERROR, "Invalid dumpStatus %u\n",
                              nvDumpConfig.dumpStatus);
                    nvDumpConfig.rmStatus = NV_ERR_INVALID_STATE;
                    nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR;

                    break;
            }
        }
        else
        {
            NV_PRINTF(LEVEL_ERROR, "Invalid component %u\n",
                      nvDumpConfig.component);
            nvDumpConfig.rmStatus = NV_ERR_INVALID_PARAM_STRUCT;
            nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR;
        }
    }

    //
    // Ensure we really don't exit this function without debugger.
    // The debugger is expected to abort execution here (see function header).
    //
    while (1)
    {
        NV_PRINTF(LEVEL_ERROR, "Should never reach this point!\n");
        DBG_BREAKPOINT();
    }
}
2286
2287 /*!
2288 * @brief Release Build NV_ASSERT function
2289 *
2290 * @details Called by NV_ASSERT when the assertion fails.
2291 * By putting this logic in its own function, we save on binary size.
2292 */
2293 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX) || RMCFG_FEATURE_PLATFORM_GSP) && !defined(NV_MODS)
/*!
 * @brief Record a failed release-build NV_ASSERT in the RM journal, probe for
 *        lost GPUs, and optionally breakpoint/bugcheck per debug flags.
 *
 * @param[in] level    Status/level value recorded with the assert (0 for plain asserts).
 * @param[in] lineNum  Source line number of the failing assert.
 * @param[in] ip       Instruction pointer of the assert call site.
 */
static void _rcdbRmAssert(NvU32 level, NvU32 lineNum, NvU64 ip)
{
    RmRC2SwRmAssert3_RECORD* pRec = NULL;
    // Log the assert to the journal; on success, stash the level in the record.
    if (rcdbAddAssertJournalRecWithLine(NULL, lineNum, (void **)&pRec, RmGroup,
                            RmRC2SwRmAssert_V3, sizeof(RmRC2SwRmAssert3_RECORD),
                            level, ip) == NV_OK)
    {
        pRec->level = level;
    }

#if !defined(DEBUG) && !defined(QA_BUILD)
    {
        OBJSYS *pSys = SYS_GET_INSTANCE();

        // Add assert to NvLog. But skip when nvLog asserts to avoid stack overflow.
        if (portAtomicIncrementS32(&nvLogRecursion) == 1)
        {
            // check for GPU lost.
            rcdProbeAllGpusPresent(ip);
        }
        portAtomicDecrementS32(&nvLogRecursion);

        // Honor the system debug flag requesting a breakpoint on release asserts.
        if ((pSys != NULL) && ((NV_DEBUG_BREAK_ATTRIBUTES_ASSERT) &
            DRF_VAL(_DEBUG, _BREAK, _ATTRIBUTES, pSys->debugFlags)))
        {
            REL_DBG_BREAKPOINT_MSG("NVRM-RC: Nvidia Release NV_ASSERT Break\n");
        }
    }

    // If enabled bugcheck on assert
    osDbgBugCheckOnAssert();

#endif
}
2328
2329 //
2330 // Some param-less wrappers for rcdbXxxEx() functions.
2331 // If the params are not needed, calling these functions saves on binary size
2332 //
// Param-less wrapper: plain NV_ASSERT (level 0).
void rcdbRmAssert(NvU32 LineNum, NvU64 ip)
{
    _rcdbRmAssert(0, LineNum, ip);
}

// Param-less wrapper: NV_ASSERT carrying a status code as the level.
void rcdbRmAssertStatus(NvU32 status, NvU32 LineNum, NvU64 ip)
{
    _rcdbRmAssert(status, LineNum, ip);
}
2335
2336 #endif // (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX) || RMCFG_FEATURE_PLATFORM_GSP) && !defined(NV_MODS)
2337
2338 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
2339
2340 /*!
2341 * @brief Release Build DBGBREAKPOINT() function
2342 *
2343 * @details Called by DBGBREAKPOINT when the assertion fails.
2344 * By putting this logic in its own function, we save on binary size.
2345 */
/*!
 * @brief Record a release-build DBG_BREAKPOINT in the RM journal and
 *        optionally breakpoint/bugcheck per debug flags.
 *
 * @param[in] pGpu     GPU associated with the breakpoint, or NULL.
 * @param[in] lineNum  Source line number of the breakpoint site.
 * @param[in] level    Status/level value recorded with the breakpoint.
 * @param[in] ip       Instruction pointer of the breakpoint call site.
 */
static void _rcdbDbgBreakEx(void *pGpu, NvU32 lineNum, NvU32 level, NvU64 ip)
{
    RmRC2SwRmAssert3_RECORD* pRec = NULL;
    // Log the breakpoint to the journal; on success, stash the level in the record.
    if (rcdbAddAssertJournalRecWithLine(pGpu, lineNum, (void**)&pRec, RmGroup,
        RmRC2SwDbgBreakpoint_V3, sizeof(RmRC2SwRmAssert3_RECORD), level, ip) == NV_OK)
    {
        pRec->level = level;
    }

#if !defined(DEBUG) && !defined(QA_BUILD)
    {
        OBJSYS *pSys = SYS_GET_INSTANCE();

        // Add assert to NvLog. But skip when nvLog asserts to avoid stack overflow.
        if (portAtomicIncrementS32(&nvLogRecursion) == 1)
        {
            NV_PRINTF(LEVEL_NOTICE, "Breakpoint at 0x%llx.\n", ip);
        }
        portAtomicDecrementS32(&nvLogRecursion);

        // Honor the system debug flag requesting a breakpoint on DBG_BREAK.
        if ((pSys != NULL) && ((NV_DEBUG_BREAK_ATTRIBUTES_DBG_BREAK) &
            DRF_VAL(_DEBUG, _BREAK, _ATTRIBUTES, pSys->debugFlags)))
        {
            REL_DBG_BREAKPOINT_MSG("NVRM-RC: Nvidia Release Debug Break\n");
        }
    }
#endif

    // If enabled bugcheck on assert
    osDbgBugCheckOnAssert();
}
2377
// Param-less wrappers around _rcdbDbgBreakEx; line number is unknown at these sites.
void rcdbDbgBreak(NvU64 ip)
{
    _rcdbDbgBreakEx(NULL, NV_RM_ASSERT_UNKNOWN_LINE_NUM, 0, ip);
}

void rcdbDbgBreakGpu(void *pGpu, NvU64 ip)
{
    _rcdbDbgBreakEx(pGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, 0, ip);
}

void rcdbDbgBreakStatus(NvU32 status, NvU64 ip)
{
    _rcdbDbgBreakEx(NULL, NV_RM_ASSERT_UNKNOWN_LINE_NUM, status, ip);
}

void rcdbDbgBreakEx(void *pGpu, NvU32 status, NvU64 ip)
{
    _rcdbDbgBreakEx(pGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, status, ip);
}
2382
2383 #endif
2384
/*!
 * @brief Capture an NVD engine dump for the given component and append it to
 *        the RM journal as an RmJournalEngDump record.
 *
 * @param[in] pGpu       GPU to dump.
 * @param[in] component  NVD component identifier to dump.
 *
 * @return NV_OK on success; the status of the failing step otherwise.
 */
NV_STATUS
rcdbAddRmEngDump
(
    OBJGPU *pGpu,
    NvU32 component
)
{
    OBJSYS *pSys = SYS_GET_INSTANCE();
    Journal *pRcDB = SYS_GET_RCDB(pSys);
    NvDebugDump *pNvd = GPU_GET_NVD(pGpu);
    NVDUMP_BUFFER nvDumpBuffer = {0};
    RM_DATA_COLLECTION_RECORD *pRec;
    NV_STATUS rmStatus;
    NvU16 totalSize;

    nvDumpBuffer.size = NVDUMP_MAX_DUMP_SIZE;

    // Have NVD allocate a buffer and fill it with the component dump.
    rmStatus = nvdDumpComponent(pGpu, pNvd, component, &nvDumpBuffer,
                                NVDUMP_BUFFER_ALLOCATE, NULL);
    if (rmStatus != NV_OK)
    {
        goto rcdbAddRmEngDump_error_handle;
    }

    // Journal record sizes are NvU16; the cast may truncate (checked below).
    totalSize = (NvU16)(nvDumpBuffer.curNumBytes + sizeof(*pRec));
    //align to 8 bytes to keep the readability of RM journal
    totalSize = (totalSize + 0x7) & ~0x7;
    // check for overflow: if the NvU16 cast or the alignment wrapped,
    // totalSize is now smaller than the true size and we bail out.
    if (((NvU32)totalSize) < nvDumpBuffer.curNumBytes + sizeof(*pRec))
    {
        goto rcdbAddRmEngDump_error_handle;
    }

    rmStatus = rcdbAllocNextJournalRec(pRcDB, (NVCD_RECORD **)&pRec, RmGroup,
                                       RmJournalEngDump, totalSize);
    if (rmStatus != NV_OK)
    {
        goto rcdbAddRmEngDump_error_handle;
    }
    rcdbSetCommonJournalRecord(pGpu, &pRec->common);

    // copy the dump buffer right after the RM_DATA_COLLECTION_RECORD struct
    portMemCopy((void *)(pRec + 1), nvDumpBuffer.curNumBytes, NvP64_VALUE(nvDumpBuffer.address), nvDumpBuffer.curNumBytes);

    pRec->fieldDesc = NVDEBUG_NVDUMP_GPU_INFO;

rcdbAddRmEngDump_error_handle:
    // The NVD-allocated dump buffer is always released; the journal holds a copy.
    if (nvDumpBuffer.address != NvP64_NULL)
    {
        portMemFree(NvP64_VALUE(nvDumpBuffer.address));
    }

    return rmStatus;
}
2439
2440
2441 // Finds the ring buffer for a corresponding type. Returns error if not allocated.
2442 static void
rcdbFindRingBufferForType(Journal * pRcDB,RMCD_RECORD_TYPE recType,RING_BUFFER_LOG ** ppRingBuffer)2443 rcdbFindRingBufferForType
2444 (
2445 Journal *pRcDB,
2446 RMCD_RECORD_TYPE recType,
2447 RING_BUFFER_LOG **ppRingBuffer
2448 )
2449 {
2450 NvU32 i;
2451 RING_BUFFER_LOG *pCurrentRingBuffer = NULL;
2452 RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
2453
2454 NV_ASSERT(ppRingBuffer != NULL);
2455 *ppRingBuffer = NULL;
2456
2457 //
2458 // Loop through our ring buffer collection, and find the
2459 // ring buffer corresponding to our type.
2460 //
2461 pCurrentRingBuffer = pRingBufferColl->pFirstEntry;
2462 for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
2463 {
2464 NV_ASSERT(pCurrentRingBuffer != NULL);
2465 if (pCurrentRingBuffer->entryType == recType)
2466 {
2467 *ppRingBuffer = pCurrentRingBuffer;
2468 return;
2469 }
2470 pCurrentRingBuffer = pCurrentRingBuffer->pNextRingBuffer;
2471 }
2472
2473 NV_PRINTF(LEVEL_INFO, "Ring Buffer not found for type %d\n", recType);
2474 return;
2475 }
2476
2477 //
2478 // Creates a ring buffer capable of holding "maxEntries" number of entries, and
2479 // adds it to the ring buffer collection.
2480 // Returns a pointer to the created ring buffer so that individual modules can
2481 // examine the data on-demand easily.
2482 //
2483 //PRINT_BUFFER_LOG
2484 NvU8 *
rcdbCreateRingBuffer_IMPL(Journal * pRcDB,RMCD_RECORD_TYPE type,NvU32 maxEntries)2485 rcdbCreateRingBuffer_IMPL
2486 (
2487 Journal *pRcDB,
2488 RMCD_RECORD_TYPE type,
2489 NvU32 maxEntries
2490 )
2491 {
2492 NV_STATUS status;
2493 RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
2494 RING_BUFFER_LOG *pRingBuffer;
2495 NvU8* pBuffer = NULL;
2496 NvU32 bufferSize, entrySize;
2497
2498 rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer);
2499
2500 entrySize = rcdbGetOcaRecordSizeWithHeader(pRcDB, type);
2501 if (entrySize == 0)
2502 {
2503 NV_ASSERT(entrySize != 0);
2504 return NULL;
2505 }
2506
2507 // We need to store maxEntries number of entries. Check for overflow too
2508 if (portSafeMulU32(maxEntries, entrySize, &bufferSize) == NV_FALSE)
2509 {
2510 return NULL;
2511 }
2512
2513 if (pRingBuffer != NULL)
2514 {
2515 NvU32 totalSize;
2516
2517 if (portSafeAddU32(bufferSize, pRingBuffer->bufferSize, &totalSize) == NV_FALSE)
2518 {
2519 return NULL;
2520 }
2521
2522 bufferSize = totalSize;
2523 pRingBuffer->refCount++;
2524
2525 //
2526 // XXX The collect-all design of the ring buffers allows for
2527 // interleaved entries for different GPUs. This makes it
2528 // hard to dynamically shrink any given ring buffer as GPUs are
2529 // torn down, and requires that an upper bound be placed on
2530 // the buffer's size.
2531 //
2532 // The upper bound, as chosen, is somewhat arbitrary, but at
2533 // the time of this writing, consistent with the use of
2534 // this interface (i.e. the number of entries for each type is
2535 // the same for each GPU).
2536 //
2537 if (bufferSize > pRingBuffer->maxBufferSize)
2538 return NULL;
2539 }
2540 else
2541 {
2542 pRingBuffer = portMemAllocNonPaged(sizeof(RING_BUFFER_LOG));
2543 if (pRingBuffer == NULL)
2544 {
2545 status = NV_ERR_NO_MEMORY;
2546 NV_ASSERT(status == NV_OK);
2547 return NULL;
2548 }
2549
2550 portMemSet(pRingBuffer, 0x00, sizeof(*pRingBuffer));
2551 pRingBuffer->refCount = 1;
2552 }
2553
2554 pBuffer = portMemAllocNonPaged(bufferSize);
2555 if (pBuffer == NULL)
2556 {
2557 status = NV_ERR_NO_MEMORY;
2558 NV_ASSERT(status == NV_OK);
2559 pRingBuffer->refCount--;
2560 if (pRingBuffer->pBuffer == NULL)
2561 {
2562 portMemFree(pRingBuffer);
2563 }
2564 return NULL;
2565 }
2566
2567 // Now, initialize the entries the RING_BUFFER structure.
2568 pRingBuffer->maxEntries += maxEntries;
2569
2570 // Add the ring buffer to the beginning of the ring buffer collection.
2571 if (pRingBuffer->pBuffer == NULL)
2572 {
2573 if (portSafeMulU32(bufferSize, NV_MAX_DEVICES, &pRingBuffer->maxBufferSize) == NV_FALSE)
2574 {
2575 pRingBuffer->refCount--;
2576 if (pRingBuffer->pBuffer == NULL)
2577 {
2578 portMemFree(pRingBuffer);
2579 }
2580
2581 portMemFree(pBuffer);
2582 return NULL;
2583 }
2584
2585 pRingBuffer->maxBufferSize = (bufferSize * NV_MAX_DEVICES);
2586 pRingBuffer->entryType = type;
2587 pRingBuffer->pNextRingBuffer = pRingBufferColl->pFirstEntry;
2588 pRingBufferColl->pFirstEntry = pRingBuffer;
2589 pRingBufferColl->NumRingBuffers++;
2590 }
2591 else
2592 {
2593 NvU32 copySize;
2594
2595 if (portSafeSubU32(bufferSize, pRingBuffer->bufferSize, ©Size) == NV_FALSE)
2596 {
2597 pRingBuffer->refCount--;
2598 if (pRingBuffer->pBuffer == NULL)
2599 {
2600 portMemFree(pRingBuffer);
2601 }
2602
2603 portMemFree(pBuffer);
2604 return NULL;
2605 }
2606
2607 portMemCopy(pBuffer, copySize, pRingBuffer->pBuffer, copySize);
2608 portMemFree(pRingBuffer->pBuffer);
2609 }
2610
2611 pRingBuffer->bufferSize = bufferSize;
2612 pRingBuffer->pBuffer = pBuffer;
2613 return (NvU8 *)pRingBuffer;
2614 }
2615
2616 void
rcdbDestroyRingBuffer_IMPL(Journal * pRcDB,RMCD_RECORD_TYPE type)2617 rcdbDestroyRingBuffer_IMPL
2618 (
2619 Journal *pRcDB,
2620 RMCD_RECORD_TYPE type
2621 )
2622 {
2623 RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
2624 RING_BUFFER_LOG *pRingBuffer, *pCurrentRingBuffer;
2625 NvU32 i;
2626
2627 rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer);
2628 if (pRingBuffer == NULL)
2629 return;
2630
2631 if (--pRingBuffer->refCount > 0)
2632 return;
2633
2634 pCurrentRingBuffer = pRingBufferColl->pFirstEntry;
2635 if (pCurrentRingBuffer == pRingBuffer)
2636 {
2637 pRingBufferColl->pFirstEntry = pCurrentRingBuffer->pNextRingBuffer;
2638 }
2639 else
2640 {
2641 for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
2642 {
2643 if (pCurrentRingBuffer->pNextRingBuffer == pRingBuffer)
2644 {
2645 pCurrentRingBuffer->pNextRingBuffer =
2646 pRingBuffer->pNextRingBuffer;
2647 break;
2648 }
2649 pCurrentRingBuffer = pCurrentRingBuffer->pNextRingBuffer;
2650 }
2651 }
2652
2653 portMemFree(pRingBuffer->pBuffer);
2654 portMemFree(pRingBuffer);
2655
2656 pRingBufferColl->NumRingBuffers--;
2657 }
2658
/*
** _rcdbAllocRecFromRingBuffer allocates a buffer entry from the
** specified ring buffer.
**
** parameters:
**  pGpu        a pointer to the GPU object associated with the entry.
**  pRcDB       a pointer to the Journal that contains the ring buffers
**  type        the record type to locate a buffer for.
**
** notes:
**  it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
**  failure to do so can result in concurrency issues.
*/
2673 RmRCCommonJournal_RECORD *
_rcdbAllocRecFromRingBuffer(OBJGPU * pGpu,Journal * pRcDB,RMCD_RECORD_TYPE type)2674 _rcdbAllocRecFromRingBuffer
2675 (
2676 OBJGPU *pGpu,
2677 Journal *pRcDB,
2678 RMCD_RECORD_TYPE type
2679 )
2680 {
2681 RING_BUFFER_LOG *pRingBuffer = NULL;
2682 NvU32 newItemIndex;
2683 RmRCCommonJournal_RECORD
2684 *pCommon = NULL;
2685
2686 // Find the ring buffer for this entry in the collection.
2687 rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer);
2688
2689 if (pRingBuffer == NULL)
2690 {
2691 NV_ASSERT(0);
2692 //
2693 // There is no ring buffer allocated for this type.
2694 // Nothing we can do about it.
2695 //
2696 return NULL;
2697 }
2698
2699 newItemIndex = (pRingBuffer->numEntries + pRingBuffer->headIndex) % pRingBuffer->maxEntries;
2700
2701 // prepend the rmJournalCommon record to record.
2702 pCommon = (RmRCCommonJournal_RECORD*)(pRingBuffer->pBuffer + (rcdbGetOcaRecordSizeWithHeader(pRcDB, type) * newItemIndex));
2703 pCommon->Header.cRecordGroup = RmGroup;
2704 pCommon->Header.cRecordType = type;
2705 pCommon->Header.wRecordSize = (NvU16)rcdbGetOcaRecordSizeWithHeader(pRcDB, type);
2706 rcdbSetCommonJournalRecord(pGpu, pCommon);
2707
2708 // Increment the number of entries or advance the head index.
2709 if (pRingBuffer->numEntries < pRingBuffer->maxEntries)
2710 {
2711 ++pRingBuffer->numEntries;
2712 }
2713 else
2714 {
2715 ++(pRingBuffer->headIndex);
2716 if (pRingBuffer->headIndex >= pRingBuffer->maxEntries)
2717 {
2718 pRingBuffer->headIndex = 0;
2719 }
2720 }
2721 return pCommon;
2722 }
2723
/*
** rcdbAddRecToRingBuffer_IMPL allocates a buffer entry from the
** specified ring buffer & copies the supplied data buffer into it.
**
** parameters:
**  pGpu        a pointer to the GPU object associated with the entry.
**  pRcDB       a pointer to the Journal that contains the ring buffers
**  type        the record type to locate a buffer for.
**  recordSize  the size of the expected record
**  pRecord     a pointer to the data that will populate the new ring buffer entry.
**
** notes:
*/
2737 RmRCCommonJournal_RECORD *
rcdbAddRecToRingBuffer_IMPL(OBJGPU * pGpu,Journal * pRcDB,RMCD_RECORD_TYPE type,NvU32 recordSize,NvU8 * pRecord)2738 rcdbAddRecToRingBuffer_IMPL
2739 (
2740 OBJGPU *pGpu,
2741 Journal *pRcDB,
2742 RMCD_RECORD_TYPE type,
2743 NvU32 recordSize,
2744 NvU8 *pRecord
2745 )
2746 {
2747 RmRCCommonJournal_RECORD *pCommon = NULL;
2748
2749 NV_ASSERT(recordSize == _rcdbGetOcaRecordSize(pRcDB, type));
2750
2751 if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
2752 {
2753 pCommon = _rcdbAllocRecFromRingBuffer(pGpu, pRcDB, type);
2754 if (pCommon != NULL)
2755 {
2756 // copy the record to follow the common header.
2757 portMemCopy(&(pCommon[1]), recordSize, pRecord, recordSize);
2758 }
2759 }
2760 portAtomicDecrementS32(&concurrentRingBufferAccess);
2761
2762 return pCommon;
2763 }
2764
_rcdbGetOcaRecordSize(Journal * pRcDB,RMCD_RECORD_TYPE type)2765 static NvU32 _rcdbGetOcaRecordSize(Journal *pRcDB, RMCD_RECORD_TYPE type)
2766 {
2767 switch(type)
2768 {
2769 case RmRcDiagReport:
2770 return sizeof(RmRcDiag_RECORD);
2771 break;
2772 case RmNocatReport:
2773 return sizeof(RM_NOCAT_JOURNAL_ENTRY);
2774 break;
2775 default:
2776 return 0;
2777 }
2778 }
2779
rcdbGetOcaRecordSizeWithHeader_IMPL(Journal * pRcDB,RMCD_RECORD_TYPE type)2780 NvU32 rcdbGetOcaRecordSizeWithHeader_IMPL(Journal *pRcDB, RMCD_RECORD_TYPE type)
2781 {
2782 NvU32 recSz;
2783
2784 recSz = _rcdbGetOcaRecordSize(pRcDB, type);
2785 if (0 < recSz)
2786 {
2787 recSz += sizeof(RmRCCommonJournal_RECORD);
2788 }
2789
2790 //
2791 // On architecture like RISC-V, loads/stores need to be aligned to the
2792 // request size (1, 2, 4, 8-byte). Here, OCA record and header are stored
2793 // in a ring buffer, hence total recSz needs to be 8-byte aligned for both
2794 // producer (GSP RM) and consumer (CPU RM) of this data.
2795 //
2796 return NV_ALIGN_UP(recSz, 8);
2797 }
2798
/*!
 * @brief Capture a full OCA protobuf dump of the GPU and append it to the
 *        journal's error list; defers to a work item if buffer allocation
 *        fails (e.g. at raised IRQL).
 *
 * @param[in] pGpu  GPU to dump.
 *
 * @return NV_OK on success;
 *         NV_ERR_STATE_IN_USE if a dump is already in flight;
 *         NV_WARN_MORE_PROCESSING_REQUIRED if the dump was deferred to a
 *         work item (dump state stays marked in-use until it runs);
 *         other errors on allocation/encoding failure.
 */
NV_STATUS
rcdbAddRmGpuDump
(
    OBJGPU *pGpu
)
{
    NV_STATUS status = NV_OK;
    OBJSYS *pSys = SYS_GET_INSTANCE();
    Journal *pRcDB = SYS_GET_RCDB(pSys);
    NvDebugDump *pNvd = GPU_GET_NVD(pGpu);
    NVD_STATE *pNvDumpState = &pRcDB->nvDumpState;
    SYS_ERROR_INFO *pSysErrorInfo = &pRcDB->ErrorInfo;
    RMPRBERRORELEMENT_V2 *pPrbErrorInfo = NULL;
    RMPRBERRORELEMENT_V2 *pErrorList = NULL;
    RMCD_ERROR_BLOCK *pNewErrorBlock = NULL;
    RMERRORHEADER *pErrorHeader = NULL;
    PRB_ENCODER prbEnc;
    NvU32 bufferUsed;
    NvU8 *pBuf = NULL;

    //
    // The deferred dump codepath will block out other dumps until the DPC can
    // be executed. If this is the deferred callback attempting to do the dump,
    // carry on.
    //
    if (pNvDumpState->bDumpInProcess &&
        !pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH))
    {
        return NV_ERR_STATE_IN_USE;
    }

    prbEnc.depth = 0;
    pNvDumpState->bDumpInProcess = NV_TRUE;
    pNvDumpState->nvDumpType = NVD_DUMP_TYPE_OCA;
    pNvDumpState->bRMLock = rmapiLockIsOwner();

    rcdbDumpInitGpuAccessibleFlag(pGpu, pRcDB);

    //
    // General process:
    // 1. Start the protobuf encoder in ALLOCATE mode, and dump the data
    // 2. Allocate an error element to stick in the Journal list
    // 3. Add the protobuf dump to the error element
    // 4. Put the error element at the end of the error list on OBJRCDB
    //
    status = prbEncStartAlloc(&prbEnc, NVDEBUG_NVDUMP, NVDUMP_MAX_DUMP_SIZE,
                              NULL);
    if (status != NV_OK)
    {
        //
        // If we couldn't allocate the memory, it may be because we're at a
        // raised IRQL. It's not a great idea to be gathering a bunch of state
        // from the interrupt context anyway, so queue a work item to come back
        // later and try again.
        //
        NvU32 *pGpuInstance = NULL;

        //
        // If that's what we've already done and we're still failing, bail out
        // to avoid an infinite fail/queue-work-item loop.
        //
        if (pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH))
        {
            NV_PRINTF(LEVEL_ERROR,
                      "deferred GPU dump encoder init failed (status = 0x%x)\n",
                      status);
            goto done;
        }

        NV_PRINTF(LEVEL_INFO, "deferring GPU dump for normal context\n");

        //
        // This will be freed by the OS work item layer. We pass the GPU
        // instance as the data separately because if the GPU has fallen off
        // the bus, the OS layer may refuse to execute work items attached to
        // it. Instead, use the system work item interface and handle the GPU
        // ourselves.
        //
        pGpuInstance = portMemAllocNonPaged(sizeof(NvU32));
        if (pGpuInstance == NULL)
        {
            status = NV_ERR_NO_MEMORY;
            goto done;
        }

        *pGpuInstance = gpuGetInstance(pGpu);
        status = osQueueSystemWorkItem(_rcdbAddRmGpuDumpCallback,
                                       pGpuInstance);
        if (status != NV_OK)
        {
            portMemFree(pGpuInstance);
            goto done;
        }

        //
        // Since we've queued the work item, leave the dump state marked as in
        // use to prevent other interrupts and codepaths from attempting to
        // initiate the dump and/or queue a new work item.
        //
        return NV_WARN_MORE_PROCESSING_REQUIRED;
    }

    status = nvdDumpAllEngines(pGpu, pNvd, &prbEnc, pNvDumpState);
    if (status != NV_OK)
    {
        //
        // If the dump failed somewhere, unwind the encoder and then drop
        // through to finish it out so we can get the pointer to the
        // allocated buffer to free.
        //
        while (prbEnc.depth > 1)
        {
            prbEncNestedEnd(&prbEnc);
        }
    }

    // Finish the encoder even on failure: this yields pBuf for cleanup.
    bufferUsed = prbEncFinish(&prbEnc, (void **)&pBuf);

    if (status != NV_OK)
    {
        goto done;
    }

    // Allocate and initialize the error element
    pPrbErrorInfo = portMemAllocNonPaged(sizeof(RMPRBERRORELEMENT_V2));
    if (pPrbErrorInfo == NULL)
    {
        status = NV_ERR_NO_MEMORY;
        goto done;
    }

    portMemSet(pPrbErrorInfo, 0, sizeof(RMPRBERRORELEMENT_V2));
    pPrbErrorInfo->RmPrbErrorData.common.Header.cRecordGroup = RmGroup;
    pPrbErrorInfo->RmPrbErrorData.common.Header.cRecordType = RmPrbFullDump_V2;
    pPrbErrorInfo->RmPrbErrorData.common.Header.wRecordSize = sizeof(RMPRBERRORELEMENT_V2);
    rcdbSetCommonJournalRecord(pGpu, &(pPrbErrorInfo->RmPrbErrorData.common));
    pErrorHeader = &pPrbErrorInfo->ErrorHeader;
    pErrorHeader->pErrorBlock = NULL;

    //
    // Allocate and initialize the error "block" associated with this protobuf
    // dump
    //
    pNewErrorBlock = portMemAllocNonPaged(sizeof(RMCD_ERROR_BLOCK));
    if (pNewErrorBlock == NULL)
    {
        status = NV_ERR_NO_MEMORY;
        goto done;
    }

    portMemSet(pNewErrorBlock, 0, sizeof(RMCD_ERROR_BLOCK));
    pNewErrorBlock->pBlock = pBuf;
    pNewErrorBlock->blockSize = bufferUsed;
    pNewErrorBlock->pNext = NULL;
    pErrorHeader->pErrorBlock = pNewErrorBlock;

    // Add the error element to the Journal list (append at the tail).
    if (pSysErrorInfo->pErrorList != NULL)
    {
        pErrorList = (RMPRBERRORELEMENT_V2*)pSysErrorInfo->pErrorList;
        while (pErrorList->ErrorHeader.pNextError != NULL)
        {
            pErrorList = (RMPRBERRORELEMENT_V2*)pErrorList->ErrorHeader.pNextError;
        }

        pErrorList->ErrorHeader.pNextError = (RMFIFOERRORELEMENT_V3*)pPrbErrorInfo;
    }
    else
    {
        pSysErrorInfo->pErrorList = pPrbErrorInfo;
    }

    pSysErrorInfo->ErrorCount++;

done:
    if (status != NV_OK)
    {
        // pBuf is only valid once prbEncFinish ran; pPrbErrorInfo was not yet
        // linked into the list on any failure path, so both are safe to free.
        if (pBuf != NULL)
        {
            portMemFree(pPrbErrorInfo);
            portMemFree(pBuf);
        }
    }

    pNvDumpState->bDumpInProcess = NV_FALSE;
    return status;
}
2986
2987 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
2988 #if !defined(DEBUG) && !defined(QA_BUILD)
2989 /*
2990 */
2991 NvBool
rcdProbeGpuPresent(OBJGPU * pGpu,NvU64 ip)2992 rcdProbeGpuPresent(
2993 OBJGPU *pGpu,
2994 NvU64 ip
2995 )
2996 {
2997 NvU32 testValue;
2998 NvBool bFoundLostGpu = NV_FALSE;
2999
3000 // protect against recursion when probing the GPU.
3001 if (portAtomicIncrementS32(&probeGpuRecursion) == 1)
3002 {
3003 if (NULL != pGpu)
3004 {
3005 // is the GPU we are checking allready reported lost?
3006 if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH) &&
3007 !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
3008 {
3009 testValue = GPU_CHECK_REG_RD32(pGpu, NV_PMC_BOOT_0, (~(pGpu->chipId0)));
3010 if (testValue == GPU_REG_VALUE_INVALID)
3011 {
3012 // there shouldn't be a need to make a journal entry,
3013 // as that should have been done by GPU_CHECK_REG_RD32
3014
3015 // Add GPU lost detection to to NvLog.
3016 // But skip when nvLog asserts to avoid stack overflow.
3017 #if defined(DEBUG) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS))
3018 if (portAtomicIncrementS32(&nvLogRecursion) == 1)
3019 #endif
3020 {
3021 NV_PRINTF(LEVEL_ERROR,
3022 "found GPU %d (0x%p) inaccessible After assert\n",
3023 pGpu->gpuInstance, pGpu);
3024 }
3025 #if defined(DEBUG) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS))
3026 portAtomicDecrementS32(&nvLogRecursion);
3027 #endif
3028 bFoundLostGpu = NV_TRUE;
3029 }
3030 }
3031 }
3032 }
3033 portAtomicDecrementS32(&probeGpuRecursion);
3034 return bFoundLostGpu;
3035 }
3036
3037 NvBool
rcdProbeAllGpusPresent(NvU64 ip)3038 rcdProbeAllGpusPresent(
3039 NvU64 ip
3040 )
3041 {
3042 OBJSYS *pSys = SYS_GET_INSTANCE();
3043 NvBool bFoundLostGpu = NV_FALSE;
3044 OBJGPU *pGpu;
3045 NvU32 gpuMask;
3046 NvU32 gpuIndex = 0;
3047
3048 if (pSys->getProperty(pSys, PDB_PROP_SYS_DESTRUCTING))
3049 {
3050 return NV_FALSE;
3051 }
3052
3053 gpumgrGetGpuAttachInfo(NULL, &gpuMask);
3054 pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
3055 while (pGpu)
3056 {
3057 bFoundLostGpu = bFoundLostGpu || rcdProbeGpuPresent(pGpu, ip);
3058 pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
3059 }
3060 return bFoundLostGpu;
3061 }
3062 #endif // !defined(DEBUG) && !defined(QA_BUILD)
3063 #endif // (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)
3064
3065 void
rcdbAddCrashedFalcon(Falcon * pFlcn)3066 rcdbAddCrashedFalcon
3067 (
3068 Falcon *pFlcn
3069 )
3070 {
3071 OBJSYS *pSys = SYS_GET_INSTANCE();
3072 Journal *pRcDB = SYS_GET_RCDB(pSys);
3073
3074 pRcDB->pCrashedFlcn = pFlcn;
3075 }
3076
3077
3078 /*
3079 ** _rcdbNocatCollectContext records the context of the GPU at the time the error is reported.
3080 **
3081 ** parameters:
3082 ** pGpu pointer to GPU to be reported on.
3083 ** pContext pointer to context structure to be filled in.
3084 **
3085 ** returns:
3086 ** NV_ERR_INVALID_ARGUMENT -- pContext is NULL
3087 */
3088 NV_STATUS
_rcdbNocatCollectContext(OBJGPU * pGpu,Journal * pRcdb,NV2080_NOCAT_JOURNAL_GPU_STATE * pContext)3089 _rcdbNocatCollectContext(OBJGPU *pGpu, Journal* pRcdb, NV2080_NOCAT_JOURNAL_GPU_STATE* pContext)
3090 {
3091 NV2080_NOCAT_JOURNAL_GPU_STATE* pContextCache = NULL;
3092 const char *pTag;
3093
3094 if (pRcdb == NULL)
3095 {
3096 return NV_ERR_INVALID_ARGUMENT;
3097 }
3098
3099 // determine which tag to use.
3100 if (pRcdb->nocatJournalDescriptor.tag[0] != '\0')
3101 {
3102 pTag = (char *)pRcdb->nocatJournalDescriptor.tag;
3103 }
3104 else
3105 {
3106 pTag = NOCAT_DEFAULT_TAG_VALUE_STR;
3107 }
3108 if (pGpu == NULL)
3109 {
3110 // w/o a GPU the only thing we can do is set the tag.
3111 if (pContext != NULL)
3112 {
3113 portMemSet(pContext, 0, sizeof(*pContext));
3114
3115 portStringCopy((char *)pContext->tag,
3116 NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3117 pTag,
3118 portStringLength(pTag) + 1);
3119 }
3120 return NV_OK;
3121 }
3122 #if NOCAT_COLLECT_PERF
3123 pGpuCache = &(pGpu->nocatGpuCache);
3124 #endif
3125 pContextCache = &(pRcdb->nocatJournalDescriptor.nocatGpuState);
3126
3127 // insert tag if we have one.
3128 portStringCopy((char *)pContextCache->tag,
3129 NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3130 pTag,
3131 portStringLength(pTag) + 1);
3132
3133 if (!pContextCache->bValid)
3134 {
3135 pContextCache->deviceId = (NvU16)(DRF_VAL(_PCI, _DEVID, _DEVICE, pGpu->idInfo.PCIDeviceID));
3136 pContextCache->vendorId = (NvU16)(DRF_VAL(_PCI, _SUBID, _VENDOR, pGpu->idInfo.PCIDeviceID));
3137 pContextCache->subsystemVendor = (NvU16)(DRF_VAL(_PCI, _SUBID, _VENDOR, pGpu->idInfo.PCISubDeviceID));
3138 pContextCache->subsystemId = (NvU16)(DRF_VAL(_PCI, _SUBID, _DEVICE, pGpu->idInfo.PCISubDeviceID));
3139 pContextCache->revision = pGpu->idInfo.PCIRevisionID;
3140 pContextCache->type = pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_MOBILE);
3141 pContextCache->bMsHybrid = FLD_TEST_DRF(_JT_FUNC, _CAPS, _MSHYB_ENABLED, _TRUE,
3142 pGpu->acpiMethodData.jtMethodData.jtCaps);
3143
3144 portStringCopy((char *)pContextCache->vbiosProject, NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3145 NOCAT_UNKNOWN_STR, portStringLength(NOCAT_UNKNOWN_STR) + 1);
3146
3147 if (!osIsRaisedIRQL())
3148 {
3149 NV_STATUS status = pGpu->acpiMethodData.capsMethodData.status;
3150 if (status == NV_OK)
3151 {
3152 pContextCache->bOptimus =
3153 FLD_TEST_DRF(OP_FUNC, _OPTIMUSCAPS, _OPTIMUS_CAPABILITIES,
3154 _DYNAMIC_POWER_CONTROL, pGpu->acpiMethodData.capsMethodData.optimusCaps);
3155 }
3156
3157 pContextCache->bValid = NV_TRUE;
3158 }
3159 }
3160 if (pContext != NULL)
3161 {
3162 portMemSet(pContext, 0, sizeof(*pContext));
3163
3164 *pContext = *pContextCache;
3165
3166 pContext->bFullPower = gpuIsGpuFullPower(pGpu);
3167 pContext->bInGc6Reset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET);
3168 pContext->bInFullchipReset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET);
3169 pContext->bInSecBusReset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET);
3170 }
3171 return NV_OK;
3172 }
3173
3174 /*
3175 ** _rcdbSetTdrReason translates the reason code to a string & puts that string
3176 ** in the provided buffer.
3177 **
3178 ** parameters:
3179 ** tdrReason the reason code for the TDR
3180 ** pTdrReasonStr pointer to the place to copy the reason string to
3181 ** maxLen the size of the buffer pointed to in pTdrReasonStr.
3182 **
3183 */
_rcdbSetTdrReason(Journal * pRcdb,NvU32 tdrReason,char * pTdrReasonStr,NvU32 maxLen)3184 void _rcdbSetTdrReason
3185 (
3186 Journal *pRcdb,
3187 NvU32 tdrReason,
3188 char *pTdrReasonStr,
3189 NvU32 maxLen
3190 )
3191 {
3192 const char *pTmpStr;
3193
3194 // validate inputs.
3195 if (pRcdb == NULL)
3196 {
3197 return;
3198 }
3199
3200 // is there a string buffer & is it large enough to hold more than a NULL string
3201 if ((pTdrReasonStr == NULL) || (maxLen < 2))
3202 {
3203 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++;
3204 return;
3205 }
3206 switch (tdrReason)
3207 {
3208 case NV2080_CTRL_NOCAT_TDR_TYPE_NONE:
3209 pTmpStr = NOCAT_NA_STR;
3210 break;
3211 case NV2080_CTRL_NOCAT_TDR_TYPE_LEGACY:
3212 pTmpStr = NOCAT_LEGACY_STR;
3213 break;
3214 case NV2080_CTRL_NOCAT_TDR_TYPE_FULLCHIP:
3215 pTmpStr = NOCAT_FULLCHIP_TDR_STR;
3216 break;
3217 case NV2080_CTRL_NOCAT_TDR_TYPE_BUSRESET:
3218 pTmpStr = NOCAT_BUS_RESET_TDR_STR;
3219 break;
3220 case NV2080_CTRL_NOCAT_TDR_TYPE_GC6_RESET:
3221 pTmpStr = NOCAT_GC6_RESET_TDR_STR;
3222 break;
3223 case NV2080_CTRL_NOCAT_TDR_TYPE_SURPRISE_REMOVAL:
3224 pTmpStr = NOCAT_SURPRISE_REMOVAL_TDR_STR;
3225 break;
3226 case NV2080_CTRL_NOCAT_TDR_TYPE_UCODE_RESET:
3227 pTmpStr = NOCAT_UCODE_RESET_TDR_STR;
3228 break;
3229 default:
3230 pTmpStr = NOCAT_UNKNOWN_STR;
3231 break;
3232 }
3233 portStringCopy(pTdrReasonStr, maxLen,
3234 pTmpStr, portStringLength(pTmpStr) + 1);
3235 }
3236
3237 /*
3238 ** _rcdbAllocNocatJournalRecord allocates a buffer entry from the Journal ring buffer
3239 ** for the specified type
3240 **
3241 ** parameters:
3242 ** pGpu a pointer to the GPU object associated with the entry.
3243 ** pRcdb a pointer toe the Journal that contains the ring buffers
3244 ** type the record type to locate a buffer for.
3245 **
3246 ** returns a pointer to a record in the ring buffer, or NULL if a record could not be allocated.
3247 **
3248 ** notes:
3249 ** it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3250 ** the lock should be held until access the buffer is completed.
3251 ** failure to do so can result in concurrency issues.
3252 **
3253 ** if successful, the buffer that is returned is cleared & an id assigned.
3254 */
_rcdbAllocNocatJournalRecord(OBJGPU * pGpu,OBJRCDB * pRcdb,RmRCCommonJournal_RECORD ** ppCommon)3255 RM_NOCAT_JOURNAL_ENTRY* _rcdbAllocNocatJournalRecord
3256 (
3257 OBJGPU *pGpu,
3258 OBJRCDB *pRcdb,
3259 RmRCCommonJournal_RECORD **ppCommon
3260 )
3261 {
3262 nocatQueueDescriptor *pDesc = NULL;
3263 RmRCCommonJournal_RECORD* pCommon;
3264 RM_NOCAT_JOURNAL_ENTRY * pNocatEntry = NULL;
3265
3266 // make sure someone has the lock.
3267 if (concurrentRingBufferAccess == 0)
3268 {
3269 return NULL;
3270 }
3271
3272 pDesc = &pRcdb->nocatJournalDescriptor;
3273
3274 // Get the next record from the appropriate nocat ring buffer.
3275 pCommon = _rcdbAllocRecFromRingBuffer(
3276 pGpu,
3277 pRcdb,
3278 RmNocatReport);
3279
3280 if (pCommon != NULL)
3281 {
3282 // advance the pointer past the common header.
3283 pNocatEntry = (RM_NOCAT_JOURNAL_ENTRY*)(((NvU8*)pCommon) + sizeof(RmRCCommonJournal_RECORD));
3284
3285 // clear the record & assign an id.
3286 portMemSet(pNocatEntry, 0, sizeof(*pNocatEntry));
3287 pNocatEntry->id = pDesc->nextRecordId++;
3288 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_ALLOCATED_IDX]++;
3289 portAtomicIncrementS32(&pNocatEntry->inUse);
3290 }
3291 else
3292 {
3293 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_ALLOC_FAILED_IDX]++;
3294 }
3295 if (ppCommon != NULL)
3296 {
3297 *ppCommon = pCommon;
3298 }
3299 return pNocatEntry;
3300 }
3301
3302 /*
3303 ** _rcdbGetNocatJournalRecord returns a pointer to the requested record,
3304 ** or optionally the oldest record if the requested one is not available.
3305 **
3306 ** parameters:
3307 ** pRcdb a pointer toe the Journal that contains the ring buffers
3308 ** id id of the record we are looking for
3309 ** bExactMatch indicates if we want an exact match, or the closest record.
3310 ** ppCommon a pointer to a pointer that will hold the pointer to
3311 ** the common part of the record.
3312 ** this can be NULL
3313 ** ppReturnedNocatEntry
3314 ** a pointer to a pointer that will hold the pointer to
3315 ** the nocat part of the record
3316 ** this can be NULL
3317 **
3318 ** notes:
3319 ** it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3320 ** the lock should be held until access the buffer is completed.
3321 ** failure to do so can result in concurrency issues.
3322 */
3323 NV_STATUS
_rcdbGetNocatJournalRecord(OBJRCDB * pRcdb,NvU32 reqId,NvBool bExactMatch,RmRCCommonJournal_RECORD ** ppReturnedCommon,RM_NOCAT_JOURNAL_ENTRY ** ppReturnedNocatEntry)3324 _rcdbGetNocatJournalRecord
3325 (
3326 OBJRCDB *pRcdb,
3327 NvU32 reqId,
3328 NvBool bExactMatch,
3329 RmRCCommonJournal_RECORD
3330 **ppReturnedCommon,
3331 RM_NOCAT_JOURNAL_ENTRY
3332 **ppReturnedNocatEntry
3333 )
3334 {
3335 nocatQueueDescriptor *pDesc;
3336 RmRCCommonJournal_RECORD *pCommon = NULL;
3337 RM_NOCAT_JOURNAL_ENTRY *pNocatEntry = NULL;
3338 RING_BUFFER_LOG *pRingBuffer = NULL;
3339 NvS32 offset;
3340 NvS32 idx;
3341
3342 // make sure someone has the lock.
3343 if (concurrentRingBufferAccess == 0)
3344 {
3345 return NV_ERR_BUSY_RETRY;
3346 }
3347
3348 // is there anything to do
3349 if ((ppReturnedCommon == NULL) && (ppReturnedNocatEntry == NULL))
3350 {
3351 return NV_OK;
3352 }
3353
3354 // validate inputs.
3355 if (pRcdb == NULL)
3356 {
3357 return NV_ERR_INVALID_ARGUMENT;
3358 }
3359 pDesc = &pRcdb->nocatJournalDescriptor;
3360
3361 // assume we will fail
3362 if (ppReturnedCommon != NULL)
3363 {
3364 *ppReturnedCommon = NULL;
3365 }
3366 if (ppReturnedNocatEntry != NULL)
3367 {
3368 *ppReturnedNocatEntry = NULL;
3369 }
3370
3371 // if there is nothing in the buffer,
3372 // we can't return a record.
3373 if ((pDesc->nextRecordId - pDesc->nextReportedId) == 0)
3374 {
3375 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NO_RECORDS_IDX]++;
3376 return NV_ERR_OBJECT_NOT_FOUND;
3377 }
3378
3379 // Find the ring buffer for the diag reports
3380 rcdbFindRingBufferForType(pRcdb, RmNocatReport, &pRingBuffer);
3381 if (pRingBuffer == NULL)
3382 {
3383 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_BUFFER_IDX]++;
3384 return NV_ERR_OBJECT_NOT_FOUND;
3385 }
3386 // determine how far back from the head our record should be.
3387 offset = pDesc->nextRecordId - reqId;
3388
3389 // start of from the next record we will replace.
3390 // this will be the oldest buffer in the record,
3391 // or the next empty record, either way, we will wrap to the right one
3392 idx = pRingBuffer->headIndex;
3393
3394 // is the requested record in the buffer?
3395 if ((0 <= offset) && ((NvU16)offset <= pRingBuffer->numEntries))
3396 {
3397 // back out the offset from the newest/empty record.
3398 idx += pRingBuffer->numEntries - offset;
3399 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_MATCH_FOUND_IDX]++;
3400 }
3401 else if (bExactMatch)
3402 {
3403 // the record is not in the buffer, & we weren't asked for the closest match.
3404 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NO_MATCH_IDX]++;
3405 return NV_ERR_OBJECT_NOT_FOUND;
3406 }
3407 else
3408 {
3409 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_CLOSEST_FOUND_IDX]++;
3410 }
3411 // wrap the idx to the current size of the buffer.
3412 idx %= pRingBuffer->numEntries;
3413
3414 // get a pointer to the common record & the record from the buffer.
3415 pCommon = (RmRCCommonJournal_RECORD*)(((NvU8*)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcdb, RmNocatReport) * idx));
3416
3417 // get a pointer to the data that follows the common header, that is the record data.
3418 pNocatEntry = (RM_NOCAT_JOURNAL_ENTRY*)(((NvU8*)pCommon) + sizeof(RmRCCommonJournal_RECORD));
3419 portAtomicIncrementS32(&pNocatEntry->inUse);
3420
3421 // pass the record along
3422 if (ppReturnedCommon != NULL)
3423 {
3424 *ppReturnedCommon = pCommon;
3425 }
3426 if (ppReturnedNocatEntry != NULL)
3427 {
3428 *ppReturnedNocatEntry = pNocatEntry;
3429 }
3430 return NV_OK;
3431 }
3432 /*
3433 ** _rcdbGetNocatJournalRecord returns a pointer to the requested record,
3434 ** or optionally the oldest record if the requested one is not available.
3435 **
3436 ** parameters:
3437 ** pRcdb a pointer toe the Journal that contains the ring buffers
3438 ** id id of the record we are looking for
3439 ** bExactMatch indicates if we want an exact match, or the closest record.
3440 ** ppCommon a pointer to a pointer that will hold the pointer to
3441 ** the common part of the record.
3442 ** this can be NULL
3443 ** ppReturnedNocatEntry
3444 ** a pointer to a pointer that will hold the pointer to
3445 ** the nocat part of the record
3446 ** this can be NULL
3447 **
3448 ** notes:
3449 ** it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3450 ** the lock should be held until access the buffer is completed.
3451 ** failure to do so can result in concurrency issues.
3452 */
3453 NV_STATUS
_rcdbReleaseNocatJournalRecord(RM_NOCAT_JOURNAL_ENTRY * pNocatEntry)3454 _rcdbReleaseNocatJournalRecord
3455 (
3456 RM_NOCAT_JOURNAL_ENTRY *pNocatEntry
3457 )
3458 {
3459 if (pNocatEntry == NULL)
3460 {
3461 return NV_ERR_INVALID_ARGUMENT;
3462 }
3463 if (portAtomicDecrementS32(&pNocatEntry->inUse) != 0)
3464 {
3465 return NV_ERR_BUSY_RETRY;
3466 }
3467 return NV_OK;
3468 }
3469
3470 /*
3471 ** _rcdbGetNewestNocatJournalRecordForType returns a pointer to the newest record for the
3472 ** specified type if there is one.
3473 **
3474 ** parameters:
3475 ** pRcdb a pointer toe the Journal that contains the ring buffers
3476 ** type type of record we want.
3477 ** ppCommon a pointer to a pointer that will hold the pointer to
3478 ** the common part of the record.
3479 ** this can be NULL
3480 ** ppCommon a pointer to a pointer that will hold the pointer to
3481 ** the nocat part of the record
3482 ** this can be NULL
3483 **
3484 ** notes:
3485 ** it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
3486 ** the lock should be held until access the buffer is completed.
3487 ** failure to do so can result in concurrency issues.
3488 */
3489 NV_STATUS
_rcdbGetNewestNocatJournalRecordForType(OBJRCDB * pRcdb,NvU32 type,RmRCCommonJournal_RECORD ** ppReturnedCommon,RM_NOCAT_JOURNAL_ENTRY ** ppReturnedNocatEntry)3490 _rcdbGetNewestNocatJournalRecordForType
3491 (
3492 OBJRCDB *pRcdb,
3493 NvU32 type,
3494 RmRCCommonJournal_RECORD
3495 **ppReturnedCommon,
3496 RM_NOCAT_JOURNAL_ENTRY
3497 **ppReturnedNocatEntry
3498 )
3499 {
3500 if (type >= NV2080_NOCAT_JOURNAL_REC_TYPE_COUNT)
3501 {
3502 // we failed
3503 if (ppReturnedCommon != NULL)
3504 {
3505 *ppReturnedCommon = NULL;
3506 }
3507 if (ppReturnedNocatEntry != NULL)
3508 {
3509 *ppReturnedNocatEntry = NULL;
3510 }
3511 return NV_ERR_OBJECT_NOT_FOUND;
3512 }
3513 return _rcdbGetNocatJournalRecord(pRcdb, pRcdb->nocatJournalDescriptor.lastRecordId[type], NV_TRUE,
3514 ppReturnedCommon, ppReturnedNocatEntry);
3515 }
3516
3517 /*
3518 ** rcdbReportNextNocatJournalEntry fills in the provided Nocat Journal record with the next record
3519 ** to be reported, then updates the last reported id.
3520 **
3521 ** parameters:
3522 ** pReturnedNocatEntry a pointer to the buffer where the journal record will be transferred to
3523 **
3524 ** returns:
3525 ** NV_OK -- the record was successfully updated with the next record to report.
3526 ** NV_ERR_INVALID_ARGUMENT -- the provided pointer is NULL
3527 ** NV_ERR_OBJECT_NOT_FOUND -- we could not locate a record to report.
3528 **
3529 ** notes:
3530 ** we are transferring the record to the target location here instead of passing a pointer
3531 ** to insure the data is transferred while we hold the concurrentRingBufferAccess lock.
3532 ** failure to do so can result in concurrency issues.
3533 **
3534 ** priority is determined by the record journal queue values. the lower value has
3535 ** higher priority.
3536 **
3537 ** now that we have moved from a single entry, to a queue, we need to
3538 ** consume the entry once we report it
3539 **
3540 */
3541 NV_STATUS
rcdbReportNextNocatJournalEntry(NV2080_NOCAT_JOURNAL_RECORD * pReturnedNocatEntry)3542 rcdbReportNextNocatJournalEntry
3543 (
3544 NV2080_NOCAT_JOURNAL_RECORD
3545 *pReturnedNocatEntry
3546 )
3547 {
3548 OBJSYS *pSys = SYS_GET_INSTANCE();
3549 Journal *pRcdb = SYS_GET_RCDB(pSys);
3550 NV_STATUS status = NV_ERR_OBJECT_NOT_FOUND;
3551 nocatQueueDescriptor *pDesc;
3552 RmRCCommonJournal_RECORD *pCommon = NULL;
3553 RM_NOCAT_JOURNAL_ENTRY *pNocatEntry = NULL;
3554
3555 // validate inputs.
3556 if (pRcdb == NULL)
3557 {
3558 return NV_ERR_INVALID_ARGUMENT;
3559 }
3560 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_REQUESTED_IDX]++;
3561
3562 if (pReturnedNocatEntry == NULL)
3563 {
3564 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++;
3565 return NV_ERR_INVALID_ARGUMENT;
3566 }
3567 portMemSet(pReturnedNocatEntry, 0, sizeof(*pReturnedNocatEntry));
3568
3569 if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3570 {
3571 pDesc = &pRcdb->nocatJournalDescriptor;
3572 _rcdbGetNocatJournalRecord(pRcdb,
3573 pDesc->nextReportedId, NV_FALSE,
3574 &pCommon, &pNocatEntry);
3575 if ((pCommon != NULL) && (pNocatEntry != NULL))
3576 {
3577 // we have a record, push it into the return buffer
3578 pReturnedNocatEntry->GPUTag = pCommon->GPUTag;
3579
3580 // copy over the data into the supplied buffer.
3581 pReturnedNocatEntry->loadAddress = pDesc->loadAddress;
3582 pReturnedNocatEntry->timeStamp = pCommon->timeStamp;
3583 pReturnedNocatEntry->stateMask = pCommon->stateMask;
3584 pReturnedNocatEntry->nocatGpuState = pNocatEntry->nocatGpuState;
3585 pReturnedNocatEntry->nocatJournalEntry = pNocatEntry->nocatJournalEntry;
3586
3587 // check if we lost any records.
3588 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_DROPPED_IDX] +=
3589 pNocatEntry->id - pDesc->nextReportedId;
3590
3591 // update the NocatJournalNextReportedId
3592 pDesc->nextReportedId = pNocatEntry->id + 1;
3593 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_REPORTED_IDX]++;
3594
3595 _rcdbReleaseNocatJournalRecord(pNocatEntry);
3596 status = NV_OK;
3597
3598 }
3599 }
3600 else
3601 {
3602 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
3603 status = NV_ERR_BUSY_RETRY;
3604 }
3605 portAtomicDecrementS32(&concurrentRingBufferAccess);
3606 if ((pRcdb->nocatJournalDescriptor.lockTimestamp != 0) && (rcdbGetNocatOutstandingCount(pRcdb) == 0))
3607 {
3608 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_UNLOCKED_IDX]++;
3609 pRcdb->nocatJournalDescriptor.lockTimestamp = 0;
3610 }
3611 return status;
3612 }
3613
3614 /*
3615 ** rcdbGetNocatOutstandingCount returns the number of NOCAT events that have
3616 ** been recorded since the last reported record.
3617 **
3618 ** parameters:
3619 ** pRcdb -- a pointer to the Journal object.
3620 **
3621 ** returns:
3622 ** number of NOCAT events that have been recorded since the last reported record.
3623 ** or NV_U32_MAX if a NULL journal object pointer is provided.
3624 **
3625 ** notes:
3626 ** the returned count includes records that have been dropped due to wrapping.
3627 **
3628 */
3629 NvU32
rcdbGetNocatOutstandingCount(Journal * pRcdb)3630 rcdbGetNocatOutstandingCount(Journal *pRcdb)
3631 {
3632 NvU32 count = NV_U32_MAX;
3633 if (pRcdb != NULL)
3634 {
3635 if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3636 {
3637 count = pRcdb->nocatJournalDescriptor.nextRecordId
3638 - pRcdb->nocatJournalDescriptor.nextReportedId;
3639 }
3640 portAtomicDecrementS32(&concurrentRingBufferAccess);
3641 }
3642 return count;
3643 }
3644
3645 /*
3646 ** _rcdbSendNocatJournalNotification sends an ETW Notification that a NOCAT Journal record has been posted.
3647 **
3648 ** parameters:
3649 ** pGpu -- a pointer to the GPU object associated with teh new entry
3650 ** (may be NULL)
3651 ** pRcdb -- a pointer to the Journal object NOCAT is using.
3652 ** posted -- the number of records posted since the last record that was retrieved.
3653 ** pCommon -- a pointer to the common record header associated with the record.
3654 ** type -- the record type
3655 **
3656 ** returns:
3657 ** NV_OK -- the call to post the record was made.
3658 ** note that the call to post the record does not return a status,
3659 ** so we do not know if the call was successful.
3660 ** NV_ERR_INVALID_ARGUMENT -- one of the required pointers is NULL
3661 **
3662 */
3663 NV_STATUS
_rcdbSendNocatJournalNotification(OBJGPU * pGpu,Journal * pRcdb,NvU32 posted,RmRCCommonJournal_RECORD * pCommon,NvU32 type)3664 _rcdbSendNocatJournalNotification
3665 (
3666 OBJGPU *pGpu,
3667 Journal *pRcdb,
3668 NvU32 posted,
3669 RmRCCommonJournal_RECORD *pCommon, // todo: pass in timestamp instead of common.
3670 NvU32 type
3671 )
3672 {
3673 if ((pCommon == NULL) || (pRcdb == NULL))
3674 {
3675 return NV_ERR_INVALID_ARGUMENT;
3676 }
3677 RMTRACE_NOCAT(_REPORT_PENDING, (pGpu ? pGpu->gpuId : RMTRACE_UNKNOWN_GPUID),
3678 RmNocatReport,
3679 posted,
3680 type,
3681 rcdbGetNocatOutstandingCount(pRcdb),
3682 pCommon->timeStamp);
3683
3684 // count the number of notifications.
3685 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NOTIFICATIONS_IDX]++;
3686 return NV_OK;
3687 }
3688
3689 /*
3690 ** rcdbInitNocatGpuCache_IMPL initializes a per GPU cache held in the GPU object to be used by NOCAT
3691 **
3692 ** parameters:
3693 ** pGpu -- a pointer to the GPU Object the containing the cache
3694 **
3695 ** notes:
3696 ** this function:
3697 ** * caches the driver load address
3698 ** * allocates a small block of memory in the frame buffer for testing
3699 ** * initializes the GPU context cache
3700 **
3701 */
rcdbInitNocatGpuCache_IMPL(OBJGPU * pGpu)3702 void rcdbInitNocatGpuCache_IMPL(OBJGPU *pGpu)
3703 {
3704 OS_DRIVER_BLOCK driverBlock;
3705 OBJSYS *pSys = SYS_GET_INSTANCE();
3706 Journal *pRcdb = SYS_GET_RCDB(pSys);
3707 #if NOCAT_PROBE_FB_MEMORY
3708 NvU8 *pCpuPtr;
3709 NV_STATUS status;
3710 #endif
3711
3712 if (pGpu == NULL)
3713 {
3714 return;
3715 }
3716 portMemSet(&driverBlock, 0x00, sizeof(driverBlock));
3717 if (osGetDriverBlock(pGpu->pOsGpuInfo, &driverBlock) == NV_OK)
3718 {
3719 pRcdb->nocatJournalDescriptor.loadAddress = (NvU64)driverBlock.driverStart;
3720 }
3721
3722 #if NOCAT_PROBE_FB_MEMORY
3723 // Allocate some memory for virtual BAR2 testing
3724 if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_ALL_INST_IN_SYSMEM) && !IsAMODEL(pGpu))
3725 {
3726 memdescCreateExisting(&pGpu->nocatGpuCache.fbTestMemDesc,
3727 pGpu, NOCAT_FBSIZETESTED, ADDR_FBMEM, NV_MEMORY_UNCACHED, MEMDESC_FLAGS_NONE);
3728 memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_102,
3729 (&pGpu->nocatGpuCache.fbTestMemDesc));
3730 if (status != NV_OK)
3731 {
3732 NV_PRINTF(LEVEL_ERROR, "Could not allocate vidmem for NOCAT bar2 testing\n");
3733 return;
3734 }
3735 pCpuPtr = kbusMapRmAperture_HAL(pGpu, &pGpu->nocatGpuCache.fbTestMemDesc);
3736 if (pCpuPtr == NULL)
3737 {
3738 memdescFree(&pGpu->nocatGpuCache.fbTestMemDesc);
3739 memdescDestroy(&pGpu->nocatGpuCache.fbTestMemDesc);
3740 pGpu->nocatGpuCache.pCpuPtr = NULL;
3741 return;
3742 }
3743 pGpu->nocatGpuCache.pCpuPtr = pCpuPtr;
3744 }
3745 #endif
3746 // initialize the context cache
3747 if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3748 {
3749 _rcdbNocatCollectContext(pGpu, pRcdb, NULL);
3750 }
3751 portAtomicDecrementS32(&concurrentRingBufferAccess);
3752
3753 return;
3754 }
3755
3756 /*
3757 ** rcdbCleanupNocatGpuCache_IMPL returns per GPU resources used by NOCAT.
3758 **
3759 ** parameters:
3760 ** pGpu -- a pointer to the GPU Object the containing the cache
3761 **
3762 ** notes:
3763 ** This will free up the FB test window if allocated, and clear out the cache
3764 **
3765 */
rcdbCleanupNocatGpuCache_IMPL(OBJGPU * pGpu)3766 void rcdbCleanupNocatGpuCache_IMPL(OBJGPU *pGpu)
3767 {
3768 #if NOCAT_PROBE_FB_MEMORY
3769 if (pGpu == NULL)
3770 {
3771 return;
3772 }
3773 if (pGpu->nocatGpuCache.pCpuPtr != NULL)
3774 {
3775 kbusUnmapRmApertureWithFlags_HAL(pGpu, &pGpu->nocatGpuCache.fbTestMemDesc,
3776 &pGpu->nocatGpuCache.pCpuPtr, TRANSFER_FLAGS_NONE);
3777 memdescFree(&pGpu->nocatGpuCache.fbTestMemDesc);
3778 memdescDestroy(&pGpu->nocatGpuCache.fbTestMemDesc);
3779 }
3780 portMemSet(&pGpu->nocatGpuCache, 0, sizeof(pGpu->nocatGpuCache));
3781 #endif
3782
3783 return;
3784 }
3785
3786
3787
3788 /*
3789 ** rcdbNocatInsertNocatError records a reported NOCAT error
3790 **
3791 ** parameters:
3792 ** pGpu Pointer to GPU associated with the error
3793 ** may be NULL if there is no GPU associated with the error
3794 ** if NULL the primary GPU is used
3795 ** pNewEntry A pointer to a structure that contains all the available data for the report
3796 */
3797 NvU32
rcdbNocatInsertNocatError(OBJGPU * pGpu,NOCAT_JOURNAL_PARAMS * pNewEntry)3798 rcdbNocatInsertNocatError(
3799 OBJGPU *pGpu,
3800 NOCAT_JOURNAL_PARAMS *pNewEntry
3801 )
3802 {
3803 OBJSYS *pSys = SYS_GET_INSTANCE();
3804 Journal *pRcdb = SYS_GET_RCDB(pSys);
3805 #if(NOCAT_PROBE_FB_MEMORY)
3806 NvBool bCheckFBState = NV_FALSE;
3807 #endif
3808 RmRCCommonJournal_RECORD *pCommon = NULL;
3809 RM_NOCAT_JOURNAL_ENTRY *pNocatEntry = NULL;
3810 NvU32 id = INVALID_RCDB_NOCAT_ID;
3811 const char *pSource = NULL;
3812 NvU32 diagBufferLen = 0;
3813 const char *pFaultingEngine = NULL;
3814 NvBool postRecord;
3815 // validate inputs.
3816 if (pRcdb == NULL)
3817 {
3818 return NV_ERR_INVALID_ARGUMENT;
3819 }
3820 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_REQ_IDX]++;
3821 if (pNewEntry == NULL)
3822 {
3823 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++;
3824 return 0;
3825 }
3826 // assign a timestamp if none was provided
3827 if (pNewEntry->timestamp == 0)
3828 {
3829 pNewEntry->timestamp = osGetTimestamp();
3830 }
3831
3832 // initially set postRecord based on the current state of the lock;
3833 postRecord = pRcdb->nocatJournalDescriptor.lockTimestamp == 0;
3834
3835 // perform any record type specific setup
3836 switch (pNewEntry->recType)
3837 {
3838 case NV2080_NOCAT_JOURNAL_REC_TYPE_BUGCHECK:
3839 #if(NOCAT_PROBE_FB_MEMORY)
3840 bCheckFBState = NV_TRUE;
3841 #endif
3842 // fall thru
3843
3844 case NV2080_NOCAT_JOURNAL_REC_TYPE_TDR:
3845 // lock the journal so we don't wrap over the record we are inserting.
3846 if (pRcdb->nocatJournalDescriptor.lockTimestamp == 0)
3847 {
3848 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_LOCKED_IDX]++;
3849 }
3850 else
3851 {
3852 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_LOCK_UPDATED_IDX]++;
3853 }
3854
3855 pRcdb->nocatJournalDescriptor.lockTimestamp = pNewEntry->timestamp;
3856 postRecord = NV_TRUE;
3857 break;
3858
3859 case NV2080_NOCAT_JOURNAL_REC_TYPE_RC:
3860 #if(NOCAT_PROBE_FB_MEMORY)
3861 bCheckFBState = NV_TRUE;
3862 #endif
3863 // set the source
3864 pSource = "RC Error";
3865 break;
3866
3867 case NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT:
3868 // set the source
3869 pSource = "ASSERT";
3870 break;
3871
3872 case NV2080_NOCAT_JOURNAL_REC_TYPE_ENGINE:
3873 break;
3874
3875 case NV2080_NOCAT_JOURNAL_REC_TYPE_UNKNOWN:
3876 default:
3877 return 0;
3878 break;
3879 }
3880 // check if we should post the record when locked.
3881 if (!postRecord)
3882 {
3883 if ((NvS64)(pNewEntry->timestamp - pRcdb->nocatJournalDescriptor.lockTimestamp) < 0)
3884 {
3885 // the record predates the lock, so it's Grandfathered in.
3886 postRecord = NV_TRUE;
3887 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_GRANDFATHERED_RECORD_IDX]++;
3888 }
3889 else
3890 {
3891 // we are dropping the record, count that.
3892 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_LOCKED_OUT_IDX]++;
3893 }
3894 }
3895 if (postRecord)
3896 {
3897 // is the buffer available?
3898 if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
3899 {
3900 // start recording this new record by allocating a record from the buffer.
3901 pNocatEntry = _rcdbAllocNocatJournalRecord(pGpu, pRcdb, &pCommon);
3902 if (pNocatEntry != NULL)
3903 {
3904 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECTED_IDX]++;
3905
3906 // update the time stamp to the one supplied.
3907 pCommon->timeStamp = pNewEntry->timestamp;
3908
3909 // save the record Id for the type.
3910 pRcdb->nocatJournalDescriptor.lastRecordId[pNewEntry->recType] =
3911 pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ANY] =
3912 pRcdb->nocatJournalDescriptor.nextRecordId - 1;
3913
3914 // set the type.
3915 pNocatEntry->nocatJournalEntry.recType = pNewEntry->recType;
3916
3917 // set bugcheck
3918 pNocatEntry->nocatJournalEntry.bugcheck = pNewEntry->bugcheck;
3919
3920 // get context
3921 _rcdbNocatCollectContext(pGpu, pRcdb, &(pNocatEntry->nocatGpuState));
3922
3923 #if(NOCAT_PROBE_FB_MEMORY)
3924 if ((bCheckFBState)
3925 && (pGpu != NULL)
3926 && (pGpu->nocatGpuCache.pCpuPtr != NULL)
3927 // If using Coherent CPU mapping instead of BAR2 do not call VerifyBar2
3928 && !pGpu->getProperty(pGpu, PDB_PROP_GPU_COHERENT_CPU_MAPPING))
3929 {
3930 switch (kbusVerifyBar2_HAL(pGpu, GPU_GET_KERNEL_BUS(pGpu),
3931 &pGpu->nocatGpuCache.fbTestMemDesc, pGpu->nocatGpuCache.pCpuPtr, 0, NOCAT_FBSIZETESTED))
3932 {
3933 case NV_OK: // everything passed
3934 break;
3935
3936 case NV_ERR_MEMORY_ERROR: // BAR 0 failed & BAR 2 was not checked, or BAR 2 failed
3937 // for now we don't know which BAR failed, so mark both.
3938 // but only one BAR failed.
3939 // (if BAR 0 Failed, BAR 2 was not checked)
3940 pCommon->stateMask |=
3941 NV_RM_JOURNAL_STATE_MASK_VIDMEM_FAILED_BAR0
3942 | NV_RM_JOURNAL_STATE_MASK_VIDMEM_FAILED_BAR2;
3943 break;
3944
3945 default: // some other processing error cause us to not test the BAR
3946 break;
3947 }
3948 }
3949 #endif
3950 // is there a valid string for source?
3951 // (non NULL ptr & more than just a termination)
3952 if ((pNewEntry->pSource != NULL) && (pNewEntry->pSource[0] != '\0'))
3953 {
3954 // yes, use that.
3955 pSource = pNewEntry->pSource;
3956 }
3957 // the caller did not supply a source,
3958 // did we set a default source based on record type?
3959 else if (pSource == NULL)
3960 {
3961 // no, supply the unknown string for source.
3962 pSource = NOCAT_UNKNOWN_STR;
3963 }
3964 portStringCopy((char*)pNocatEntry->nocatJournalEntry.source,
3965 NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3966 pSource,
3967 portStringLength(pSource) + 1);
3968
3969 pNocatEntry->nocatJournalEntry.subsystem = pNewEntry->subsystem;
3970 pNocatEntry->nocatJournalEntry.errorCode = pNewEntry->errorCode;
3971
3972 if ((pNewEntry->pDiagBuffer != NULL) && (pNewEntry->diagBufferLen != 0))
3973 {
3974 // checking length here as we don't want portMemCopy to assert
3975 if (pNewEntry->diagBufferLen < NV_ARRAY_ELEMENTS(pNocatEntry->nocatJournalEntry.diagBuffer))
3976 {
3977 diagBufferLen = pNewEntry->diagBufferLen;
3978 }
3979 else
3980 {
3981 // make best effort
3982 diagBufferLen = NV_ARRAY_ELEMENTS(pNocatEntry->nocatJournalEntry.diagBuffer);
3983 }
3984 portMemCopy(pNocatEntry->nocatJournalEntry.diagBuffer,
3985 sizeof(pNocatEntry->nocatJournalEntry.diagBuffer),
3986 pNewEntry->pDiagBuffer, diagBufferLen);
3987 }
3988 pNocatEntry->nocatJournalEntry.diagBufferLen = diagBufferLen;
3989
3990 pFaultingEngine = pNewEntry->pFaultingEngine != NULL ?
3991 pNewEntry->pFaultingEngine : NOCAT_UNKNOWN_STR;
3992
3993 portStringCopy((char*)pNocatEntry->nocatJournalEntry.faultingEngine,
3994 NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
3995 pFaultingEngine, portStringLength(pFaultingEngine) + 1);
3996
3997 _rcdbSetTdrReason(pRcdb, pNewEntry->tdrReason,
3998 (char*)pNocatEntry->nocatJournalEntry.tdrReason,
3999 sizeof(pNocatEntry->nocatJournalEntry.tdrReason));
4000
4001 _rcdbReleaseNocatJournalRecord(pNocatEntry);
4002 }
4003 else
4004 {
4005 // record was not allocated, bail.
4006 postRecord = NV_FALSE;
4007 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_FAILED_IDX]++;
4008 }
4009 }
4010 else
4011 {
4012 // we are busy, so we can't insert the record, count the record as dropped & count the busy.
4013 postRecord = NV_FALSE;
4014 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
4015 }
4016 portAtomicDecrementS32(&concurrentRingBufferAccess);
4017 }
4018
4019 // no matter what happened, trigger the event to indicate a record was processed.
4020 _rcdbSendNocatJournalNotification(pGpu, pRcdb, postRecord, pCommon, pNewEntry->recType);
4021
4022 return id;
4023 }
4024 /*
4025 ** rcdbNocatInsertBugcheck is the interface to record a bugcheck NOCAT report
4026 **
4027 ** parameters:
4028 ** deviceInstance The instance of the GPU associated with the bugcheck.
4029 ** bugcheck The bugcheck number
4030 */
4031 NvU32
rcdbNocatInsertBugcheck(NvU32 deviceInstance,NvU32 bugCheckCode)4032 rcdbNocatInsertBugcheck
4033 (
4034 NvU32 deviceInstance,
4035 NvU32 bugCheckCode)
4036 {
4037 NOCAT_JOURNAL_PARAMS newEntry;
4038
4039 portMemSet(&newEntry, 0, sizeof(newEntry));
4040 newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_BUGCHECK;
4041 newEntry.bugcheck = bugCheckCode;
4042 newEntry.pSource = "OS";
4043 newEntry.errorCode = bugCheckCode;
4044 return rcdbNocatInsertNocatError(gpumgrGetGpu(deviceInstance), &newEntry);
4045 }
4046
4047 /*
4048 ** rcdbNocatInitEngineErrorEvent initializes a parameter structure for an engine error event
4049 **
4050 ** parameters:
4051 ** pNewEntry Pointer to event parameter structure to be initialized
4052 */
4053 NV_STATUS
rcdbNocatInitEngineErrorEvent(NOCAT_JOURNAL_PARAMS * pNewEntry)4054 rcdbNocatInitEngineErrorEvent
4055 (
4056 NOCAT_JOURNAL_PARAMS *pNewEntry
4057 )
4058 {
4059 if (pNewEntry == NULL)
4060 {
4061 return NV_ERR_INVALID_ARGUMENT;
4062 }
4063 portMemSet(pNewEntry, 0, sizeof(*pNewEntry));
4064 pNewEntry->recType = NV2080_NOCAT_JOURNAL_REC_TYPE_ENGINE;
4065 return NV_OK;
4066 }
4067
4068 /*
4069 ** rcdbNocatInsertEngineError records a reported NOCAT error from an engine,
4070 **
4071 ** parameters:
4072 ** pGpu Pointer to GPU associated with the error
4073 ** may be NULL if there is no GPU associated with the error
4074 ** if NULL the primary GPU is used
4075 ** pSource A string indicating the reporting source of the error.
4076 ** if NULL, a default values will be used
4077 ** subsystem The optional subsystem ID used by the source to identify the error
4078 ** errorCode The error code
4079 ** pDiagBuffer A pointer to the diagnostic buffer associated with the error
4080 ** may be NULL
4081 ** diagBufferLen The size of the diagnostic buffer
4082 ** if the size exceeds the supported diagBuffer size, the buffer contents will be truncated to fit.
4083 */
4084 NvU32
rcdbNocatInsertEngineError(OBJGPU * pGpu,const char * pSource,NvU32 subsystem,NvU64 errorCode,NvU8 * pDiagBuffer,NvU32 diagBufferLen)4085 rcdbNocatInsertEngineError(
4086 OBJGPU *pGpu,
4087 const char *pSource,
4088 NvU32 subsystem,
4089 NvU64 errorCode,
4090 NvU8 *pDiagBuffer,
4091 NvU32 diagBufferLen
4092 )
4093 {
4094 NOCAT_JOURNAL_PARAMS newEntry;
4095
4096 rcdbNocatInitEngineErrorEvent(&newEntry);
4097 newEntry.pSource = pSource;
4098 newEntry.subsystem = subsystem;
4099 newEntry.errorCode = errorCode;
4100 newEntry.pDiagBuffer = pDiagBuffer;
4101 newEntry.diagBufferLen = diagBufferLen;
4102 return rcdbNocatInsertNocatError(pGpu, &newEntry);
4103 }
4104
4105 /*
4106 ** rcdbNocatInsertTDRError records an TDR error,
4107 **
4108 ** parameters:
4109 ** pGpu Pointer to GPU associated with the error
4110 ** may be NULL if there is no GPU associated with the error
4111 ** if NULL the primary GPU is used
4112 ** pSource A string indicating the reporting source of the error.
4113 ** if NULL, a default values will be used
4114 ** subsystem The optional subsystem ID used by the source to identify the error
4115 ** errorCode The error code
4116 ** TDRBucket The TDR bucket
4117 ** pDiagBuffer A pointer to the diagnostic buffer associated with the error
4118 ** may be NULL
4119 ** diagBufferLen The size of the diagnostic buffer
4120 ** if the size exceeds the supported diagBuffer size,
4121 ** the buffer contents will be truncated to fit.
4122 ** tdrReason A reason code for the TDR
4123 ** pFaultingApp A pointer to the faulting app name if known
4124 */
4125 NvU32
rcdbNocatInsertTDRError(OBJGPU * pGpu,const char * pSource,NvU32 subsystem,NvU64 errorCode,NvU32 TdrBucket,NvU8 * pDiagBuffer,NvU32 diagBufferLen,NvU32 tdrReason,const char * pFaultingEngine)4126 rcdbNocatInsertTDRError
4127 (
4128 OBJGPU *pGpu,
4129 const char *pSource,
4130 NvU32 subsystem,
4131 NvU64 errorCode,
4132 NvU32 TdrBucket,
4133 NvU8 *pDiagBuffer,
4134 NvU32 diagBufferLen,
4135 NvU32 tdrReason,
4136 const char *pFaultingEngine
4137 )
4138 {
4139 NOCAT_JOURNAL_PARAMS newEntry;
4140
4141 portMemSet(&newEntry, 0, sizeof(newEntry));
4142 newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_TDR;
4143 newEntry.pSource = pSource;
4144 newEntry.subsystem = subsystem;
4145 newEntry.errorCode = errorCode;
4146 newEntry.pDiagBuffer = pDiagBuffer;
4147 newEntry.diagBufferLen = diagBufferLen;
4148 newEntry.pFaultingEngine = pFaultingEngine;
4149 return rcdbNocatInsertNocatError(pGpu, &newEntry);
4150 }
4151 NV_STATUS
rcdbNocatInitRCErrorEvent(NOCAT_JOURNAL_PARAMS * pNewEntry)4152 rcdbNocatInitRCErrorEvent
4153 (
4154 NOCAT_JOURNAL_PARAMS *pNewEntry
4155 )
4156 {
4157 if (pNewEntry == NULL)
4158 {
4159 return NV_ERR_INVALID_ARGUMENT;
4160 }
4161 portMemSet(pNewEntry, 0, sizeof(*pNewEntry));
4162 pNewEntry->recType = NV2080_NOCAT_JOURNAL_REC_TYPE_RC;
4163 pNewEntry->pSource = "RC ERROR";
4164 return NV_OK;
4165 }
4166
4167 /*
4168 ** _rcdbNocatReportAssert adds an assert record.
4169 **
4170 ** parameters:
4171 ** pGpu Pointer to GPU associated with the error
4172 ** may be NULL
4173 ** pAssertRec A pointer to the assert to report
4174 */
4175 NV_STATUS
_rcdbNocatReportAssert(OBJGPU * pGpu,RmRCCommonAssert_RECORD * pAssertRec)4176 _rcdbNocatReportAssert
4177 (
4178 OBJGPU *pGpu,
4179 RmRCCommonAssert_RECORD *pAssertRec
4180 )
4181 {
4182 OBJSYS *pSys = SYS_GET_INSTANCE();
4183 Journal *pRcdb = SYS_GET_RCDB(pSys);
4184 NOCAT_JOURNAL_PARAMS newEntry;
4185 RM_NOCAT_ASSERT_DIAG_BUFFER diagBuffer;
4186 RM_NOCAT_ASSERT_DIAG_BUFFER *pDiagData;
4187 NvU32 idx;
4188 RM_NOCAT_JOURNAL_ENTRY *pNocatEntry = NULL;
4189 NvU32 gpuCnt= 0;
4190 OBJGPU *pTmpGpu = gpumgrGetGpu(0);
4191
4192 // validate inputs.
4193 if ((pRcdb == NULL) || (pAssertRec == NULL))
4194 {
4195 return NV_ERR_INVALID_ARGUMENT;
4196 }
4197 if (pGpu == NULL)
4198 {
4199 // we don't have a GPU, if there is only 1 GPU,
4200 // we can safely use it for logging this assert
4201 gpumgrGetGpuAttachInfo(&gpuCnt, NULL);
4202 if (gpuCnt == 1)
4203 {
4204 pGpu = pTmpGpu;
4205 }
4206 }
4207
4208 // start off assuming we will be recording a report
4209 portMemSet(&newEntry, 0, sizeof(newEntry));
4210 newEntry.timestamp = pAssertRec->common.timeStamp;
4211 newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT;
4212 newEntry.pSource = "ASSERT";
4213
4214 // save the assert point as the error code.
4215 newEntry.errorCode =
4216 (NvU32)((pAssertRec->breakpointAddrHint - pRcdb->nocatJournalDescriptor.loadAddress)
4217 & 0xffffffff);
4218
4219 // put the line number in the upper 32 bits.
4220 newEntry.errorCode |= ((NvU64)pAssertRec->lineNum) << 32;
4221
4222 for (idx = 0; idx < NV_ARRAY_ELEMENTS(pAssertRec->callStack); idx++)
4223 {
4224 diagBuffer.callStack[idx] =
4225 (NvU32)((pAssertRec->callStack[idx] - pRcdb->nocatJournalDescriptor.loadAddress)
4226 & 0xffffffff);
4227 }
4228 // initialize count
4229 diagBuffer.count = 1;
4230
4231 // setup the pointer to our diag buffer & its length
4232 newEntry.pDiagBuffer = (NvU8 *)&diagBuffer;
4233
4234 newEntry.diagBufferLen = sizeof(diagBuffer);
4235
4236 // is the last thing we logged an assert, & is this the same assert?
4237 if ((pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT]
4238 == pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ANY])
4239 && (0 == portMemCmp(&pRcdb->nocatJournalDescriptor.lastAssertData,
4240 diagBuffer.callStack, // same stack
4241 sizeof(diagBuffer.callStack))))
4242 {
4243 // it is the same as the last assert we logged. so don't log it again.
4244 // but see if we can increment the counter in an unreported assert.
4245 // check if the last record is also an assert
4246 if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
4247 {
4248 // get the last record from the buffer
4249 _rcdbGetNewestNocatJournalRecordForType(pRcdb,
4250 NV2080_NOCAT_JOURNAL_REC_TYPE_ANY,
4251 NULL, &pNocatEntry);
4252 if (pNocatEntry != NULL)
4253 {
4254 // is it an assert?
4255 if (pNocatEntry->nocatJournalEntry.recType == (NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT))
4256 {
4257 // increment the count
4258 pDiagData = (RM_NOCAT_ASSERT_DIAG_BUFFER*)&pNocatEntry->nocatJournalEntry.diagBuffer;
4259 pDiagData->count++;
4260 }
4261 _rcdbReleaseNocatJournalRecord(pNocatEntry);
4262
4263 }
4264 }
4265 else
4266 {
4267 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
4268 }
4269 portAtomicDecrementS32(&concurrentRingBufferAccess);
4270 }
4271 else
4272 {
4273 // we are logging this assert, save off the stack so we can use it to
4274 // compare against future asserts.
4275 portMemCopy(&pRcdb->nocatJournalDescriptor.lastAssertData,
4276 sizeof(pRcdb->nocatJournalDescriptor.lastAssertData),
4277 &diagBuffer, sizeof(diagBuffer));
4278 rcdbNocatInsertNocatError(pGpu, &newEntry);
4279 }
4280
4281 return NV_OK;
4282 }
4283
4284 /*
4285 ** rcdbNocatInsertRMCDErrorEvent creates an event from an RMCD error block
4286 **
4287 ** parameters:
4288 ** pGpu pointer to GPU object associated with the error
4289 ** recType the type of event to create
4290 ** pSource a pointer to the source string
4291 ** subsystem the subsystem associated with the event.
4292 ** errorCode error code for the event
4293 ** pFault pointer to a faulting component identifier associated with the error
4294 */
rcdbNocatInsertRMCDErrorEvent(OBJGPU * pGpu,NvU32 recType,const char * pSource,NvU32 subsystem,NvU64 errorCode,const char * pFault,RMCD_ERROR_BLOCK * pRcdError)4295 NvU32 rcdbNocatInsertRMCDErrorEvent(OBJGPU *pGpu, NvU32 recType,
4296 const char *pSource, NvU32 subsystem, NvU64 errorCode, const char *pFault,
4297 RMCD_ERROR_BLOCK *pRcdError)
4298 {
4299 NOCAT_JOURNAL_PARAMS newEntry;
4300
4301 portMemSet(&newEntry, 0, sizeof(newEntry));
4302 newEntry.recType = recType;
4303 newEntry.pSource = pSource;
4304 newEntry.subsystem = subsystem;
4305 newEntry.errorCode = errorCode;
4306 newEntry.pFaultingEngine = pFault;
4307 if (pRcdError != NULL)
4308 {
4309 newEntry.pDiagBuffer = (NvU8 * )pRcdError->pBlock;
4310 newEntry.diagBufferLen = pRcdError->blockSize;
4311 }
4312 return rcdbNocatInsertNocatError(pGpu, &newEntry);
4313 }
4314
4315 /*
4316 ** rcdbSetNocatTdrReason sets the TDR reason code in the most recent TDR record if there is one,
4317 ** otherwise, it creates one with the reason code.
4318 **
4319 ** parameters:
4320 ** pReasonData the data supplied with including the reason code.
4321 ** if a TDR record exists, the reason will be added to the existing record,
4322 ** otherwise the rest of the data will be used to create a new TDR event.
4323 */
rcdbSetNocatTdrReason(NV2080CtrlNocatJournalDataTdrReason * pReasonData)4324 NV_STATUS rcdbSetNocatTdrReason
4325 (
4326 NV2080CtrlNocatJournalDataTdrReason *pReasonData
4327 )
4328 {
4329 OBJSYS *pSys = SYS_GET_INSTANCE();
4330 Journal *pRcdb = SYS_GET_RCDB(pSys);
4331 RM_NOCAT_JOURNAL_ENTRY* pNocatEntry = NULL;
4332
4333 // validate inputs.
4334 if ((pRcdb == NULL) || (pReasonData == NULL))
4335 {
4336 return NV_ERR_INVALID_ARGUMENT;
4337 }
4338 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_UPDATE_REQ_IDX]++;
4339
4340 if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
4341 {
4342 // see if there is a TDR record.
4343 _rcdbGetNewestNocatJournalRecordForType(pRcdb,
4344 NV2080_NOCAT_JOURNAL_REC_TYPE_TDR,
4345 NULL, &pNocatEntry);
4346 if (pNocatEntry != NULL)
4347 {
4348 // there is, set the reason.
4349 _rcdbSetTdrReason(pRcdb, pReasonData->reasonCode,
4350 (char *)pNocatEntry->nocatJournalEntry.tdrReason,
4351 sizeof(pNocatEntry->nocatJournalEntry.tdrReason));
4352 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_UPDATED_IDX]++;
4353 _rcdbReleaseNocatJournalRecord(pNocatEntry);
4354 }
4355 }
4356 portAtomicDecrementS32(&concurrentRingBufferAccess);
4357
4358 // if we did not get a TDR record, create one.
4359 // we need to do it after the ring buffers are released.
4360 if (pNocatEntry == NULL)
4361 {
4362 NOCAT_JOURNAL_PARAMS newEntry;
4363
4364 portMemSet(&newEntry, 0, sizeof(newEntry));
4365 newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_TDR;
4366 newEntry.pSource = (char *)pReasonData->source;
4367 newEntry.subsystem = pReasonData->subsystem;
4368 newEntry.errorCode = pReasonData->errorCode;
4369 newEntry.tdrReason = pReasonData->reasonCode;
4370 return rcdbNocatInsertNocatError(NULL, &newEntry);
4371 }
4372 return NV_OK;
4373 }
4374