1 /* 2 * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 * SPDX-License-Identifier: MIT 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 
22 */ 23 24 /***************************** HW State Routines ***************************\ 25 * * 26 * RM robust error journal (formerly RCDB) * 27 * * 28 \***************************************************************************/ 29 30 #include "gpu_mgr/gpu_mgr.h" 31 #include "nvRmReg.h" 32 #include "nvBldVer.h" 33 #include "nvVer.h" 34 #include "os/os.h" 35 #include "core/system.h" 36 #include "gpu/gpu.h" 37 #include "diagnostics/journal.h" 38 #include "platform/chipset/chipset.h" 39 #include "diagnostics/nv_debug_dump.h" 40 #include "diagnostics/tracer.h" 41 #include "core/locks.h" 42 #include "rmapi/rs_utils.h" 43 #include "vgpu/rpc.h" 44 #include "gpu/bus/kern_bus.h" 45 #include "gpu/mem_mgr/mem_mgr.h" 46 #include "nvdevid.h" 47 #include "nvop.h" 48 #include "jt.h" 49 50 51 52 #include "ctrl/ctrl0000/ctrl0000nvd.h" 53 54 #include "nvlimits.h" 55 #include "Nvcm.h" 56 57 #include "lib/protobuf/prb_util.h" 58 #include "g_all_dcl_pb.h" 59 #include "g_nvdebug_pb.h" 60 #include "nv_ref.h" 61 62 #define NOCAT_UNKNOWN_STR "*** unknown ***" 63 #define NOCAT_NA_STR "N/A" 64 #define NOCAT_LEGACY_STR "LEGACY" 65 #define NOCAT_FULLCHIP_TDR_STR "FULL CHIP RESET" 66 #define NOCAT_BUS_RESET_TDR_STR "BUS RESET" 67 #define NOCAT_GC6_RESET_TDR_STR "GC6 RESET" 68 #define NOCAT_NORMAL_TDR_STR "NORMAL TDR" 69 #define NOCAT_UCODE_RESET_TDR_STR "UCODE RESET" 70 #define NOCAT_SURPRISE_REMOVAL_TDR_STR "SURPRISE REMOVAL" 71 #define NOCAT_DEFAULT_TAG_VALUE_STR "prod" 72 #define NOCAT_DEFAULT_TDR_REASON_SRC_STR "KMD" 73 #define NOCAT_FBSIZETESTED 0x10 74 75 // Reducing size to 4K for reducing non-paged allocations on win8 76 #define NVDUMP_DEBUGGER_BUFFER_SIZE (4 * 1024) 77 78 #define JOURNAL_BUFFER_SIZE_DEFAULT (4 * 1024) 79 80 #define JOURNAL_ASSERT_RECORD_QUALIFYING_STACK_ENTRIES 5 81 82 static void nvdDebuggerControlFunc(void); 83 84 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS) 85 #if !defined(DEBUG) && !defined(QA_BUILD) 86 static NvBool 
rcdProbeGpuPresent(OBJGPU *pGpu, NvU64 ip);
static NvBool rcdProbeAllGpusPresent(NvU64 ip);
static volatile NvS32 probeGpuRecursion = 0;
#endif
#endif
static NvU32 _rcdbGetOcaRecordSize(Journal *pRcDB, RMCD_RECORD_TYPE type);
// Guards all ring-buffer reads/writes; only the thread that wins the
// increment-to-1 race may touch the buffers (lock-free mutual exclusion).
static volatile NvS32 concurrentRingBufferAccess = 0;
// Guards the assert tracking list against reentrant updates.
static volatile NvS32 assertListRecursion = 0;
static void rcdbFindRingBufferForType(Journal *pRcDB, RMCD_RECORD_TYPE recType, RING_BUFFER_LOG **ppRingBuffer);
static NV_STATUS _rcdbGetNocatJournalRecord(OBJRCDB* pRcdb,
    NvU32 id, NvBool bExactMatch,
    RmRCCommonJournal_RECORD** ppReturnedCommon, RM_NOCAT_JOURNAL_ENTRY** ppReturnedJournal);
static NV_STATUS _rcdbReleaseNocatJournalRecord(RM_NOCAT_JOURNAL_ENTRY* pReturnedJournal);
static NV_STATUS _rcdbNocatReportAssert(OBJGPU* pGpu, RmRCCommonAssert_RECORD* pAssert);

// Global flag to make sure we never re-enter the nvLog code.
#if defined(DEBUG) || defined(ASSERT_BUILD) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS))
static volatile NvS32 nvLogRecursion = 0;
#endif

// NvDump interface config - communicates with external kernel debuggers.
// The head/tail signatures let a debugger locate and validate this structure
// in a memory image; the buffer address is filled in at construction time.
NVDUMP_EXPORT volatile NV_DECLARE_ALIGNED(NVDUMP_CONFIG, 8) nvDumpConfig =
{
    NVDUMP_CONFIG_SIGNATURE,                        // sigHead
    NvP64_NULL,                                     // debuggerControlFuncAddr
    { NvP64_NULL, NVDUMP_DEBUGGER_BUFFER_SIZE, 0 }, // buffer
    0,                                              // gpuSelect
    NVDUMP_COMPONENT_SYS_ALL,                       // component
    NVDUMP_STATUS_IDLE,                             // dumpStatus
    NV_OK,                                          // rmStatus

    NVDUMP_CONFIG_SIGNATURE                         // sigTail
};

/*!
 * @brief Tear down the journal: free the NvDump debugger buffer, the event
 *        journal and its assert list, the error history, all ring buffers,
 *        and the cached previous-driver version/branch strings.
 *
 * @param[in] pRcDB  Journal object being destructed.
 */
void
rcdbDestruct_IMPL(Journal *pRcDB)
{
    EVENT_JOURNAL *pJournal = &pRcDB->Journal;

    // Deallocate NvDebug debugger dump buffer.
    if (nvDumpConfig.buffer.address != NvP64_NULL)
    {
        portMemFree(NvP64_VALUE(nvDumpConfig.buffer.address));
        nvDumpConfig.buffer.address = NvP64_NULL;
    }

    // Delete Journal and Btree
    if (pJournal->pBuffer != NULL)
    {
        portMemFree(pJournal->pBuffer);
        portMemFree(pJournal->AssertList.ppList);

        // clear journal of anything (also zeroes the now-dangling pointers)
        portMemSet(pJournal, 0, sizeof(EVENT_JOURNAL));
    }

    rcdbClearErrorHistory(pRcDB);

    rcdbDestroyRingBufferCollection(pRcDB);

    portMemFree(pRcDB->previousDriverVersion);
    pRcDB->previousDriverVersion = NULL;

    portMemFree(pRcDB->previousDriverBranch);
    pRcDB->previousDriverBranch = NULL;
}

/*!
 * @brief Initialize the event journal: reset all bookkeeping to "empty",
 *        then allocate the journal buffer and, if it can hold at least one
 *        assert record, the assert tracking list sized to match.
 *
 * Allocation failures are logged but not fatal: the journal simply stays
 * empty (Size/BufferSize remain 0).
 *
 * @param[in] pJournal  Journal bookkeeping to initialize (must be empty).
 * @param[in] size      Requested journal buffer size in bytes.
 */
static void
_initJournal(EVENT_JOURNAL *pJournal, NvU32 size)
{
    // verify we are not abandoning any memory allocations.
    NV_ASSERT(NULL == pJournal->pBuffer);
    NV_ASSERT(NULL == (NvU8*) pJournal->AssertList.ppList);

    // init the Journal to an empty buffer.
    pJournal->pBuffer = NULL;
    pJournal->BufferSize = 0;
    pJournal->pFree = pJournal->pBuffer;
    pJournal->BufferRemaining = pJournal->BufferSize;
    pJournal->pCurrCollection = NULL;
    pJournal->RecordCount = 0;

    // init the assert list to an empty buffer.
    pJournal->AssertList.ppList = NULL;
    pJournal->AssertList.Size = 0;
    pJournal->AssertList.Count = 0;
    pJournal->AssertList.QualifyingStackSize = JOURNAL_ASSERT_RECORD_QUALIFYING_STACK_ENTRIES;

    // allocate and initialize journal buffer memory
    pJournal->pBuffer = portMemAllocNonPaged(size);
    if (pJournal->pBuffer != NULL )
    {
        pJournal->BufferSize = size;
        pJournal->pFree = pJournal->pBuffer;
        pJournal->BufferRemaining = pJournal->BufferSize;

        // if the journal is large enough to hold at least one assert record,
        // init the assert list as well.
        if (sizeof(RmRCCommonAssert_RECORD) <= pJournal->BufferSize)
        {
            // worst case: every journal record is an assert record.
            pJournal->AssertList.Size = pJournal->BufferSize / sizeof(RmRCCommonAssert_RECORD);
            pJournal->AssertList.ppList = portMemAllocNonPaged(pJournal->AssertList.Size * sizeof(pJournal->AssertList.ppList[0]));
            if (pJournal->AssertList.ppList == NULL )
            {
                NV_PRINTF(LEVEL_ERROR,
                          "Failure to allocate RC assert tracking buffer \n");
                pJournal->AssertList.Size = 0;
            }
        }
    }
    else
    {
        NV_PRINTF(LEVEL_ERROR, "Failure to allocate RC journal buffer \n");
    }
}

/*!
 * @brief Construct the journal: event journal, NvDump debugger buffer,
 *        RC-diagnostic and NOCAT ring buffers, RC error counters, and the
 *        timestamp-to-system-time conversion reference.
 *
 * @param[in] pRcDB  Journal object being constructed.
 *
 * @return NV_OK always; individual allocation failures are logged and
 *         leave the corresponding feature disabled.
 */
NV_STATUS
rcdbConstruct_IMPL(Journal *pRcDB)
{
    EVENT_JOURNAL *pJournal = &pRcDB->Journal;
    RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
    NvU32 i;
    void *pBuffer;

    // Time parameters
    NvU32 sec, usec;
    NvU64 timeStamp;
    NvU64 systemTime;
    NvU64 timeStampFreq;

    _initJournal(pJournal, JOURNAL_BUFFER_SIZE_DEFAULT);

    portMemSet(pRingBufferColl, 0x00, sizeof(pRcDB->RingBufferColl));

    pRcDB->BugcheckCount = 0;

    // Allocate NvDebug debugger dump buffer.
    pBuffer = portMemAllocNonPaged(nvDumpConfig.buffer.size);
    if (pBuffer != NULL)
    {
        nvDumpConfig.buffer.address = NV_SIGN_EXT_PTR_TO_NvP64(pBuffer);
    }
    else
    {
        NV_PRINTF(LEVEL_ERROR,
                  "failed to allocate NVD debugger dump buffer\n");
    }

    // Initialize NvDebug debugger function address.
    nvDumpConfig.debuggerControlFuncAddr = NV_SIGN_EXT_PTR_TO_NvP64(nvdDebuggerControlFunc);

    //
    // Create RC Diagnostic report Wrap Buffer
    //
    if (NULL == rcdbCreateRingBuffer(pRcDB, RmRcDiagReport, MAX_RCDB_RCDIAG_WRAP_BUFF))
    {
        NV_PRINTF(LEVEL_ERROR, "failed to allocate RC Diagnostic Ring Buffer\n");
    }
    // init the RC error report data
    pRcDB->RcErrRptNextIdx = 0;
    pRcDB->RcErrRptRecordsDropped = NV_FALSE;

    // Initialize RC Error Counters.
    for ( i = 0 ; i < MAX_RC_ERROR_COUNTER ; i++)
    {
        pRcDB->rcErrorCounterArray[i].rcErrorType = RC_ERROR_COUNTER_TYPE_INVALID;
        pRcDB->rcErrorCounterArray[i].rcErrorCount = 0;
        pRcDB->rcErrorCounterArray[i].rcLastCHID = INVALID_CHID;
        pRcDB->rcErrorCounterArray[i].rcLastTime = 0;
    }
    pRcDB->rcErrorCounterArray[RC_ERROR_COUNTER_OTHER_INDEX].rcErrorType = RC_ERROR_COUNTER_OTHER_TYPE;

    // clear the Nocat Queue descriptors & counters
    portMemSet(&pRcDB->nocatJournalDescriptor, 0x00, sizeof(pRcDB->nocatJournalDescriptor));
    portMemSet(pRcDB->nocatJournalDescriptor.lastRecordId, 0xff, sizeof(pRcDB->nocatJournalDescriptor.lastRecordId));
    pRcDB->nocatJournalDescriptor.nocatLastRecordType = NV2080_NOCAT_JOURNAL_REC_TYPE_UNKNOWN;
    // convert the freshness period from milliseconds to timestamp ticks.
    pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks = NOCAT_CACHE_FRESHNESS_PERIOD_MS;
    pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks *= osGetTimestampFreq();
    pRcDB->nocatJournalDescriptor.cacheFreshnessPeriodticks /= 1000ULL;

    //
    // Create NOCAT report Wrap Buffer
    //
    if (NULL == rcdbCreateRingBuffer(pRcDB, RmNocatReport, MAX_RCDB_NOCAT_WRAPP_BUFF))
    {
        NV_PRINTF(LEVEL_ERROR, "failed to allocate NOCAT Ring Buffer\n");
    }

    // Save params for timestamp conversion
    timeStampFreq = osGetTimestampFreq();
    timeStamp = osGetTimestamp();
    osGetCurrentTime(&sec, &usec);
    systemTime = ((NvU64)sec * 1000000) + (NvU64)usec;

    // reference point: system time (us) corresponding to timestamp zero.
    pRcDB->systemTimeReference = systemTime - ((timeStamp * 1000000) / timeStampFreq);
    pRcDB->timeStampFreq = timeStampFreq;

    return NV_OK;
}

//
// Retrieve the previous driver version from volatile registry entries
// and then save the current driver version for next time.
289 // 290 NV_STATUS rcdbSavePreviousDriverVersion_IMPL 291 ( 292 OBJGPU *pGpu, 293 Journal *pRcDB 294 ) 295 { 296 NV_STATUS nvStatus = NV_OK; 297 298 NvU32 regEntrySize = 0; 299 NvU32 changeListNum = NV_LAST_OFFICIAL_CHANGELIST_NUM; 300 301 // Only run this code only once each time the driver is loaded. 302 if (pRcDB->bPrevDriverCodeExecuted) 303 return NV_OK; 304 305 pRcDB->bPrevDriverCodeExecuted = NV_TRUE; 306 307 // 308 // Get the previous driver version information 309 // from volatile registry settings. 310 // 311 nvStatus = osReadRegistryVolatileSize(pGpu, 312 NV_REG_STR_RM_RC_PREV_DRIVER_VERSION, ®EntrySize); 313 314 // Early exit if this platform does not support volatile registry. 315 if (nvStatus == NV_ERR_NOT_SUPPORTED) 316 return NV_OK; 317 318 if ((NV_OK == nvStatus) && (0 != regEntrySize)) 319 { 320 // 321 // Previous driver version is there, so assume all previous driver 322 // information is there as well. 323 // 324 pRcDB->previousDriverVersion = portMemAllocNonPaged(regEntrySize + 1); 325 if (pRcDB->previousDriverVersion == NULL) 326 { 327 nvStatus = NV_ERR_NO_MEMORY; 328 DBG_BREAKPOINT(); 329 goto rcdbSavePreviousDriverVersion_writeRegistry; 330 } 331 332 nvStatus = osReadRegistryVolatile(pGpu, 333 NV_REG_STR_RM_RC_PREV_DRIVER_VERSION, 334 (NvU8 *)pRcDB->previousDriverVersion, 335 regEntrySize); 336 if (nvStatus != NV_OK) 337 { 338 DBG_BREAKPOINT(); 339 goto rcdbSavePreviousDriverVersion_writeRegistry; 340 } 341 pRcDB->previousDriverVersion[regEntrySize] = 0; 342 343 nvStatus = osReadRegistryVolatileSize(pGpu, 344 NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH, ®EntrySize); 345 if ((nvStatus != NV_OK) || (0 == regEntrySize)) 346 { 347 DBG_BREAKPOINT(); 348 goto rcdbSavePreviousDriverVersion_writeRegistry; 349 } 350 351 pRcDB->previousDriverBranch = portMemAllocNonPaged(regEntrySize + 1); 352 if (pRcDB->previousDriverBranch == NULL) 353 { 354 nvStatus = NV_ERR_NO_MEMORY; 355 DBG_BREAKPOINT(); 356 goto rcdbSavePreviousDriverVersion_writeRegistry; 357 } 358 
359 nvStatus = osReadRegistryVolatile(pGpu, 360 NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH, 361 (NvU8 *)pRcDB->previousDriverBranch, 362 regEntrySize); 363 if (nvStatus != NV_OK) 364 { 365 DBG_BREAKPOINT(); 366 goto rcdbSavePreviousDriverVersion_writeRegistry; 367 } 368 pRcDB->previousDriverBranch[regEntrySize] = 0; 369 370 nvStatus = osReadRegistryVolatile(pGpu, 371 NV_REG_STR_RM_RC_PREV_DRIVER_CHANGELIST, 372 (NvU8 *)&pRcDB->prevDriverChangelist, 373 sizeof(pRcDB->prevDriverChangelist)); 374 if (nvStatus != NV_OK) 375 { 376 DBG_BREAKPOINT(); 377 goto rcdbSavePreviousDriverVersion_writeRegistry; 378 } 379 380 nvStatus = osReadRegistryVolatile(pGpu, 381 NV_REG_STR_RM_RC_PREV_DRIVER_LOAD_COUNT, 382 (NvU8 *)&pRcDB->driverLoadCount, 383 sizeof(pRcDB->driverLoadCount)); 384 if (nvStatus != NV_OK) 385 { 386 DBG_BREAKPOINT(); 387 goto rcdbSavePreviousDriverVersion_writeRegistry; 388 } 389 } 390 391 // Always write out the driver info, even if there was an error reading it. 392 rcdbSavePreviousDriverVersion_writeRegistry: 393 pRcDB->driverLoadCount++; 394 395 osWriteRegistryVolatile(pGpu, 396 NV_REG_STR_RM_RC_PREV_DRIVER_VERSION, 397 (NvU8 *)NV_VERSION_STRING, 398 sizeof(NV_VERSION_STRING)); 399 400 osWriteRegistryVolatile(pGpu, 401 NV_REG_STR_RM_RC_PREV_DRIVER_BRANCH, 402 (NvU8 *)NV_BUILD_BRANCH_VERSION, 403 sizeof(NV_BUILD_BRANCH_VERSION)); 404 405 osWriteRegistryVolatile(pGpu, 406 NV_REG_STR_RM_RC_PREV_DRIVER_CHANGELIST, 407 (NvU8 *)&changeListNum, 408 sizeof(changeListNum)); 409 410 osWriteRegistryVolatile(pGpu, 411 NV_REG_STR_RM_RC_PREV_DRIVER_LOAD_COUNT, 412 (NvU8 *)&pRcDB->driverLoadCount, 413 sizeof(pRcDB->driverLoadCount)); 414 415 return nvStatus; 416 } 417 418 NV_STATUS rcdbAddAssertJournalRecWithLine(void *pVoidGpu, NvU32 lineNum, void** ppRec, NvU8 jGroup, NvU8 type, NvU16 size, NvU32 level, NvU64 key) 419 { 420 OBJSYS *pSys; 421 Journal *pRcDB; 422 OBJGPU *pPossibleNULLGpu; 423 JOURNAL_ASSERT_LIST *pAssertList; 424 RmRCCommonAssert_RECORD newAssertRec; 425 
RmRCCommonAssert_RECORD *pAssertRec; 426 NV_STATUS rmStatus = NV_ERR_GENERIC; 427 NvU32 i; 428 429 // 430 // Note: we allow NULL pGpu here, as many clients (such as KMD) 431 // do not have access to pGpu. And much of the RM does not provide this either. 432 // 433 pPossibleNULLGpu = reinterpretCast(pVoidGpu, OBJGPU *); 434 435 pSys = SYS_GET_INSTANCE(); 436 if (!pSys) 437 { 438 return NV_ERR_INVALID_STATE; 439 } 440 441 pRcDB = SYS_GET_RCDB(pSys); 442 if (!pRcDB) 443 { 444 return NV_ERR_INVALID_STATE; 445 } 446 447 pAssertList = &pRcDB->Journal.AssertList; 448 449 *ppRec = NULL; 450 451 RMTRACE_PROBE4_PRIMTYPE(rcjournal, assertlog, NvU32, (pPossibleNULLGpu ? pPossibleNULLGpu->gpuId : 0), NvU8, type, NvU32, level, NvU64, key); 452 453 // create a local instance of the Assert record. 454 portMemSet(&newAssertRec, 0x00, sizeof(newAssertRec)); 455 rcdbSetCommonJournalRecord(pPossibleNULLGpu, &newAssertRec.common); 456 newAssertRec.count = 1; 457 newAssertRec.breakpointAddrHint = key; 458 newAssertRec.lineNum = lineNum; 459 460 if (pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_COMPRESS)) 461 { 462 // search for a pre-existing assert record with the same stack 463 for (i = 0; i < pAssertList->Count; ++i) 464 { 465 pAssertRec = pAssertList->ppList[i]; 466 if ((newAssertRec.breakpointAddrHint == pAssertRec->breakpointAddrHint) && 467 (0 == portMemCmp(newAssertRec.callStack, pAssertRec->callStack, 468 sizeof(newAssertRec.callStack[0]) * pAssertList->QualifyingStackSize))) 469 { 470 pAssertRec->count++; 471 pAssertRec->lastTimeStamp = newAssertRec.common.timeStamp; 472 473 rmStatus = NV_OK; 474 break; 475 } 476 } 477 } 478 479 if (rmStatus != NV_OK) 480 { 481 // Discard to avoid reentry from messing up record array. 
482 if (portAtomicIncrementS32(&assertListRecursion) == 1) 483 { 484 rmStatus = rcdbAllocNextJournalRec(pRcDB, (NVCD_RECORD **)&pAssertRec, jGroup, type, size); 485 if (NV_OK == rmStatus) 486 { 487 // the Header is filled in when the record is allocated, so update the local instance header. 488 newAssertRec.common.Header = pAssertRec->common.Header; 489 *pAssertRec = newAssertRec; 490 if (pAssertList->Count < pAssertList->Size) 491 { 492 pAssertList->ppList[pAssertList->Count] = pAssertRec; 493 ++(pAssertList->Count); 494 } 495 else 496 { 497 // based on the way the assert list size is calculated this should never happen.... 498 NV_PRINTF(LEVEL_ERROR, 499 "failed to insert tracking for assert record\n"); 500 } 501 } 502 } 503 portAtomicDecrementS32(&assertListRecursion); 504 } 505 506 if (rmStatus == NV_OK) 507 { 508 RMTRACE_RMJOURNAL(_ASSERTLOG, (pPossibleNULLGpu ? pPossibleNULLGpu->gpuId : RMTRACE_UNKNOWN_GPUID), 509 type, 510 jGroup, 511 key, 512 pAssertRec->count, 513 pAssertRec->common.timeStamp, 514 pAssertRec->lastTimeStamp); 515 *ppRec = pAssertRec; 516 517 _rcdbNocatReportAssert(pPossibleNULLGpu, pAssertRec); 518 } 519 else 520 { 521 _rcdbNocatReportAssert(pPossibleNULLGpu, &newAssertRec); 522 } 523 524 return rmStatus; 525 } 526 527 NV_STATUS rcdbAddAssertJournalRec(void *pVoidGpu, void** ppRec, NvU8 jGroup, NvU8 type, NvU16 size, NvU32 level, NvU64 key) 528 { 529 return rcdbAddAssertJournalRecWithLine(pVoidGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, ppRec, jGroup, type, size, level, key); 530 } 531 // Populate stateMask with flags that represent the power state and other useful things. 
532 static NvU64 _getCommonJournalStateMask(OBJGPU *pGpu) 533 { 534 NvU64 stateMask = REF_NUM(NV_RM_JOURNAL_STATE_MASK_GC6_STATE, 535 pGpu->gc6State.currentState); 536 537 if (!gpuIsGpuFullPower(pGpu)) 538 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_NOT_FULL_POWER; 539 540 if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_CONNECTED)) 541 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_NOT_CONNECTED; 542 543 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_STANDBY)) 544 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_STANDBY; 545 546 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_HIBERNATE)) 547 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_HIBERNATE; 548 549 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH)) 550 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_PM_CODEPATH; 551 552 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET)) 553 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_GC6_RESET; 554 555 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET)) 556 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_FULLCHIP_RESET; 557 558 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET)) 559 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_SEC_BUS_RESET; 560 561 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_TIMEOUT_RECOVERY)) 562 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_IN_TIMEOUT_RECOVERY; 563 564 if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) 565 stateMask |= NV_RM_JOURNAL_STATE_MASK_IS_LOST; 566 567 return stateMask; 568 } 569 570 // Fill in the common portion of the journal structure. 
/*!
 * @brief Fill in the fields common to every journal record: timestamp,
 *        GPU tag (gpuId), CPU tag (current thread), and state mask.
 *
 * @param[in]  pGpu  Optional GPU; when NULL the GPU tag and state mask stay 0.
 * @param[out] pRec  Common record header to populate.
 */
void
rcdbSetCommonJournalRecord
(
    OBJGPU *pGpu,
    RmRCCommonJournal_RECORD *pRec
)
{
    OS_THREAD_HANDLE threadId;

    pRec->timeStamp = osGetTimestamp();
    pRec->GPUTag = 0;
    pRec->CPUTag = 0;
    pRec->stateMask = 0;

    if (pGpu)
    {
        pRec->GPUTag = pGpu->gpuId;
        pRec->stateMask = _getCommonJournalStateMask(pGpu);
    }

    if (NV_OK == osGetCurrentThread(&threadId))
    {
        pRec->CPUTag = (NvU64)threadId;
    }
}

/*!
 * @brief Append a bugcheck record to the journal and bump the bugcheck count.
 *
 * @param[in] pGpu          GPU the bugcheck is associated with (may be NULL).
 * @param[in] pRcDB         Journal to record into.
 * @param[in] bugCheckCode  OS bugcheck code to store.
 *
 * @return status of the journal record allocation; the bugcheck count is
 *         incremented even if allocation failed.
 */
NV_STATUS
rcdbAddBugCheckRec_IMPL
(
    OBJGPU *pGpu,
    Journal *pRcDB,
    NvU32 bugCheckCode
)
{
    RmJournalBugcheck_RECORD *pRec;
    NV_STATUS rmStatus;

    rmStatus = rcdbAllocNextJournalRec(pRcDB,
                                       (NVCD_RECORD **)&pRec,
                                       RmGroup,
                                       RmJournalBugCheck,
                                       sizeof(*pRec));
    if (NV_OK == rmStatus)
    {
        rcdbSetCommonJournalRecord(pGpu, &pRec->common);
        pRec->bugCheckCode = bugCheckCode;
    }

    pRcDB->BugcheckCount++;

    return rmStatus;
}

/*!
 * @brief Log a power-state transition event into the RmPowerState ring buffer.
 *
 * @param[in] pGpu                GPU the event belongs to.
 * @param[in] pRcDB               Journal owning the ring buffers.
 * @param[in] powerEvent          Power event identifier.
 * @param[in] state               New power state.
 * @param[in] fastBootPowerState  Fast-boot power state value.
 *
 * @return NV_OK always (ring-buffer add does not report failure here).
 */
NV_STATUS
rcdbAddPowerStateRec_IMPL
(
    OBJGPU *pGpu,
    Journal *pRcDB,
    NvU32 powerEvent,
    NvU32 state,
    NvU32 fastBootPowerState
)
{
    RmPowerState_RECORD newRmDiagWrapBuffRec;

    // Create Records, then write it.
    newRmDiagWrapBuffRec.powerState = state;
    newRmDiagWrapBuffRec.powerEvent = powerEvent;
    newRmDiagWrapBuffRec.fastBootPowerState = fastBootPowerState;
    rcdbAddRecToRingBuffer(pGpu, pRcDB, RmPowerState,
                           sizeof(RmPowerState_RECORD), (NvU8 *)&newRmDiagWrapBuffRec);
    return NV_OK;
}

/*!
 * @brief Find the first and last indices of the complete, filter-matching
 *        RC diagnostic reports currently in the RmRcDiagReport ring buffer.
 *
 * The scan only accepts a range that starts on a FIRST-flagged record and
 * ends on a LAST-flagged record so no partially-overwritten report is
 * included. Access is guarded by the lock-free concurrentRingBufferAccess
 * counter; losing the race returns NV_ERR_BUSY_RETRY.
 *
 * @param[in]  pRcDB      Journal owning the ring buffers.
 * @param[out] pStart     Optional; receives the first qualifying report idx.
 * @param[out] pEnd       Optional; receives the last qualifying report idx.
 * @param[in]  owner      Owner filter (RCDB_RCDIAG_DEFAULT_OWNER / ANY).
 * @param[in]  processId  Process filter (ANY_PROCESS_ID accepts all).
 *
 * @return NV_OK, NV_ERR_BUSY_RETRY, or NV_ERR_MISSING_TABLE_ENTRY when no
 *         complete qualifying range exists.
 */
NV_STATUS
rcdbGetRcDiagRecBoundaries_IMPL
(
    Journal *pRcDB,
    NvU16 *pStart,
    NvU16 *pEnd,
    NvU32 owner,
    NvU32 processId
)
{
    NV_STATUS status = NV_ERR_MISSING_TABLE_ENTRY;
    RmRCCommonJournal_RECORD *pCommon;
    RmRcDiag_RECORD *pRecord = NULL;
    RING_BUFFER_LOG *pRingBuffer = NULL;
    NvU32 i;
    NvU16 logicalStartIdx;
    NvU16 start = 0;
    NvU16 end = 0;
    NvBool foundStart = NV_FALSE;
    NvBool foundEnd = NV_FALSE;

    // scan the buffer to find all the qualified records & return the
    // first & last indices of the qualified records found.

    // Get the Diag Report Ring buffer.
    rcdbFindRingBufferForType(pRcDB, RmRcDiagReport, &pRingBuffer);

    // attempt to claim ownership
    if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
    {
        // get the logical start of the buffer.
        logicalStartIdx = pRingBuffer->headIndex;

        // run thru all the entries in the buffer, start to end, until we find the start & end of the range we are looking for.
        for (i = 0; i < pRingBuffer->numEntries; ++i)
        {
            // get a pointer to the record from the buffer.
            pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport) * ((logicalStartIdx + i) % pRingBuffer->maxEntries)));
            // diag record immediately follows the common header.
            pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);

            // check to see if the record qualifies
            if (((RCDB_RCDIAG_DEFAULT_OWNER != owner) && (pRecord->owner != owner) && (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID != owner))
                || ((NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID != processId) && (pRecord->processId != processId)))
            {
                continue;
            }
            switch (foundStart)
            {
            case NV_FALSE:
                // check if this is a start record.
                // we want the first record to be a start record to ensure that all the reports that are in the range are complete
                // (I.E. we didn't wrap over the first record of a report)
                if (0 != (pRecord->flags & NV0000_CTRL_CMD_NVD_RCERR_RPT_FLAGS_POS_FIRST))
                {
                    // yes save the idx as the first Idx, & note that we found the start of the range.
                    start = pRecord->idx;
                    foundStart = NV_TRUE;
                }
                // fall thru to check if the start of the report is also the end of the report.

            case NV_TRUE:
                // check if this is an end record.
                // we want the last record in the range to be an end record to ensure that all the reports that are in the range are complete
                // (Note -- in the case of end records, this should only be an issue if we are interrupting the collection of a report)
                if (0 != (pRecord->flags & NV0000_CTRL_CMD_NVD_RCERR_RPT_FLAGS_POS_LAST))
                {
                    // save the idx as the last idx & continue scanning until we have checked all the records.
                    // the last idx saved will be the last idx.
                    end = pRecord->idx;
                    foundEnd = foundStart;
                }
                break;
            }
        }
        // checking end is sufficient, because end can't be set w/o start being set first.
        if (foundEnd)
        {
            // we found a complete range, mark us as succeeding.
            status = NV_OK;

            // pass up the results.
            if (NULL != pEnd)
            {
                *pEnd = end;
            }
            if (NULL != pStart)
            {
                *pStart = start;
            }
        }
    }
    else
    {
        // the buffer is currently busy.
        status = NV_ERR_BUSY_RETRY;
    }
    portAtomicDecrementS32(&concurrentRingBufferAccess);
    return status;
}

/*!
 * @brief Stamp an RC diagnostic record with the next report index and a
 *        wall-clock timestamp, clamp oversized reports, and append it to
 *        the RmRcDiagReport ring buffer.
 *
 * Oversized reports are truncated to MAX_RCDB_RCDIAG_ENTRIES with the final
 * entry replaced by an OVERFLOWED marker carrying the overflow count.
 *
 * @param[in]     pGpu                GPU the report belongs to.
 * @param[in]     pRcDB               Journal owning the ring buffers.
 * @param[in,out] pRmDiagWrapBuffRec  Record to stamp and copy in.
 *
 * @return common-header pointer of the inserted record, or NULL on failure.
 */
RmRCCommonJournal_RECORD *
rcdbAddRcDiagRec_IMPL
(
    OBJGPU *pGpu,
    Journal *pRcDB,
    RmRcDiag_RECORD *pRmDiagWrapBuffRec
)
{
    RmRCCommonJournal_RECORD *pCommon;
    NvU32 usec;

    // Create Records, then write it.
    pRmDiagWrapBuffRec->idx = (pRcDB->RcErrRptNextIdx)++;
    if (MAX_RCDB_RCDIAG_ENTRIES < pRmDiagWrapBuffRec->count)
    {
        NV_ASSERT_FAILED("Diag report to large for buffer");
        pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].offset = 0;
        pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].tag = NV0000_CTRL_CMD_NVD_RCERR_RPT_REG_OVERFLOWED;
        pRmDiagWrapBuffRec->data[MAX_RCDB_RCDIAG_ENTRIES - 1].value = pRmDiagWrapBuffRec->count - MAX_RCDB_RCDIAG_ENTRIES + 1;
        pRmDiagWrapBuffRec->count = MAX_RCDB_RCDIAG_ENTRIES;
    }
    osGetCurrentTime(&(pRmDiagWrapBuffRec->timeStamp), &usec);

    pCommon = rcdbAddRecToRingBuffer(pGpu, pRcDB, RmRcDiagReport,
                                     sizeof(RmRcDiag_RECORD), (NvU8 *)pRmDiagWrapBuffRec);

    // once the index wraps past the buffer size, older reports have been dropped.
    pRcDB->RcErrRptRecordsDropped |= pRcDB->RcErrRptNextIdx >= MAX_RCDB_RCDIAG_WRAP_BUFF;
    return pCommon;
}

/*!
 * @brief Insert a GSP-originated RC diagnostic record on the CPU side,
 *        merging the GSP record's state mask into the CPU-side record.
 *
 * @param[in] pGpu        GPU the report belongs to.
 * @param[in] pRcDB       Journal owning the ring buffers.
 * @param[in] pCommonGsp  Common header as filled in by GSP.
 * @param[in] pRmDiagGsp  Diag record payload from GSP.
 *
 * @return common-header pointer of the inserted record, or NULL on failure.
 */
RmRCCommonJournal_RECORD *
rcdbAddRcDiagRecFromGsp_IMPL
(
    OBJGPU *pGpu,
    Journal *pRcDB,
    RmRCCommonJournal_RECORD *pCommonGsp,
    RmRcDiag_RECORD *pRmDiagGsp
)
{
    RmRCCommonJournal_RECORD *pCommonCpu;

    pCommonCpu = rcdbAddRcDiagRec(pGpu, pRcDB, pRmDiagGsp);
    if (pCommonCpu)
    {
        NV_ASSERT(pCommonCpu->GPUTag == pCommonGsp->GPUTag);
        pCommonCpu->stateMask |= pCommonGsp->stateMask;
    }

    return pCommonCpu;
}

/*!
 * @brief Locate the RC diagnostic record with report index reqIdx and apply
 *        the owner/process filter. Caller must already hold the ring-buffer
 *        access guard (see rcdbGetRcDiagRec_IMPL).
 *
 * @param[in]  pRcDB                Journal owning the ring buffers.
 * @param[in]  reqIdx               Report index being requested.
 * @param[out] ppRmDiagWrapBuffRec  Receives the record, or NULL.
 * @param[in]  owner                Owner filter.
 * @param[in]  processId            Process filter.
 *
 * @return NV_OK, NV_ERR_INVALID_INDEX if not in the buffer, or
 *         NV_ERR_INSUFFICIENT_PERMISSIONS if the filter rejects it.
 */
NV_STATUS
_rcdbInternalGetRcDiagRec
(
    Journal *pRcDB,
    NvU16 reqIdx,
    RmRCCommonJournal_RECORD **ppRmDiagWrapBuffRec,
    NvU32 owner,
    NvU32 processId
)
{
    RmRCCommonJournal_RECORD *pCommon;
    RmRcDiag_RECORD* pRecord = NULL;
    NV_STATUS status = NV_ERR_INVALID_INDEX;
    RING_BUFFER_LOG *pRingBuffer = NULL;

    NvU32 i;

    // assume we will fail.
    *ppRmDiagWrapBuffRec = NULL;

    // Find the ring buffer for the diag reports
    rcdbFindRingBufferForType(pRcDB, RmRcDiagReport, &pRingBuffer);

    // is the requested record in the buffer?
    // (NvU16 subtraction is wraparound-safe for the distance check)
    if ((NvU16)(pRcDB->RcErrRptNextIdx - reqIdx) <= pRingBuffer->numEntries)
    {
        // calculate the location of the record.
        // find the record just past the last record in the buffer. to use as the initial offset.
        i = pRingBuffer->headIndex + pRingBuffer->numEntries;

        // subtract off the diff between the next idx to be used & the requested idx.
        i -= pRcDB->RcErrRptNextIdx - reqIdx;

        // wrap the offset to the size of the buffer.
        i %= pRingBuffer->maxEntries;

        // get a pointer to the record from the buffer.
        pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport) * i));
        pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);

        // verify we have the record that was requested.
        NV_ASSERT_OR_RETURN(pRecord->idx == reqIdx, NV_ERR_INVALID_INDEX);

        // we found the requested Index,
        // check to see if the record qualifies
        if (((RCDB_RCDIAG_DEFAULT_OWNER == owner) || (pRecord->owner == owner) || (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID == owner))
            && ((NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID == processId) || (pRecord->processId == processId)))
        {
            // combination of ANY_OWNER_ID && ANY_PROCESS_ID is not valid
            if (NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_OWNER_ID == owner && NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID == processId)
            {
                status = NV_ERR_INSUFFICIENT_PERMISSIONS;
                goto exit;
            }
            // we found a record that fully qualifies
            *ppRmDiagWrapBuffRec = pCommon;
            status = NV_OK;
        }
        else
        {
            // we found the record, but it does not pass the filter.
            status = NV_ERR_INSUFFICIENT_PERMISSIONS;
        }
    }
exit:
    return status;
}

/*!
 * @brief Thread-safe wrapper around _rcdbInternalGetRcDiagRec: claims the
 *        lock-free ring-buffer guard before doing the lookup.
 *
 * @return lookup status, or NV_ERR_BUSY_RETRY if the guard is contended.
 */
NV_STATUS
rcdbGetRcDiagRec_IMPL
(
    Journal *pRcDB,
    NvU16 reqIdx,
    RmRCCommonJournal_RECORD **ppRmDiagWrapBuffRec,
    NvU32 owner,
    NvU32 processId
)
{
    NV_STATUS status;

    if (ppRmDiagWrapBuffRec == NULL)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    *ppRmDiagWrapBuffRec = NULL;

    if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
    {
        status = _rcdbInternalGetRcDiagRec(pRcDB, reqIdx, ppRmDiagWrapBuffRec, owner, processId);
    }
    else
    {
        status = NV_ERR_BUSY_RETRY;
    }
    portAtomicDecrementS32(&concurrentRingBufferAccess);
    return status;
}

//
// The function to set context data for all the RmRcDiag_RECORDs in a specified range
//
NV_STATUS
rcdbUpdateRcDiagRecContext_IMPL
(
    Journal *pRcDB,
    NvU16 rangeStartIdx,
    NvU16 rangeEndIdx,
    NvU32 processId,
    NvU32 owner
)
{
    RmRCCommonJournal_RECORD *pCommon = NULL;
    RmRcDiag_RECORD* pRecord = NULL;
    NV_STATUS status = NV_OK;
    NV_STATUS recStatus = NV_ERR_OUT_OF_RANGE;

    NvU16 i;

    // go from the start index thru the end index.
    // note we use != because the indices will wrap.
    for (i = rangeStartIdx; i != (NvU16)(rangeEndIdx + 1U); i++)
    {
        recStatus = rcdbGetRcDiagRec(pRcDB, i, &pCommon, RCDB_RCDIAG_DEFAULT_OWNER, NV0000_CTRL_CMD_NVD_RCERR_RPT_ANY_PROCESS_ID);
        if (NV_OK != recStatus)
        {
            // something went wrong,
            // record the status & skip this record.
            status = recStatus;
            continue;
        }
        // get the pointer to the diag record.
        pRecord = (RmRcDiag_RECORD*) &(pCommon[1]);

        pRecord->owner = owner;
        pRecord->processId = processId;
    }
    return status;
}

//
// size must include NVCD_RECORD size too
//
NV_STATUS rcdbAllocNextJournalRec_IMPL(Journal *pRcDB, NVCD_RECORD** ppRec, NvU8 jGroup, NvU8 type, NvU16 size)
{
    EVENT_JOURNAL *pJournal = &pRcDB->Journal;

    if ( ppRec == NULL )
        return NV_ERR_GENERIC;

    if ( pJournal->pBuffer == NULL || pJournal->BufferSize == 0 )
        return NV_ERR_GENERIC;

    // bump-pointer allocation: fail once the journal buffer is exhausted.
    if ( size == 0 || pJournal->BufferRemaining < size )
    {
        return NV_ERR_GENERIC;
    }

    *ppRec = (NVCD_RECORD*)(pJournal->pFree);

    (*ppRec)->cRecordGroup = jGroup;
    (*ppRec)->cRecordType = type;
    (*ppRec)->wRecordSize = size;

    if ( pJournal->pCurrCollection )
    {
        pJournal->pCurrCollection->NumRecords++;
        pJournal->pCurrCollection->Header.wRecordSize += size;
    }
    else
    {
        // standalone record (not part of collection) - increase total count
        pJournal->RecordCount++;
    }

    pJournal->pFree += size;
    pJournal->BufferRemaining -= size;

    return NV_OK;
}

/*!
 * @brief Free the entire error-history list after waiting (by spinning) for
 *        any in-flight error reporting to release the InUse flag.
 *
 * @param[in] pRcDB  Journal whose error history is cleared.
 *
 * @return NV_OK, or NV_ERR_INVALID_STATE if spinning is not safe here.
 */
NV_STATUS rcdbClearErrorHistory_IMPL(Journal *pRcDB)
{
    SYS_ERROR_INFO *pSysErrorInfo = &pRcDB->ErrorInfo;
    RMFIFOERRORELEMENT_V3* pFifoErrorInfo;
    RMFIFOERRORELEMENT_V3* pFreeErrorInfo;

    // Wait until any errors currently being reported are complete.
    while (!portAtomicCompareAndSwapU32(&pSysErrorInfo->InUse, 1, 0))
    {
        // We're not going to sleep, but safe to sleep also means safe to spin..
        NV_ASSERT_OR_RETURN(portSyncExSafeToSleep(), NV_ERR_INVALID_STATE);
        portUtilSpin();
    }

    // walk the singly-linked error list, freeing each element.
    pFifoErrorInfo = (RMFIFOERRORELEMENT_V3*) pSysErrorInfo->pErrorList;
    while (NULL != pFifoErrorInfo)
    {
        pFreeErrorInfo = pFifoErrorInfo;
        pFifoErrorInfo = pFifoErrorInfo->ErrorHeader.pNextError;
        rcdbDeleteErrorElement(pRcDB, pFreeErrorInfo);
    }

    pSysErrorInfo->ErrorCount = 0x0;
    pSysErrorInfo->LogCount = 0x0;
    pSysErrorInfo->pErrorList = NULL;

    portAtomicSetU32(&pSysErrorInfo->InUse, 0);
    return NV_OK;
}


/*!
 * @brief Free one error-history element, including its chain of attached
 *        error blocks.
 *
 * @param[in] pRcDB    Journal (unused here, kept for interface symmetry).
 * @param[in] pDelete  Element to free.
 *
 * @return NV_OK always.
 */
NV_STATUS rcdbDeleteErrorElement_IMPL(Journal *pRcDB, void *pDelete)
{
    RMFIFOERRORELEMENT_V3* pFifoDelete = (RMFIFOERRORELEMENT_V3*)pDelete;
    RMCD_ERROR_BLOCK* pErrorBlock;
    RMCD_ERROR_BLOCK* pOldErrorBlock;

    // Free Additional Error Block
    for (pErrorBlock = pFifoDelete->ErrorHeader.pErrorBlock; pErrorBlock != NULL;)
    {
        pOldErrorBlock = pErrorBlock;
        pErrorBlock = pErrorBlock->pNext;
        portMemFree(pOldErrorBlock->pBlock);
        portMemFree(pOldErrorBlock);
    }

    // Free Error Collector
    portMemFree(pFifoDelete);

    return NV_OK;
}

// Frees up the all the ring buffers
void rcdbDestroyRingBufferCollection_IMPL(Journal *pRcDB)
{
    RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;
    NvU32 i;
    RING_BUFFER_LOG* pCurrentBuffer = pRingBufferColl->pFirstEntry;

    for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
    {
        RING_BUFFER_LOG* pTempCurrentBuffer = pCurrentBuffer;

        NV_ASSERT(pCurrentBuffer != NULL);
        NV_ASSERT(pCurrentBuffer->pBuffer != NULL);

        portMemFree(pCurrentBuffer->pBuffer);

        pCurrentBuffer = pCurrentBuffer->pNextRingBuffer;

        // Free the current ring buffer entry.
        portMemFree(pTempCurrentBuffer);
    }

    // pCurrentBuffer should be NULL if our accounting of NumEntries is correct
    NV_ASSERT(pCurrentBuffer == NULL);

    portMemSet(pRingBufferColl, 0x00, sizeof(*pRingBufferColl));
}


static NvU32 _rcdbInsertJournalRecordToList (RmRCCommonJournal_RECORD *pList, RmRCCommonJournal_RECORD *pRecord);
static void _rcdbDumpCommonJournalRecord(PRB_ENCODER *pPrbEnc,const PRB_FIELD_DESC *pFieldDesc,PRmRCCommonJournal_RECORD pRec);

/*!
 * @brief Initialize the GPU accessible flag
 *
 * @param[in] pGPU
 * @param[in] pRcDB
 *
 * @return NV_OK
 */
NV_STATUS
rcdbDumpInitGpuAccessibleFlag_IMPL
(
    OBJGPU *pGpu,
    Journal *pRcDB
)
{
    // the GPU is considered accessible only when we hold the RM lock, the
    // device is a discrete, non-virtual GPU at full power, and it is not in
    // any reset / power-management transition or lost state.
    pRcDB->nvDumpState.bGpuAccessible =
        pRcDB->nvDumpState.bRMLock &&
        !pGpu->bIsSOC &&
        !IS_VIRTUAL(pGpu) &&
        gpuIsGpuFullPower(pGpu) &&
        !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET) &&
        !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET) &&
        !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET) &&
        !pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH) &&
        !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST);

    // The GPU should be there... but make sure.
    if (pRcDB->nvDumpState.bGpuAccessible)
    {
        if (GPU_REG_RD32(pGpu, NV_PMC_BOOT_0) != pGpu->chipId0)
        {
            pRcDB->nvDumpState.bGpuAccessible = NV_FALSE;
        }
    }

    return NV_OK;
}

/*!
 * @brief Performs a dump of the specified system component into the given buffer.
 *
 * @param[in] pSys The system object
 * @param[in] component NVDUMP_IS_SYS_COMPONENT(component) must be true.
1104 * @param[in, out] pBuffer Buffer to populate with dump results 1105 * @param[in] policy Policy for buffer allocation: use this one, allocate one or count 1106 * @param[in, out] pBufferCallback Callback function for use with fixed-sized buffer encoding. 1107 * If this is NULL then pBuffer->size is assumed to be large 1108 * enough for the whole dump. Otherwise pBufferCallback is called 1109 * when the buffer is full or when a message ends, allowing the 1110 * the callback to construct the whole buffer piece by piece. 1111 * 1112 * @return NV_OK on success and specific error status on failure 1113 */ 1114 NV_STATUS 1115 rcdbDumpComponent_IMPL 1116 ( 1117 OBJRCDB *pRcDB, 1118 NvU32 component, 1119 NVDUMP_BUFFER *pBuffer, 1120 NVDUMP_BUFFER_POLICY policy, 1121 PrbBufferCallback *pBufferCallback 1122 ) 1123 { 1124 NVD_STATE *pNvDumpState = &pRcDB->nvDumpState; 1125 void *pBuff; 1126 PRB_ENCODER encoder; 1127 NV_STATUS status = NV_OK; 1128 NvU8 startingDepth; 1129 1130 // Validate arguments. 1131 NV_ASSERT_OR_RETURN(pBuffer != NULL, NV_ERR_INVALID_ARGUMENT); 1132 1133 // Make sure we were not reentered. 1134 if (pNvDumpState->bDumpInProcess) 1135 return NV_ERR_STATE_IN_USE; 1136 1137 // Initialize dump state. 1138 pNvDumpState->bDumpInProcess = NV_TRUE; 1139 pNvDumpState->bugCheckCode = 0; 1140 pNvDumpState->internalCode = NVD_ERROR_CODE(NVD_EXTERNALLY_GENERATED, 0); 1141 pNvDumpState->bRMLock = rmapiLockIsOwner(); 1142 pNvDumpState->bGpuAccessible = NV_FALSE; 1143 pNvDumpState->initialbufferSize = pBuffer->size; 1144 pNvDumpState->nvDumpType = NVD_DUMP_TYPE_API; 1145 1146 // Clear dump buffer. 1147 pBuffer->curNumBytes = 0; 1148 1149 // Start encoding protobuf dump message. 
1150 switch (policy) 1151 { 1152 case NVDUMP_BUFFER_PROVIDED: 1153 prbEncStart(&encoder, NVDEBUG_NVDUMP, NvP64_VALUE(pBuffer->address), 1154 pBuffer->size, pBufferCallback); 1155 break; 1156 case NVDUMP_BUFFER_ALLOCATE: 1157 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, 1158 prbEncStartAlloc(&encoder, NVDEBUG_NVDUMP, 1159 pBuffer->size, pBufferCallback)); 1160 break; 1161 case NVDUMP_BUFFER_COUNT: 1162 prbEncStartCount(&encoder, NVDEBUG_NVDUMP, NVDUMP_MAX_DUMP_SIZE); 1163 break; 1164 default: 1165 return NV_ERR_INVALID_ARGUMENT; 1166 } 1167 1168 startingDepth = prbEncNestingLevel(&encoder); 1169 1170 switch (component) 1171 { 1172 case NVDUMP_COMPONENT_SYS_RCDB: 1173 { 1174 NV_CHECK_OK(status, LEVEL_ERROR, 1175 rcdbDumpSystemFunc(pRcDB, &encoder, pNvDumpState)); 1176 break; 1177 } 1178 case NVDUMP_COMPONENT_SYS_SYSINFO: 1179 { 1180 NV_CHECK_OK(status, LEVEL_ERROR, 1181 rcdbDumpSystemInfo(pRcDB, &encoder, pNvDumpState)); 1182 break; 1183 } 1184 case NVDUMP_COMPONENT_SYS_ALL: 1185 { 1186 NV_CHECK_OK(status, LEVEL_ERROR, 1187 rcdbDumpSystemInfo(pRcDB, &encoder, pNvDumpState)); 1188 NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR, 1189 rcdbDumpSystemFunc(pRcDB, &encoder, pNvDumpState)); 1190 break; 1191 } 1192 default: 1193 { 1194 NV_PRINTF(LEVEL_ERROR, 1195 "called with invalid component %u selected.\n", 1196 component); 1197 status = NV_ERR_INVALID_ARGUMENT; 1198 break; 1199 } 1200 } 1201 1202 NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR, 1203 prbEncUnwindNesting(&encoder, startingDepth)); 1204 1205 { 1206 NvU32 gpu; 1207 OBJGPU *pGpu; 1208 1209 for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++) 1210 { 1211 pGpu = gpumgrGetGpu(gpu); 1212 1213 if ((pGpu != NULL) && IS_GSP_CLIENT(pGpu)) 1214 { 1215 NV_RM_RPC_DUMP_PROTOBUF_COMPONENT(pGpu, status, &encoder, 1216 pNvDumpState, component); 1217 1218 NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(status, LEVEL_ERROR, 1219 prbEncUnwindNesting(&encoder, startingDepth)); 1220 } 1221 } 1222 } 1223 1224 // Finish encoding protobuf dump 
message. 1225 pBuffer->curNumBytes = prbEncFinish(&encoder, &pBuff); 1226 pBuffer->address = NV_SIGN_EXT_PTR_TO_NvP64(pBuff); 1227 pNvDumpState->bDumpInProcess = NV_FALSE; 1228 1229 return status; 1230 } 1231 1232 static NV_STATUS 1233 _rcdbGetTimeInfo 1234 ( 1235 PRB_ENCODER *pPrbEnc, 1236 NVD_STATE *pNvDumpState, 1237 const PRB_FIELD_DESC *pFieldDesc 1238 ) 1239 { 1240 NvU64 timeSinceBoot; 1241 NvU32 sec; 1242 NvU32 usec; 1243 NV_STATUS nvStatus = NV_OK; 1244 NvU8 startingDepth = prbEncNestingLevel(pPrbEnc); 1245 1246 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, 1247 prbEncNestedStart(pPrbEnc, pFieldDesc)); 1248 1249 prbEncAddUInt64(pPrbEnc, 1250 NVDEBUG_SYSTEMINFO_TIMEINFO_TIMESTAMP_FREQ, 1251 osGetTimestampFreq()); 1252 1253 // Add Timestamp 1254 prbEncAddUInt64(pPrbEnc, 1255 NVDEBUG_SYSTEMINFO_TIMEINFO_TIMESTAMP_DUMP, 1256 osGetTimestamp()); 1257 osGetCurrentTime(&sec, &usec); 1258 prbEncAddUInt64(pPrbEnc, 1259 NVDEBUG_SYSTEMINFO_TIMEINFO_SYSTEM_TIME_DUMP, 1260 (NvU64)sec * 1000000 + usec); 1261 1262 // Add time since boot in seconds. 1263 osGetCurrentTick(&timeSinceBoot); 1264 prbEncAddUInt32(pPrbEnc, 1265 NVDEBUG_SYSTEMINFO_TIMEINFO_TIME_SINCE_BOOT_SEC, 1266 (NvU32)(timeSinceBoot / 1000000000ULL)); 1267 1268 // Unwind the protobuf to the correct depth. 
    NV_CHECK_OK(nvStatus, LEVEL_ERROR,
        prbEncUnwindNesting(pPrbEnc, startingDepth));

    return nvStatus;
}

// Placeholder UUID string used when a GPU's UUID cannot be obtained.
static const char * GPU_NA_UUID = "N/A";

/*!
 * @brief Encode the SystemInfo message: time, northbridge, CPU, per-GPU ids,
 *        OS version, driver version, GPU topology and error state.
 *
 * @param[in] pRcDB        Journal object.
 * @param[in] pPrbEnc      Active protobuf encoder.
 * @param[in] pNvDumpState Dump state for this dump invocation.
 *
 * @return NV_OK on success and specific error status on failure.
 */
NV_STATUS
rcdbDumpSystemInfo_IMPL
(
    OBJRCDB *pRcDB,
    PRB_ENCODER *pPrbEnc,
    NVD_STATE *pNvDumpState
)
{
    OBJGPU *pGpu;
    NvU8 *pGidString;
    NvU32 gpu;
    NvU32 numGpus;
    NvU32 gidStrlen;
    NvU32 sizeStr;
    NV_STATUS nvStatus = NV_OK;
    NvBool bRelease;
    NvU8 startingDepth = prbEncNestingLevel(pPrbEnc);

    OBJSYS *pSys = SYS_GET_INSTANCE();
    OBJCL *pCl = SYS_GET_CL(pSys);
    OBJGPU *pParent;
    NvU32 gpuIndex;
    NvU32 gpuMask;
    NvBool bGpuDone[NV_MAX_DEVICES];

    // All of this stuff should run OK even without the RM lock.
    // No need to check pRcDB->nvDumpState.bNoRMLock;

    switch (DRF_VAL(_NVD, _ERROR_CODE, _MAJOR, pNvDumpState->internalCode))
    {
        case NVD_GPU_GENERATED:
        case NVD_SKIP_ZERO:
            // don't report on these internal codes.
            return NV_OK;
            break;
    }

    NV_CHECK_OK_OR_RETURN(LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_NVDUMP_SYSTEM_INFO));

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        _rcdbGetTimeInfo(pPrbEnc, pNvDumpState, NVDEBUG_SYSTEMINFO_TIME_INFO),
        External_Cleanup);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_BUGCHECK_COUNT,
                    pRcDB->BugcheckCount);

    // Add NorthBridge Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_NORTHBRIDGE_INFO),
        External_Cleanup);

    // vendor id in the low word, device id in the high word.
    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_NORTHBRIDGEINFO_ID,
                    pCl->FHBBusInfo.vendorID |
                    (pCl->FHBBusInfo.deviceID << 16));

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_NORTHBRIDGEINFO_SSID,
                    pCl->FHBBusInfo.subvendorID |
                    (pCl->FHBBusInfo.subdeviceID << 16));

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_NORTHBRIDGE_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    //CPU Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_CPU_INFO),
        External_Cleanup);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_CPUINFO_CPU_TYPE,
                    pSys->cpuInfo.type);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_CPUINFO_CPU_CAPS,
                    pSys->cpuInfo.caps);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_CPUINFO_NUM_CPU_CORES,
                    pSys->cpuInfo.numPhysicalCpus);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_CPUINFO_NUM_LOGICAL_CPUS,
                    pSys->cpuInfo.numLogicalCpus);

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_CPU_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    //GPU Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_GPU_INFO),
        External_Cleanup);

    // Count the number of GPUs and List the gpuIds
    numGpus = 0;
    for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++)
    {
        const NvU32 gidFlags =
            DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _FORMAT, _BINARY) |
            DRF_DEF(2080_GPU_CMD, _GPU_GET_GID_FLAGS, _TYPE, _SHA1);

        pGpu = gpumgrGetGpu(gpu);

        if (pGpu)
        {
            numGpus++;

            prbEncAddUInt32(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_GPUINFO_GPU_ID,
                            pGpu->gpuId);

            // Prefer a freshly queried SHA1 GID; fall back to the cached
            // UUID, and finally to the "N/A" placeholder string.
            nvStatus = gpuGetGidInfo(pGpu, &pGidString,
                                     &gidStrlen, gidFlags);
            if (NV_OK == nvStatus)
            {
                prbEncAddBytes(pPrbEnc,
                               NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
                               pGidString, gidStrlen);
                portMemFree(pGidString);
            }
            else if (pGpu->gpuUuid.isInitialized)
            {
                prbEncAddBytes(pPrbEnc,
                               NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
                               pGpu->gpuUuid.uuid, sizeof(pGpu->gpuUuid.uuid));
            }
            else
            {
                prbEncAddString(pPrbEnc,
                                NVDEBUG_SYSTEMINFO_GPUINFO_GPU_UUID,
                                GPU_NA_UUID);
            }

            prbEncAddUInt32(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_GPUINFO_DEVICE_ID,
                            pGpu->idInfo.PCIDeviceID);

            prbEncAddUInt32(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_GPUINFO_PMCBOOT0,
                            pGpu->chipId0);

            prbEncAddUInt32(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_GPUINFO_SUBDEV_ID,
                            pGpu->idInfo.PCISubDeviceID);
        }
    }

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_GPUINFO_NUM_GPUS,
                    numGpus);

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_GPU_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    //OS Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_OS_INFO),
        External_Cleanup);

    nvStatus = osGetVersionDump(pPrbEnc);
    if (nvStatus != NV_OK)
        goto External_Cleanup;

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_OS_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    // Driver Info
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_DRIVER_INFO),
        External_Cleanup);

    // Compare only up to the shorter of the two strings (incl. NUL).
    sizeStr = (sizeof("RELEASE") < sizeof(NV_DISPLAY_DRIVER_TITLE) ?
               sizeof("RELEASE") :
               sizeof(NV_DISPLAY_DRIVER_TITLE));

    if (portMemCmp(NV_DISPLAY_DRIVER_TITLE, "RELEASE", sizeStr) == 0)
        bRelease = NV_TRUE;
    else
        bRelease = NV_FALSE;

    prbEncAddBool(pPrbEnc,
                  NVDEBUG_SYSTEMINFO_DRIVERINFO_IS_RELEASE,
                  bRelease);

    prbEncAddString(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_DRIVERINFO_VERSION,
                    NV_VERSION_STRING);

    prbEncAddString(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_DRIVERINFO_BRANCH,
                    NV_BUILD_BRANCH_VERSION);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_DRIVERINFO_CHANGELIST,
                    NV_LAST_OFFICIAL_CHANGELIST_NUM);

    // Only write previous driver version if loaded more than once.
    if (pRcDB->driverLoadCount > 1)
    {
        if (pRcDB->previousDriverVersion != NULL)
        {
            prbEncAddString(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_VERSION,
                            pRcDB->previousDriverVersion);
        }

        if (pRcDB->previousDriverBranch != NULL)
        {
            prbEncAddString(pPrbEnc,
                            NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_BRANCH,
                            pRcDB->previousDriverBranch);
        }

        prbEncAddUInt32(pPrbEnc,
                        NVDEBUG_SYSTEMINFO_DRIVERINFO_PREVIOUS_CHANGELIST,
                        pRcDB->prevDriverChangelist);
    }

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_DRIVERINFO_LOAD_COUNT,
                    pRcDB->driverLoadCount);

    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_DRIVER_INFO
        prbEncNestedEnd(pPrbEnc),
        External_Cleanup);

    // Dump a table of
    // Master GPU -- gpuId
    // List all gpus involved by gpuIds
    portMemSet(bGpuDone, NV_FALSE, sizeof(bGpuDone));
    for (gpu = 0; gpu < NV_MAX_DEVICES; gpu++)
    {
        pGpu = gpumgrGetGpu(gpu);

        if ((pGpu) && (bGpuDone[gpu] == NV_FALSE))
        {
            pParent = gpumgrGetParentGPU(pGpu);

            NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
                prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_GPU_CONFIG),
                External_Cleanup);

            prbEncAddUInt32(pPrbEnc, NVDEBUG_SYSTEMINFO_CONFIG_MASTER_ID, pParent->gpuId);
            gpuMask = gpumgrGetGpuMask(pGpu);
            gpuIndex = 0;
            pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
            while (pGpu)
            {
                prbEncAddUInt32(pPrbEnc, NVDEBUG_SYSTEMINFO_CONFIG_GPU_ID, pGpu->gpuId);

                // gpuIndex is either the next or the MAX
                bGpuDone[gpuIndex - 1] = NV_TRUE;
                pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex);
            }

            NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR, // NVDEBUG_SYSTEMINFO_GPU_CONFIG
                prbEncNestedEnd(pPrbEnc),
                External_Cleanup);
        }
    }

    // Error state
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_SYSTEMINFO_ERROR_STATE),
        External_Cleanup);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_ERRORSTATE_BUGCHECK_CODE,
                    pNvDumpState->bugCheckCode);

    prbEncAddBool(pPrbEnc,
                  NVDEBUG_SYSTEMINFO_ERRORSTATE_GOT_RM_LOCK,
                  pNvDumpState->bRMLock);

    prbEncAddUInt32(pPrbEnc,
                    NVDEBUG_SYSTEMINFO_ERRORSTATE_DUMP_BUFFER_SIZE,
                    pNvDumpState->initialbufferSize);

    //
    // prbEncNestedEnd for NVDEBUG_SYSTEMINFO_ERROR_STATE and
    // NVDEBUG_NVDUMP_SYSTEM_INFO are handled by prbEncUnwindNesting.
    //

External_Cleanup:
    // Unwind the protobuf to the correct depth.
    NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(nvStatus, LEVEL_ERROR,
        prbEncUnwindNesting(pPrbEnc, startingDepth));

    return nvStatus;
}

//
// Routine to dump RcDB Debug Info
//
/*!
 * @brief Dump the RM journal and (when a GPU is present) the RC error
 *        counters into the encoder.
 *
 * @return NV_OK always; encoding problems are logged, not propagated.
 */
NV_STATUS
rcdbDumpSystemFunc_IMPL
(
    OBJRCDB *pRcDB,
    PRB_ENCODER *pPrbEnc,
    NVD_STATE *pNvDumpState
)
{
    // Any GPU works here; it is only used to resolve driver/code addresses.
    OBJGPU *pGpu = gpumgrGetSomeGpu();

    switch (DRF_VAL(_NVD, _ERROR_CODE, _MAJOR, pNvDumpState->internalCode))
    {
        case NVD_GPU_GENERATED:
        case NVD_SKIP_ZERO:
            // don't report on these internal codes.
            return NV_OK;
            break;
    }

    rcdbDumpJournal(pRcDB, pGpu, pPrbEnc, pNvDumpState, NVDEBUG_NVDUMP_DCL_MSG);
    if (pGpu != NULL)
    {
        rcdbDumpErrorCounters(pRcDB, pGpu, pPrbEnc);
    }
    else
    {
        NV_PRINTF(LEVEL_WARNING,
                  "no GPU - won't dump ring buffers or journal\n");
    }

    return NV_OK;
}

/*!
 * @brief Merge the recorded error-history elements into the time-sorted
 *        dump list. Only protobuf-formatted elements are understood.
 *
 * @return NV_OK (the local status is never changed after init).
 */
static NvU32
_rcdbInsertErrorHistoryToList(RmRCCommonJournal_RECORD *pList, NVD_STATE *pNvDumpState)
{
    OBJSYS *pSys = SYS_GET_INSTANCE();
    Journal *pRcDB = SYS_GET_RCDB(pSys);
    SYS_ERROR_INFO *pSysErrorInfo = &pRcDB->ErrorInfo;
    RMPRBERRORELEMENT_V2* pPrbErrorElement;
    RMCD_ERROR_BLOCK* pErrorBlock;
    NV_STATUS status = NV_OK;

    //
    // If we are called from the OCA dump, make sure we have the rm lock.
    // TO DO: Try to dump as much as possible without the lock.
1623 // 1624 if (!pNvDumpState->bRMLock) 1625 return NV_OK; 1626 1627 // Get Past Exceptions 1628 pPrbErrorElement = (RMPRBERRORELEMENT_V2*)pSysErrorInfo->pErrorList; 1629 while (NULL != pPrbErrorElement) 1630 { 1631 pErrorBlock = pPrbErrorElement->ErrorHeader.pErrorBlock; 1632 switch (pPrbErrorElement->RmPrbErrorData.common.Header.cRecordType) 1633 { 1634 case RmPrbErrorInfo_V2: 1635 _rcdbInsertJournalRecordToList (pList, &(pPrbErrorElement->RmPrbErrorData.common)); 1636 break; 1637 1638 case RmPrbFullDump_V2: 1639 // 1640 // Full crash dumps are a single NvDebug.NvDump message, and 1641 // should be contained in a single block. 1642 // 1643 if (pErrorBlock != NULL) 1644 { 1645 if (pErrorBlock->pNext != NULL) 1646 { 1647 NV_PRINTF(LEVEL_WARNING, 1648 "only one error block expected!\n"); 1649 } 1650 _rcdbInsertJournalRecordToList (pList, &(pPrbErrorElement->RmPrbErrorData.common)); 1651 } 1652 break; 1653 default: 1654 // Can only handle protobuf formatted messages 1655 NV_PRINTF(LEVEL_ERROR, "unknown error element type: %d\n", 1656 pPrbErrorElement->RmPrbErrorData.common.Header.cRecordType); 1657 break; 1658 } 1659 pPrbErrorElement = (RMPRBERRORELEMENT_V2*)pPrbErrorElement->ErrorHeader.pNextError; 1660 } 1661 return status; 1662 } 1663 1664 static void 1665 _rcdbDumpCommonJournalRecord 1666 ( 1667 PRB_ENCODER *pPrbEnc, 1668 const PRB_FIELD_DESC *pFieldDesc, 1669 RmRCCommonJournal_RECORD *pRec 1670 ) 1671 { 1672 NV_STATUS nvStatus = NV_OK; 1673 1674 NV_CHECK_OK(nvStatus, LEVEL_ERROR, 1675 prbEncNestedStart(pPrbEnc, pFieldDesc)); 1676 1677 if (nvStatus == NV_OK) 1678 { 1679 if (pRec->timeStamp != 0) 1680 prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_TIME_STAMP, pRec->timeStamp); 1681 if (pRec->GPUTag != 0) 1682 prbEncAddUInt32(pPrbEnc, JOURNAL_COMMON_GPU_TAG, pRec->GPUTag); 1683 if (pRec->CPUTag != 0) 1684 prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_CPU_TAG, pRec->CPUTag); 1685 if (pRec->stateMask != 0) 1686 prbEncAddUInt64(pPrbEnc, JOURNAL_COMMON_STATE_MASK, 
pRec->stateMask); 1687 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc)); 1688 } 1689 } 1690 1691 static void 1692 rcdbDumpCommonAssertRecord 1693 ( 1694 PRB_ENCODER *pPrbEnc, 1695 NVD_STATE *pNvDumpState, 1696 RmRCCommonAssert_RECORD *pRec, 1697 NvU32 type 1698 ) 1699 { 1700 NvU32 i; 1701 1702 prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_TYPE, type); 1703 1704 if (pRec->lastTimeStamp != 0) 1705 prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_LAST_TIME_STAMP, pRec->lastTimeStamp); 1706 1707 prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_BREAKPOINT_ADDR_HINT, pRec->breakpointAddrHint); 1708 1709 // if there is a line number, add it to the message. 1710 if (pRec->lineNum != NV_RM_ASSERT_UNKNOWN_LINE_NUM) 1711 prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_SOURCE_LINE, pRec->lineNum); 1712 1713 if (pRec->count != 1) 1714 prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_COUNT, pRec->count); 1715 1716 for (i = 0; i < NV_ARRAY_ELEMENTS(pRec->callStack); i++) 1717 { 1718 if (pRec->callStack[i] == 0) 1719 break; 1720 1721 prbEncAddUInt64(pPrbEnc, JOURNAL_ASSERT_CALL_STACK, pRec->callStack[i]); 1722 } 1723 } 1724 1725 static NV_STATUS 1726 _rcdbDumpDclMsgRecord( 1727 PRB_ENCODER *pPrbEnc, 1728 NVD_STATE *pNvDumpState, 1729 const PRB_FIELD_DESC *pFieldDesc, 1730 RmRCCommonJournal_RECORD *pDclRecord 1731 ) 1732 { 1733 NV_STATUS nvStatus = NV_OK; 1734 1735 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, 1736 prbEncNestedStart(pPrbEnc, pFieldDesc)); 1737 1738 _rcdbDumpCommonJournalRecord(pPrbEnc, DCL_DCLMSG_COMMON, pDclRecord); 1739 1740 switch (pDclRecord->Header.cRecordType) 1741 { 1742 case RmRC2SwDbgBreakpoint_V3: 1743 case RmRC2SwRmAssert_V3: 1744 { 1745 RmRC2SwRmAssert3_RECORD* pRecord = (RmRC2SwRmAssert3_RECORD*)pDclRecord; 1746 1747 NV_CHECK_OK(nvStatus, LEVEL_ERROR, 1748 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_ASSERT)); 1749 if (nvStatus == NV_OK) 1750 { 1751 rcdbDumpCommonAssertRecord(pPrbEnc, pNvDumpState, 1752 &pRecord->commonAssert, pDclRecord->Header.cRecordType); 1753 1754 
prbEncAddUInt32(pPrbEnc, JOURNAL_ASSERT_LEVEL, pRecord->level); 1755 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc)); 1756 } 1757 break; 1758 } 1759 case RmRC2GpuTimeout_V3: 1760 { 1761 RmRC2GpuTimeout3_RECORD* pRecord = (RmRC2GpuTimeout3_RECORD*)pDclRecord; 1762 1763 NV_CHECK_OK(nvStatus, LEVEL_ERROR, 1764 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_ASSERT)); 1765 if (nvStatus == NV_OK) 1766 { 1767 rcdbDumpCommonAssertRecord(pPrbEnc, pNvDumpState, pRecord, pDclRecord->Header.cRecordType); 1768 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc)); 1769 } 1770 break; 1771 } 1772 case RmBadRead_V2: 1773 { 1774 RmRC2BadRead2_RECORD* pRecord = (RmRC2BadRead2_RECORD*)pDclRecord; 1775 1776 NV_CHECK_OK(nvStatus, LEVEL_ERROR, 1777 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_BADREAD)); 1778 if (nvStatus == NV_OK) 1779 { 1780 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_MEMORY_SPACE, pRecord->MemorySpace); 1781 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_OFFSET, pRecord->Offset); 1782 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_MASK, pRecord->Mask); 1783 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_VALUE, pRecord->Value); 1784 prbEncAddUInt32(pPrbEnc, JOURNAL_BADREAD_REASON, pRecord->Reason); 1785 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc)); 1786 } 1787 break; 1788 } 1789 case RmDclMsg: 1790 { 1791 RM_DATA_COLLECTION_RECORD *pRecord = (RM_DATA_COLLECTION_RECORD*) pDclRecord; 1792 // Add the bytes after RM_DATA_COLLECTION_RECORD 1793 prbEncAddBytes(pPrbEnc, pRecord->fieldDesc, (void *) (pRecord + 1), 1794 pRecord->common.Header.wRecordSize - sizeof(*pRecord)); 1795 break; 1796 } 1797 case RmJournalEngDump: 1798 { 1799 RM_DATA_COLLECTION_RECORD *pRecord = (RM_DATA_COLLECTION_RECORD*) pDclRecord; 1800 // Add the bytes after RM_DATA_COLLECTION_RECORD 1801 prbEncCatMsg(pPrbEnc, (void *)(pRecord + 1), 1802 pRecord->common.Header.wRecordSize - sizeof(*pRecord)); 1803 break; 1804 } 1805 case RmJournalBugCheck: 1806 { 1807 RmJournalBugcheck_RECORD* pRecord 
= (RmJournalBugcheck_RECORD*)pDclRecord; 1808 NV_CHECK_OK(nvStatus, LEVEL_ERROR, 1809 prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_BUGCHECK)); 1810 if (nvStatus == NV_OK) 1811 { 1812 prbEncAddUInt32(pPrbEnc, JOURNAL_BUGCHECK_CODE, pRecord->bugCheckCode); 1813 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc)); 1814 } 1815 break; 1816 } 1817 case RmPrbErrorInfo_V2: 1818 case RmPrbFullDump_V2: 1819 { 1820 RMPRBERRORELEMENT_V2* pRecord = (RMPRBERRORELEMENT_V2*)((NvU8 *)pDclRecord 1821 - NV_OFFSETOF(RMPRBERRORELEMENT_V2, RmPrbErrorData)); 1822 RMCD_ERROR_BLOCK* pErrorBlock; 1823 1824 for (pErrorBlock = pRecord->ErrorHeader.pErrorBlock; 1825 (pErrorBlock != NULL); pErrorBlock = pErrorBlock->pNext) 1826 { 1827 prbEncCatMsg(pPrbEnc, (void *)pErrorBlock->pBlock, 1828 pErrorBlock->blockSize); 1829 } 1830 break; 1831 } 1832 case RmNocatReport: 1833 { 1834 // currently not added to the OCA dump 1835 break; 1836 } 1837 1838 default: 1839 // These are the only ones we know about 1840 NV_PRINTF(LEVEL_ERROR, 1841 "unknown Dcl Record entry type: %d\n", 1842 pDclRecord->Header.cRecordType); 1843 break; 1844 } 1845 1846 NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc)); 1847 return 0; 1848 } 1849 1850 static NvU32 1851 _rcdbInsertJournalRecordToList (RmRCCommonJournal_RECORD *pList, RmRCCommonJournal_RECORD *pRecord) 1852 { 1853 RmRCCommonJournal_RECORD *pCurrentRecord = pList; 1854 RmRCCommonJournal_RECORD *pNextRecord; 1855 1856 if ((NULL != pList) && (NULL != pRecord)) 1857 { 1858 for (pNextRecord = (RmRCCommonJournal_RECORD *)pList->pNext; pNextRecord != pList; pNextRecord = (RmRCCommonJournal_RECORD *)pNextRecord->pNext) 1859 { 1860 if (pRecord->timeStamp < pNextRecord->timeStamp) 1861 { 1862 break; 1863 } 1864 pCurrentRecord = pNextRecord; 1865 } 1866 pRecord->pNext = pCurrentRecord->pNext; 1867 pCurrentRecord->pNext = (NvU8 *)pRecord; 1868 } 1869 return 0; 1870 } 1871 1872 // Todo: format the records into a protobuf DCL record at the source 1873 static 
NvU32 1874 rcdbInsertRingBufferToList( 1875 Journal *pRcDB, 1876 RmRCCommonJournal_RECORD *pList, 1877 RING_BUFFER_LOG *pRingBuffer 1878 ) 1879 { 1880 RmRCCommonJournal_RECORD *pCommon; 1881 NvU32 recordSize; 1882 NvU32 i; 1883 1884 recordSize = rcdbGetOcaRecordSizeWithHeader(pRcDB, pRingBuffer->entryType); 1885 1886 // 1887 // Order does not matter here because the record will be inserted into the 1888 // list based on the time of the record, not its postion in the buffer. 1889 // 1890 for (i = 0; i < pRingBuffer->numEntries; i++) 1891 { 1892 pCommon = (RmRCCommonJournal_RECORD *)(((NvU8 *)pRingBuffer->pBuffer) + (recordSize * i)); 1893 1894 _rcdbInsertJournalRecordToList (pList, pCommon); 1895 } 1896 1897 return 0; // return value should be discarded 1898 } 1899 1900 static NvU32 1901 rcdbInsertRingBufferCollectionToList( 1902 Journal *pRcDB, 1903 RmRCCommonJournal_RECORD *pList) 1904 { 1905 RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl; 1906 RING_BUFFER_LOG *pCurrentBuffer; 1907 NvU32 i; 1908 1909 1910 pCurrentBuffer = pRingBufferColl->pFirstEntry; 1911 for (i = 0; i < pRingBufferColl->NumRingBuffers; i++) 1912 { 1913 NvU32 recSize = pCurrentBuffer->bufferSize; 1914 1915 NV_ASSERT(pCurrentBuffer->maxEntries * 1916 rcdbGetOcaRecordSizeWithHeader(pRcDB, pCurrentBuffer->entryType) == 1917 pCurrentBuffer->bufferSize); 1918 1919 if (recSize > 0) 1920 { 1921 rcdbInsertRingBufferToList (pRcDB, pList, pCurrentBuffer); 1922 } 1923 pCurrentBuffer = pCurrentBuffer->pNextRingBuffer; 1924 } 1925 1926 // Assert that we traversed through the entire list. 
    NV_ASSERT(pCurrentBuffer == NULL);

    // return value should be ignored
    return 0;
}

/*!
 * @brief Dump the RM journal: RVA header, ring-buffer records, error
 *        history and flat-buffer records, merged into a single
 *        time-sorted list and encoded as DCL messages.
 *
 * @param[in] pRcDB       Journal object.
 * @param[in] pGpu        GPU used for driver-block info and timeouts; may be NULL.
 * @param[in] pPrbEnc     Active protobuf encoder.
 * @param[in] pNvDumpState Dump state for this invocation.
 * @param[in] pFieldDesc  Field each DCL message is nested under.
 *
 * @return 0 always (callers ignore the value).
 */
NvU32
rcdbDumpJournal_IMPL
(
    OBJRCDB *pRcDB,
    OBJGPU *pGpu,
    PRB_ENCODER *pPrbEnc,
    NVD_STATE *pNvDumpState,
    const PRB_FIELD_DESC *pFieldDesc
)
{
    OS_DRIVER_BLOCK DriverBlock;
    EVENT_JOURNAL *pJournal = &pRcDB->Journal;
    NvU8 *pJournalBuff = pJournal->pBuffer;
    RmRCCommonJournal_RECORD *pRecord;
    NvU32 recSize;
    NV_STATUS nvStatus = NV_OK;
    RmRCCommonJournal_RECORD List;

    // It is OK to dump the journal entries without the RM lock.
    // No need to check pRcDB->nvDumpState.bNoRMLock;

    // bytes of the flat journal buffer actually in use.
    recSize = pJournal->BufferSize - pJournal->BufferRemaining;

    if (NULL != pGpu)
    {
        //
        // Add RVA Header, even when there are no journal records.
        // This header is required to resolve code addresses using the PDB file.
        // We can log code addresses outside of the journal entries.
        //
        NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedStart(pPrbEnc, pFieldDesc));
        if (nvStatus == NV_OK)
        {
            NV_CHECK_OK(nvStatus, LEVEL_ERROR,
                prbEncNestedStart(pPrbEnc, DCL_DCLMSG_JOURNAL_RVAHEADER));
            if (nvStatus == NV_OK)
            {
                portMemSet(&DriverBlock, 0x00, sizeof(DriverBlock));
                osGetDriverBlock(pGpu->pOsGpuInfo, &DriverBlock);
                prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_DRIVER_START, (NvU64)DriverBlock.driverStart);
                prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_OFFSET, DriverBlock.offset);
                // sizeof(pJournal) reports the build's pointer width (4/8).
                prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_POINTER_SIZE, sizeof(pJournal));
                prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_UNIQUE_ID_HIGH, *((NvU64*) DriverBlock.unique_id));
                prbEncAddUInt64(pPrbEnc, JOURNAL_RVAHEADER_UNIQUE_ID_LOW, *((NvU64*) (DriverBlock.unique_id + 8)));
                prbEncAddUInt32(pPrbEnc, JOURNAL_RVAHEADER_AGE, DriverBlock.age);
                NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
            }
            NV_CHECK_OK(nvStatus, LEVEL_ERROR, prbEncNestedEnd(pPrbEnc));
        }
    }

    // init the list to an empty state
    portMemSet(&List, 0x00, sizeof(List));
    List.pNext = (NvU8 *)&List;

    //
    // Don't dump the ring buffers if something is adding to them.
    // If we can dump the ring buffers, hold the lock for them until the
    // dump is complete to ensure that a record is not changed mid-dump.
    //
    if (portAtomicIncrementS32(&concurrentRingBufferAccess) != 1)
    {
        //
        // If IRQL is low, spin until it gets available
        //
        if (!osIsRaisedIRQL() && (NULL != pGpu))
        {
            RMTIMEOUT timeout;
            NV_STATUS status = NV_OK;
            gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
            do {
                // back off, then retry the increment below.
                portAtomicDecrementS32(&concurrentRingBufferAccess);

                if (NV_ERR_TIMEOUT == status)
                {
                    NV_PRINTF(LEVEL_ERROR,
                              "timed out waiting for Rm journal ring buffer to be available\n");
                    DBG_BREAKPOINT();
                    return 0;
                }
                status = gpuCheckTimeout(pGpu, &timeout);
                osSpinLoop();
            } while (portAtomicIncrementS32(&concurrentRingBufferAccess) != 1);
        }
        else
        {
            // NOTE(review): on this path the dump proceeds anyway with the
            // counter still raised -- records could change mid-dump.
            NV_ASSERT_FAILED("Ring Buffer unavailable for dump at high irql.");
        }
    }

    rcdbInsertRingBufferCollectionToList (pRcDB, &List);

    _rcdbInsertErrorHistoryToList(&List, pNvDumpState);

    // Skip if size is smaller than a header
    while (recSize > sizeof(RmRCCommonJournal_RECORD))
    {
        pRecord = (RmRCCommonJournal_RECORD *)pJournalBuff;

        if (pRecord->Header.cRecordGroup != RmGroup)
        {
            // We only log RM related data
            NV_ASSERT(pRecord->Header.cRecordGroup == RmGroup);
            break;
        }

        // Just a safety net...
        if (pRecord->Header.wRecordSize > recSize)
        {
            break;
        }
        _rcdbInsertJournalRecordToList (&List, pRecord);

        recSize -= pRecord->Header.wRecordSize;
        pJournalBuff += pRecord->Header.wRecordSize;
    }


    // dump out the records that have been added to the list.
    // Walk the circular list until we come back around to the sentinel.
    for (pRecord = (RmRCCommonJournal_RECORD *)List.pNext; pRecord != &List; pRecord = (RmRCCommonJournal_RECORD *)pRecord->pNext)
    {
        _rcdbDumpDclMsgRecord(pPrbEnc, pNvDumpState, pFieldDesc, pRecord);
    }
    // Release the ring-buffer access gate taken above.
    portAtomicDecrementS32(&concurrentRingBufferAccess);

    // return value should be ignored
    return 0;
}

/*!
 * @brief Encode every valid RC error counter as an RCCounter DCL message.
 *
 * @param[in] pRcDB   Journal object holding rcErrorCounterArray.
 * @param[in] pGpu    GPU (unused here, kept for interface parity).
 * @param[in] pPrbEnc Active protobuf encoder.
 *
 * @return 0 always (callers ignore the value).
 */
NvU32
rcdbDumpErrorCounters_IMPL(Journal *pRcDB, OBJGPU *pGpu, PRB_ENCODER *pPrbEnc)
{
    NvU32 i;
    NvU32 rcErrTyp = RC_ERROR_COUNTER_TYPE_INVALID;
    NV_STATUS nvStatus = NV_OK;
    NvU8 startingDepth = prbEncNestingLevel(pPrbEnc);

    // Opens NVDEBUG_NVDUMP_DCL_MSG
    NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
        prbEncNestedStart(pPrbEnc, NVDEBUG_NVDUMP_DCL_MSG),
        cleanupAndExit);

    for (i = 0; i <= RC_ERROR_COUNTER_OTHER_INDEX; i++)
    {
        // For Counters
        rcErrTyp = pRcDB->rcErrorCounterArray[i].rcErrorType;
        if (rcErrTyp != RC_ERROR_COUNTER_TYPE_INVALID)
        {
            NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
                prbEncNestedStart(pPrbEnc, DCL_DCLMSG_RCCOUNTER),
                cleanupAndExit);

            // Write Power Event
            prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_RCERRORTYPE, rcErrTyp);

            // Write Power State
            prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_COUNT, pRcDB->rcErrorCounterArray[i].rcErrorCount);

            // Dump the channel ID and the last time when this error occurred on this channel ID
            prbEncAddUInt32(pPrbEnc, RC_RCCOUNTER_RCLASTCHID, pRcDB->rcErrorCounterArray[i].rcLastCHID);
            prbEncAddUInt64(pPrbEnc, RC_RCCOUNTER_RCLASTTIME, pRcDB->rcErrorCounterArray[i].rcLastTime);

            NV_CHECK_OK_OR_GOTO(nvStatus, LEVEL_ERROR,
                prbEncNestedEnd(pPrbEnc),
                cleanupAndExit);
        }
    } // For Counters

    // Close NVDEBUG_NVDUMP_DCL_MSG handled by prbEncUnwindNesting.
2102 2103 cleanupAndExit: 2104 // Unwind the protobuff to inital depth 2105 NV_CHECK_OK_OR_CAPTURE_FIRST_ERROR(nvStatus, LEVEL_ERROR, 2106 prbEncUnwindNesting(pPrbEnc, startingDepth)); 2107 2108 return 0; 2109 } 2110 2111 static void 2112 _rcdbAddRmGpuDumpCallback 2113 ( 2114 void *pData 2115 ) 2116 { 2117 OBJSYS *pSys = SYS_GET_INSTANCE(); 2118 NV_STATUS status; 2119 2120 NvU32 gpuInstance = *((NvU32 *)pData); 2121 status = osAcquireRmSema(pSys->pSema); 2122 if (status == NV_OK) 2123 { 2124 // LOCK: acquire API lock 2125 status = rmapiLockAcquire(API_LOCK_FLAGS_NONE, RM_LOCK_MODULES_DIAG); 2126 if (status == NV_OK) 2127 { 2128 // LOCK: acquire GPUs lock 2129 status = rmGpuLocksAcquire(GPUS_LOCK_FLAGS_NONE, 2130 RM_LOCK_MODULES_DIAG); 2131 if (status == NV_OK) 2132 { 2133 Journal *pRcDB = SYS_GET_RCDB(pSys); 2134 OBJGPU *pGpu = gpumgrGetGpu(gpuInstance); 2135 2136 // 2137 // Mark the Journal object as in the deferred dump path so we won't 2138 // re-attempt again. 2139 // 2140 pRcDB->setProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH, NV_TRUE); 2141 2142 status = rcdbAddRmGpuDump(pGpu); 2143 NV_ASSERT(status == NV_OK); 2144 2145 pRcDB->setProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH, NV_FALSE); 2146 2147 // UNLOCK: release GPUs lock 2148 rmGpuLocksRelease(GPUS_LOCK_FLAGS_NONE, NULL); 2149 } 2150 else 2151 { 2152 NV_PRINTF(LEVEL_ERROR, "failed to acquire the GPU locks!\n"); 2153 } 2154 // UNLOCK: release API lock 2155 rmapiLockRelease(); 2156 } 2157 else 2158 { 2159 NV_PRINTF(LEVEL_ERROR, "failed to acquire the API lock!\n"); 2160 } 2161 osReleaseRmSema(pSys->pSema, NULL); 2162 } 2163 else 2164 { 2165 NV_PRINTF(LEVEL_ERROR, "failed to acquire the OS semaphore!\n"); 2166 } 2167 } 2168 2169 static NV_STATUS 2170 nvdDebuggerBufferCallback(void *pEncoder, NvBool bBufferFull) 2171 { 2172 if (bBufferFull) 2173 { 2174 nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_BUFFER_FULL; 2175 } 2176 else 2177 { 2178 nvDumpConfig.dumpStatus = 
NVDUMP_STATUS_DUMP_END_OF_MSG; 2179 } 2180 2181 return NV_OK; 2182 } 2183 2184 /*! 2185 * @brief NvDebug kernel debugger dump control 2186 * 2187 * Allows external kernel debuggers to control the RM's dump interface 2188 * without assuming anything about the current system state. 2189 * 2190 * WARNING! This function should never be called directly! 2191 * 2192 * If correctly setup, a kernel debugger will place a processor 2193 * hardware watchpoint on the nvDumpConfig.handshake variable. 2194 * Each time this is written to, the debugger will break and get a chance 2195 * to examine the rest of the nvDumpConfig state. 2196 * 2197 * @return This function should never return! External debugger should abort it! 2198 */ 2199 static void 2200 nvdDebuggerControlFunc(void) 2201 { 2202 OBJSYS *pSys = SYS_GET_INSTANCE(); 2203 Journal *pRcDB = SYS_GET_RCDB(pSys); 2204 OBJGPU *pGpu = NULL; 2205 NvDebugDump *pNvd = NULL; 2206 NVDUMP_BUFFER *pBuffer = (NVDUMP_BUFFER *)&nvDumpConfig.buffer; // discard volatile 2207 2208 // Process actions while debugger provides work to do. 
2209 while (nvDumpConfig.dumpStatus != NVDUMP_STATUS_IDLE) 2210 { 2211 nvDumpConfig.rmStatus = NV_OK; 2212 2213 NV_PRINTF(LEVEL_INFO, 2214 "Dump triggered: gpuSelect=%u, component=%u, dumpStatus=%u\n", 2215 nvDumpConfig.gpuSelect, nvDumpConfig.component, 2216 nvDumpConfig.dumpStatus); 2217 2218 if (NVDUMP_IS_GPU_COMPONENT(nvDumpConfig.component)) 2219 { 2220 pGpu = gpumgrGetGpu(nvDumpConfig.gpuSelect); 2221 pNvd = GPU_GET_NVD(pGpu); 2222 2223 switch (nvDumpConfig.dumpStatus) 2224 { 2225 case NVDUMP_STATUS_COUNT_REQUESTED: 2226 nvDumpConfig.rmStatus = nvdDumpComponent( 2227 pGpu, pNvd, nvDumpConfig.component, pBuffer, 2228 NVDUMP_BUFFER_COUNT, NULL); 2229 nvDumpConfig.dumpStatus = NVDUMP_STATUS_COUNT_COMPLETE; 2230 break; 2231 case NVDUMP_STATUS_DUMP_REQUESTED: 2232 nvDumpConfig.rmStatus = nvdDumpComponent( 2233 pGpu, pNvd, nvDumpConfig.component, pBuffer, 2234 NVDUMP_BUFFER_PROVIDED, &nvdDebuggerBufferCallback); 2235 nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_COMPLETE; 2236 break; 2237 default: 2238 NV_PRINTF(LEVEL_ERROR, "Invalid dumpStatus %u\n", 2239 nvDumpConfig.dumpStatus); 2240 nvDumpConfig.rmStatus = NV_ERR_INVALID_STATE; 2241 nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR; 2242 break; 2243 } 2244 } 2245 else if (NVDUMP_IS_SYS_COMPONENT(nvDumpConfig.component)) 2246 { 2247 switch (nvDumpConfig.dumpStatus) 2248 { 2249 case NVDUMP_STATUS_COUNT_REQUESTED: 2250 nvDumpConfig.rmStatus = rcdbDumpComponent(pRcDB, 2251 nvDumpConfig.component, pBuffer, 2252 NVDUMP_BUFFER_COUNT, NULL); 2253 nvDumpConfig.dumpStatus = NVDUMP_STATUS_COUNT_COMPLETE; 2254 break; 2255 case NVDUMP_STATUS_DUMP_REQUESTED: 2256 nvDumpConfig.rmStatus = rcdbDumpComponent(pRcDB, 2257 nvDumpConfig.component, pBuffer, 2258 NVDUMP_BUFFER_PROVIDED, &nvdDebuggerBufferCallback); 2259 nvDumpConfig.dumpStatus = NVDUMP_STATUS_DUMP_COMPLETE; 2260 break; 2261 default: 2262 NV_PRINTF(LEVEL_ERROR, "Invalid dumpStatus %u\n", 2263 nvDumpConfig.dumpStatus); 2264 nvDumpConfig.rmStatus = NV_ERR_INVALID_STATE; 2265 
nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR; 2266 2267 break; 2268 } 2269 } 2270 else 2271 { 2272 NV_PRINTF(LEVEL_ERROR, "Invalid component %u\n", 2273 nvDumpConfig.component); 2274 nvDumpConfig.rmStatus = NV_ERR_INVALID_PARAM_STRUCT; 2275 nvDumpConfig.dumpStatus = NVDUMP_STATUS_ERROR; 2276 } 2277 } 2278 2279 // Ensure we really don't exit this function without debugger. 2280 while (1) 2281 { 2282 NV_PRINTF(LEVEL_ERROR, "Should never reach this point!\n"); 2283 DBG_BREAKPOINT(); 2284 } 2285 } 2286 2287 /*! 2288 * @brief Release Build NV_ASSERT function 2289 * 2290 * @details Called by NV_ASSERT when the assertion fails. 2291 * By putting this logic in its own function, we save on binary size. 2292 */ 2293 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX) || RMCFG_FEATURE_PLATFORM_GSP) && !defined(NV_MODS) 2294 static void _rcdbRmAssert(NvU32 level, NvU32 lineNum, NvU64 ip) 2295 { 2296 RmRC2SwRmAssert3_RECORD* pRec = NULL; 2297 if (rcdbAddAssertJournalRecWithLine(NULL, lineNum, (void **)&pRec, RmGroup, 2298 RmRC2SwRmAssert_V3, sizeof(RmRC2SwRmAssert3_RECORD), 2299 level, ip) == NV_OK) 2300 { 2301 pRec->level = level; 2302 } 2303 2304 #if !defined(DEBUG) && !defined(QA_BUILD) 2305 { 2306 OBJSYS *pSys = SYS_GET_INSTANCE(); 2307 2308 // Add assert to NvLog. But skip when nvLog asserts to avoid stack overflow. 2309 if (portAtomicIncrementS32(&nvLogRecursion) == 1) 2310 { 2311 // check for GPU lost. 2312 rcdProbeAllGpusPresent(ip); 2313 } 2314 portAtomicDecrementS32(&nvLogRecursion); 2315 2316 if ((pSys != NULL) && ((NV_DEBUG_BREAK_ATTRIBUTES_ASSERT) & 2317 DRF_VAL(_DEBUG, _BREAK, _ATTRIBUTES, pSys->debugFlags))) 2318 { 2319 REL_DBG_BREAKPOINT_MSG("NVRM-RC: Nvidia Release NV_ASSERT Break\n"); 2320 } 2321 } 2322 2323 // If enabled bugcheck on assert 2324 osDbgBugCheckOnAssert(); 2325 2326 #endif 2327 } 2328 2329 // 2330 // Some param-less wrappers for rcdbXxxEx() functions. 
// If the params are not needed, calling these functions saves on binary size
//

// Wrapper: release-build NV_ASSERT with no status; records level 0.
void rcdbRmAssert(NvU32 LineNum, NvU64 ip) { _rcdbRmAssert(0, LineNum, ip); }

// Wrapper: release-build NV_ASSERT carrying the failing NV_STATUS as the level.
void rcdbRmAssertStatus(NvU32 status, NvU32 LineNum, NvU64 ip) { _rcdbRmAssert(status, LineNum, ip); }

#endif // (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX) || RMCFG_FEATURE_PLATFORM_GSP) && !defined(NV_MODS)

#if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)

/*!
 * @brief Release Build DBGBREAKPOINT() function
 *
 * @details Called by DBGBREAKPOINT when the assertion fails.
 *          By putting this logic in its own function, we save on binary size.
 *
 * @param[in] pGpu    GPU associated with the breakpoint (may be NULL).
 * @param[in] lineNum source line number of the breakpoint site.
 * @param[in] level   status/level value recorded in the journal entry.
 * @param[in] ip      instruction pointer of the breakpoint site.
 */
static void _rcdbDbgBreakEx(void *pGpu, NvU32 lineNum, NvU32 level, NvU64 ip)
{
    RmRC2SwRmAssert3_RECORD* pRec = NULL;

    // Record the breakpoint in the RM journal; on success stamp the level in.
    if (rcdbAddAssertJournalRecWithLine(pGpu, lineNum, (void**)&pRec, RmGroup,
        RmRC2SwDbgBreakpoint_V3, sizeof(RmRC2SwRmAssert3_RECORD), level, ip) == NV_OK)
    {
        pRec->level = level;
    }

#if !defined(DEBUG) && !defined(QA_BUILD)
    {
        OBJSYS *pSys = SYS_GET_INSTANCE();

        // Add assert to NvLog. But skip when nvLog asserts to avoid stack overflow.
        if (portAtomicIncrementS32(&nvLogRecursion) == 1)
        {
            NV_PRINTF(LEVEL_NOTICE, "Breakpoint at 0x%llx.\n", ip);
        }
        portAtomicDecrementS32(&nvLogRecursion);

        // Optionally trigger a real debug break if the system debug flags ask for it.
        if ((pSys != NULL) && ((NV_DEBUG_BREAK_ATTRIBUTES_DBG_BREAK) &
            DRF_VAL(_DEBUG, _BREAK, _ATTRIBUTES, pSys->debugFlags)))
        {
            REL_DBG_BREAKPOINT_MSG("NVRM-RC: Nvidia Release Debug Break\n");
        }
    }
#endif

    // If enabled bugcheck on assert
    osDbgBugCheckOnAssert();
}

// Param-less wrappers around _rcdbDbgBreakEx; save binary size at the call site.
void rcdbDbgBreak(NvU64 ip) { _rcdbDbgBreakEx(NULL, NV_RM_ASSERT_UNKNOWN_LINE_NUM, 0, ip); }
void rcdbDbgBreakGpu(void *pGpu, NvU64 ip) { _rcdbDbgBreakEx(pGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, 0, ip); }
void rcdbDbgBreakStatus(NvU32 status, NvU64 ip) { _rcdbDbgBreakEx(NULL, NV_RM_ASSERT_UNKNOWN_LINE_NUM, status, ip); }
void rcdbDbgBreakEx(void *pGpu, NvU32 status, NvU64 ip) { _rcdbDbgBreakEx(pGpu, NV_RM_ASSERT_UNKNOWN_LINE_NUM, status, ip); }

#endif

/*!
 * @brief Runs an NvDebug dump for one component of a GPU and stores the
 *        resulting buffer in the RM journal as an RmJournalEngDump record.
 *
 * @param[in] pGpu       GPU to dump.
 * @param[in] component  NvDebug component id passed through to nvdDumpComponent.
 *
 * @return NV_OK on success; the nvdDumpComponent/rcdbAllocNextJournalRec
 *         failure status otherwise (also returned on size overflow).
 */
NV_STATUS
rcdbAddRmEngDump
(
    OBJGPU  *pGpu,
    NvU32    component
)
{
    OBJSYS                    *pSys = SYS_GET_INSTANCE();
    Journal                   *pRcDB = SYS_GET_RCDB(pSys);
    NvDebugDump               *pNvd = GPU_GET_NVD(pGpu);
    NVDUMP_BUFFER              nvDumpBuffer = {0};
    RM_DATA_COLLECTION_RECORD *pRec;
    NV_STATUS                  rmStatus;
    NvU16                      totalSize;

    nvDumpBuffer.size = NVDUMP_MAX_DUMP_SIZE;

    rmStatus = nvdDumpComponent(pGpu, pNvd, component, &nvDumpBuffer,
                                NVDUMP_BUFFER_ALLOCATE, NULL);
    if (rmStatus != NV_OK)
    {
        goto rcdbAddRmEngDump_error_handle;
    }

    totalSize = (NvU16)(nvDumpBuffer.curNumBytes + sizeof(*pRec));
    //align to 8 bytes to keep the readability of RM journal
    totalSize = (totalSize + 0x7) & ~0x7;
    // check for overflow (the NvU16 cast above truncates oversized dumps)
    if (((NvU32)totalSize) < nvDumpBuffer.curNumBytes + sizeof(*pRec))
    {
        goto rcdbAddRmEngDump_error_handle;
    }

    rmStatus = rcdbAllocNextJournalRec(pRcDB, (NVCD_RECORD **)&pRec, RmGroup,
                                       RmJournalEngDump, totalSize);
    if (rmStatus != NV_OK)
    {
        goto rcdbAddRmEngDump_error_handle;
    }
    rcdbSetCommonJournalRecord(pGpu, &pRec->common);

    // copy the dump buffer right after the RM_DATA_COLLECTION_RECORD struct
    portMemCopy((void *)(pRec + 1), nvDumpBuffer.curNumBytes, NvP64_VALUE(nvDumpBuffer.address), nvDumpBuffer.curNumBytes);

    pRec->fieldDesc = NVDEBUG_NVDUMP_GPU_INFO;

rcdbAddRmEngDump_error_handle:
    // The dump buffer was allocated by nvdDumpComponent; always release it.
    if (nvDumpBuffer.address != NvP64_NULL)
    {
        portMemFree(NvP64_VALUE(nvDumpBuffer.address));
    }

    return rmStatus;
}


// Finds the ring buffer for a corresponding type. Returns error if not allocated.
// On miss, *ppRingBuffer is set to NULL (only logged at INFO level).
static void
rcdbFindRingBufferForType
(
    Journal          *pRcDB,
    RMCD_RECORD_TYPE  recType,
    RING_BUFFER_LOG **ppRingBuffer
)
{
    NvU32 i;
    RING_BUFFER_LOG *pCurrentRingBuffer = NULL;
    RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl;

    NV_ASSERT(ppRingBuffer != NULL);
    *ppRingBuffer = NULL;

    //
    // Loop through our ring buffer collection, and find the
    // ring buffer corresponding to our type.
    //
    pCurrentRingBuffer = pRingBufferColl->pFirstEntry;
    for (i = 0; i < pRingBufferColl->NumRingBuffers; i++)
    {
        NV_ASSERT(pCurrentRingBuffer != NULL);
        if (pCurrentRingBuffer->entryType == recType)
        {
            *ppRingBuffer = pCurrentRingBuffer;
            return;
        }
        pCurrentRingBuffer = pCurrentRingBuffer->pNextRingBuffer;
    }

    NV_PRINTF(LEVEL_INFO, "Ring Buffer not found for type %d\n", recType);
    return;
}

//
// Creates a ring buffer capable of holding "maxEntries" number of entries, and
// adds it to the ring buffer collection.
// Returns a pointer to the created ring buffer so that individual modules can
// examine the data on-demand easily.
2482 // 2483 //PRINT_BUFFER_LOG 2484 NvU8 * 2485 rcdbCreateRingBuffer_IMPL 2486 ( 2487 Journal *pRcDB, 2488 RMCD_RECORD_TYPE type, 2489 NvU32 maxEntries 2490 ) 2491 { 2492 NV_STATUS status; 2493 RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl; 2494 RING_BUFFER_LOG *pRingBuffer; 2495 NvU8* pBuffer = NULL; 2496 NvU32 bufferSize, entrySize; 2497 2498 rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer); 2499 2500 entrySize = rcdbGetOcaRecordSizeWithHeader(pRcDB, type); 2501 if (entrySize == 0) 2502 { 2503 NV_ASSERT(entrySize != 0); 2504 return NULL; 2505 } 2506 2507 // We need to store maxEntries number of entries. Check for overflow too 2508 if (portSafeMulU32(maxEntries, entrySize, &bufferSize) == NV_FALSE) 2509 { 2510 return NULL; 2511 } 2512 2513 if (pRingBuffer != NULL) 2514 { 2515 NvU32 totalSize; 2516 2517 if (portSafeAddU32(bufferSize, pRingBuffer->bufferSize, &totalSize) == NV_FALSE) 2518 { 2519 return NULL; 2520 } 2521 2522 bufferSize = totalSize; 2523 pRingBuffer->refCount++; 2524 2525 // 2526 // XXX The collect-all design of the ring buffers allows for 2527 // interleaved entries for different GPUs. This makes it 2528 // hard to dynamically shrink any given ring buffer as GPUs are 2529 // torn down, and requires that an upper bound be placed on 2530 // the buffer's size. 2531 // 2532 // The upper bound, as chosen, is somewhat arbitrary, but at 2533 // the time of this writing, consistent with the use of 2534 // this interface (i.e. the number of entries for each type is 2535 // the same for each GPU). 
2536 // 2537 if (bufferSize > pRingBuffer->maxBufferSize) 2538 return NULL; 2539 } 2540 else 2541 { 2542 pRingBuffer = portMemAllocNonPaged(sizeof(RING_BUFFER_LOG)); 2543 if (pRingBuffer == NULL) 2544 { 2545 status = NV_ERR_NO_MEMORY; 2546 NV_ASSERT(status == NV_OK); 2547 return NULL; 2548 } 2549 2550 portMemSet(pRingBuffer, 0x00, sizeof(*pRingBuffer)); 2551 pRingBuffer->refCount = 1; 2552 } 2553 2554 pBuffer = portMemAllocNonPaged(bufferSize); 2555 if (pBuffer == NULL) 2556 { 2557 status = NV_ERR_NO_MEMORY; 2558 NV_ASSERT(status == NV_OK); 2559 pRingBuffer->refCount--; 2560 if (pRingBuffer->pBuffer == NULL) 2561 { 2562 portMemFree(pRingBuffer); 2563 } 2564 return NULL; 2565 } 2566 2567 // Now, initialize the entries the RING_BUFFER structure. 2568 pRingBuffer->maxEntries += maxEntries; 2569 2570 // Add the ring buffer to the beginning of the ring buffer collection. 2571 if (pRingBuffer->pBuffer == NULL) 2572 { 2573 if (portSafeMulU32(bufferSize, NV_MAX_DEVICES, &pRingBuffer->maxBufferSize) == NV_FALSE) 2574 { 2575 pRingBuffer->refCount--; 2576 if (pRingBuffer->pBuffer == NULL) 2577 { 2578 portMemFree(pRingBuffer); 2579 } 2580 2581 portMemFree(pBuffer); 2582 return NULL; 2583 } 2584 2585 pRingBuffer->maxBufferSize = (bufferSize * NV_MAX_DEVICES); 2586 pRingBuffer->entryType = type; 2587 pRingBuffer->pNextRingBuffer = pRingBufferColl->pFirstEntry; 2588 pRingBufferColl->pFirstEntry = pRingBuffer; 2589 pRingBufferColl->NumRingBuffers++; 2590 } 2591 else 2592 { 2593 NvU32 copySize; 2594 2595 if (portSafeSubU32(bufferSize, pRingBuffer->bufferSize, ©Size) == NV_FALSE) 2596 { 2597 pRingBuffer->refCount--; 2598 if (pRingBuffer->pBuffer == NULL) 2599 { 2600 portMemFree(pRingBuffer); 2601 } 2602 2603 portMemFree(pBuffer); 2604 return NULL; 2605 } 2606 2607 portMemCopy(pBuffer, copySize, pRingBuffer->pBuffer, copySize); 2608 portMemFree(pRingBuffer->pBuffer); 2609 } 2610 2611 pRingBuffer->bufferSize = bufferSize; 2612 pRingBuffer->pBuffer = pBuffer; 2613 return (NvU8 
*)pRingBuffer; 2614 } 2615 2616 void 2617 rcdbDestroyRingBuffer_IMPL 2618 ( 2619 Journal *pRcDB, 2620 RMCD_RECORD_TYPE type 2621 ) 2622 { 2623 RING_BUFFER_LOG_COLLECTION *pRingBufferColl = &pRcDB->RingBufferColl; 2624 RING_BUFFER_LOG *pRingBuffer, *pCurrentRingBuffer; 2625 NvU32 i; 2626 2627 rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer); 2628 if (pRingBuffer == NULL) 2629 return; 2630 2631 if (--pRingBuffer->refCount > 0) 2632 return; 2633 2634 pCurrentRingBuffer = pRingBufferColl->pFirstEntry; 2635 if (pCurrentRingBuffer == pRingBuffer) 2636 { 2637 pRingBufferColl->pFirstEntry = pCurrentRingBuffer->pNextRingBuffer; 2638 } 2639 else 2640 { 2641 for (i = 0; i < pRingBufferColl->NumRingBuffers; i++) 2642 { 2643 if (pCurrentRingBuffer->pNextRingBuffer == pRingBuffer) 2644 { 2645 pCurrentRingBuffer->pNextRingBuffer = 2646 pRingBuffer->pNextRingBuffer; 2647 break; 2648 } 2649 pCurrentRingBuffer = pCurrentRingBuffer->pNextRingBuffer; 2650 } 2651 } 2652 2653 portMemFree(pRingBuffer->pBuffer); 2654 portMemFree(pRingBuffer); 2655 2656 pRingBufferColl->NumRingBuffers--; 2657 } 2658 2659 /* 2660 ** _rcdbAllocRecFromRingBuffer allocates a buffer entry from the 2661 ** specified ring buffer. 2662 ** 2663 ** parameters: 2664 ** pGpu a pointer to the GPU object associated with the entry. 2665 ** pRcdb a pointer toe the Journal that contains the ring buffers 2666 ** type the record type to locate a buffer for. 2667 ** recordSize the size of the expected record 2668 ** 2669 ** notes: 2670 ** it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock. 2671 ** failure to do so can result in concurrency issues. 
*/
RmRCCommonJournal_RECORD *
_rcdbAllocRecFromRingBuffer
(
    OBJGPU           *pGpu,
    Journal          *pRcDB,
    RMCD_RECORD_TYPE  type
)
{
    RING_BUFFER_LOG *pRingBuffer = NULL;
    NvU32            newItemIndex;
    RmRCCommonJournal_RECORD *pCommon = NULL;

    // Find the ring buffer for this entry in the collection.
    rcdbFindRingBufferForType(pRcDB, type, &pRingBuffer);

    if (pRingBuffer == NULL)
    {
        NV_ASSERT(0);
        //
        // There is no ring buffer allocated for this type.
        // Nothing we can do about it.
        //
        return NULL;
    }

    // Next free slot: live entries occupy [headIndex, headIndex + numEntries) mod maxEntries.
    newItemIndex = (pRingBuffer->numEntries + pRingBuffer->headIndex) % pRingBuffer->maxEntries;

    // prepend the rmJournalCommon record to record.
    pCommon = (RmRCCommonJournal_RECORD *)(pRingBuffer->pBuffer + (rcdbGetOcaRecordSizeWithHeader(pRcDB, type) * newItemIndex));
    pCommon->Header.cRecordGroup = RmGroup;
    pCommon->Header.cRecordType = type;
    pCommon->Header.wRecordSize = (NvU16)rcdbGetOcaRecordSizeWithHeader(pRcDB, type);
    rcdbSetCommonJournalRecord(pGpu, pCommon);

    // Increment the number of entries or advance the head index.
    if (pRingBuffer->numEntries < pRingBuffer->maxEntries)
    {
        ++pRingBuffer->numEntries;
    }
    else
    {
        // Buffer is full: the slot computed above overwrites the oldest entry,
        // so advance the head (wrapping) to keep the window consistent.
        ++(pRingBuffer->headIndex);
        if (pRingBuffer->headIndex >= pRingBuffer->maxEntries)
        {
            pRingBuffer->headIndex = 0;
        }
    }
    return pCommon;
}

/*
** rcdbAddRecToRingBuffer_IMPL allocates a buffer entry from the
** specified ring buffer & copies the supplied data buffer into it.
**
** parameters:
**   pGpu        a pointer to the GPU object associated with the entry.
**   pRcdb       a pointer to the Journal that contains the ring buffers
**   type        the record type to locate a buffer for.
**   recordSize  the size of the expected record
**   pRecord     a pointer to the data that will populate the new ring buffer entry.
2734 ** 2735 ** notes: 2736 */ 2737 RmRCCommonJournal_RECORD * 2738 rcdbAddRecToRingBuffer_IMPL 2739 ( 2740 OBJGPU *pGpu, 2741 Journal *pRcDB, 2742 RMCD_RECORD_TYPE type, 2743 NvU32 recordSize, 2744 NvU8 *pRecord 2745 ) 2746 { 2747 RmRCCommonJournal_RECORD *pCommon = NULL; 2748 2749 NV_ASSERT(recordSize == _rcdbGetOcaRecordSize(pRcDB, type)); 2750 2751 if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1) 2752 { 2753 pCommon = _rcdbAllocRecFromRingBuffer(pGpu, pRcDB, type); 2754 if (pCommon != NULL) 2755 { 2756 // copy the record to follow the common header. 2757 portMemCopy(&(pCommon[1]), recordSize, pRecord, recordSize); 2758 } 2759 } 2760 portAtomicDecrementS32(&concurrentRingBufferAccess); 2761 2762 return pCommon; 2763 } 2764 2765 static NvU32 _rcdbGetOcaRecordSize(Journal *pRcDB, RMCD_RECORD_TYPE type) 2766 { 2767 switch(type) 2768 { 2769 case RmRcDiagReport: 2770 return sizeof(RmRcDiag_RECORD); 2771 break; 2772 case RmNocatReport: 2773 return sizeof(RM_NOCAT_JOURNAL_ENTRY); 2774 break; 2775 default: 2776 return 0; 2777 } 2778 } 2779 2780 NvU32 rcdbGetOcaRecordSizeWithHeader_IMPL(Journal *pRcDB, RMCD_RECORD_TYPE type) 2781 { 2782 NvU32 recSz; 2783 2784 recSz = _rcdbGetOcaRecordSize(pRcDB, type); 2785 if (0 < recSz) 2786 { 2787 recSz += sizeof(RmRCCommonJournal_RECORD); 2788 } 2789 2790 // 2791 // On architecture like RISC-V, loads/stores need to be aligned to the 2792 // request size (1, 2, 4, 8-byte). Here, OCA record and header are stored 2793 // in a ring buffer, hence total recSz needs to be 8-byte aligned for both 2794 // producer (GSP RM) and consumer (CPU RM) of this data. 
2795 // 2796 return NV_ALIGN_UP(recSz, 8); 2797 } 2798 2799 NV_STATUS 2800 rcdbAddRmGpuDump 2801 ( 2802 OBJGPU *pGpu 2803 ) 2804 { 2805 NV_STATUS status = NV_OK; 2806 OBJSYS *pSys = SYS_GET_INSTANCE(); 2807 Journal *pRcDB = SYS_GET_RCDB(pSys); 2808 NvDebugDump *pNvd = GPU_GET_NVD(pGpu); 2809 NVD_STATE *pNvDumpState = &pRcDB->nvDumpState; 2810 SYS_ERROR_INFO *pSysErrorInfo = &pRcDB->ErrorInfo; 2811 RMPRBERRORELEMENT_V2 *pPrbErrorInfo = NULL; 2812 RMPRBERRORELEMENT_V2 *pErrorList = NULL; 2813 RMCD_ERROR_BLOCK *pNewErrorBlock = NULL; 2814 RMERRORHEADER *pErrorHeader = NULL; 2815 PRB_ENCODER prbEnc; 2816 NvU32 bufferUsed; 2817 NvU8 *pBuf = NULL; 2818 2819 // 2820 // The deferred dump codepath will block out other dumps until the DPC can 2821 // be executed. If this is the deferred callback attempting to do the dump, 2822 // carry on. 2823 // 2824 if (pNvDumpState->bDumpInProcess && 2825 !pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH)) 2826 { 2827 return NV_ERR_STATE_IN_USE; 2828 } 2829 2830 prbEnc.depth = 0; 2831 pNvDumpState->bDumpInProcess = NV_TRUE; 2832 pNvDumpState->nvDumpType = NVD_DUMP_TYPE_OCA; 2833 pNvDumpState->bRMLock = rmapiLockIsOwner(); 2834 2835 rcdbDumpInitGpuAccessibleFlag(pGpu, pRcDB); 2836 2837 // 2838 // General process: 2839 // 1. Start the protobuf encoder in ALLOCATE mode, and dump the data 2840 // 2. Allocate an error element to stick in the Journal list 2841 // 3. Add the protobuf dump to the error element 2842 // 4. Put the error element at the end of the error list on OBJRCDB 2843 // 2844 status = prbEncStartAlloc(&prbEnc, NVDEBUG_NVDUMP, NVDUMP_MAX_DUMP_SIZE, 2845 NULL); 2846 if (status != NV_OK) 2847 { 2848 // 2849 // If we couldn't allocate the memory, it may be because we're at a 2850 // raised IRQL. It's not a great idea to be gathering a bunch of state 2851 // from the interrupt context anyway, so queue a work item to come back 2852 // later and try again. 
2853 // 2854 NvU32 *pGpuInstance = NULL; 2855 2856 // 2857 // If that's what we've already done and we're still failing, bail out 2858 // to avoid an infinite fail/queue-work-item loop. 2859 // 2860 if (pRcDB->getProperty(pRcDB, PDB_PROP_RCDB_IN_DEFERRED_DUMP_CODEPATH)) 2861 { 2862 NV_PRINTF(LEVEL_ERROR, 2863 "deferred GPU dump encoder init failed (status = 0x%x)\n", 2864 status); 2865 goto done; 2866 } 2867 2868 NV_PRINTF(LEVEL_INFO, "deferring GPU dump for normal context\n"); 2869 2870 // 2871 // This will be freed by the OS work item layer. We pass the GPU 2872 // instance as the data separately because if the GPU has fallen off 2873 // the bus, the OS layer may refuse to execute work items attached to 2874 // it. Instead, use the system work item interface and handle the GPU 2875 // ourselves. 2876 // 2877 pGpuInstance = portMemAllocNonPaged(sizeof(NvU32)); 2878 if (pGpuInstance == NULL) 2879 { 2880 status = NV_ERR_NO_MEMORY; 2881 goto done; 2882 } 2883 2884 *pGpuInstance = gpuGetInstance(pGpu); 2885 status = osQueueSystemWorkItem(_rcdbAddRmGpuDumpCallback, 2886 pGpuInstance); 2887 if (status != NV_OK) 2888 { 2889 portMemFree(pGpuInstance); 2890 goto done; 2891 } 2892 2893 // 2894 // Since we've queued the work item, leave the dump state marked as in 2895 // use to prevent other interrupts and codepaths from attempting to 2896 // initiate the dump and/or queue a new work item. 2897 // 2898 return NV_WARN_MORE_PROCESSING_REQUIRED; 2899 } 2900 2901 status = nvdDumpAllEngines(pGpu, pNvd, &prbEnc, pNvDumpState); 2902 if (status != NV_OK) 2903 { 2904 // 2905 // If the dump failed somewhere, unwind the encoder and then drop 2906 // through to finish it out so we can get the pointer to the 2907 // allocated buffer to free. 
2908 // 2909 while (prbEnc.depth > 1) 2910 { 2911 prbEncNestedEnd(&prbEnc); 2912 } 2913 } 2914 2915 bufferUsed = prbEncFinish(&prbEnc, (void **)&pBuf); 2916 2917 if (status != NV_OK) 2918 { 2919 goto done; 2920 } 2921 2922 // Allocate and initialize the error element 2923 pPrbErrorInfo = portMemAllocNonPaged(sizeof(RMPRBERRORELEMENT_V2)); 2924 if (pPrbErrorInfo == NULL) 2925 { 2926 status = NV_ERR_NO_MEMORY; 2927 goto done; 2928 } 2929 2930 portMemSet(pPrbErrorInfo, 0, sizeof(RMPRBERRORELEMENT_V2)); 2931 pPrbErrorInfo->RmPrbErrorData.common.Header.cRecordGroup = RmGroup; 2932 pPrbErrorInfo->RmPrbErrorData.common.Header.cRecordType = RmPrbFullDump_V2; 2933 pPrbErrorInfo->RmPrbErrorData.common.Header.wRecordSize = sizeof(RMPRBERRORELEMENT_V2); 2934 rcdbSetCommonJournalRecord(pGpu, &(pPrbErrorInfo->RmPrbErrorData.common)); 2935 pErrorHeader = &pPrbErrorInfo->ErrorHeader; 2936 pErrorHeader->pErrorBlock = NULL; 2937 2938 // 2939 // Allocate and initialize the error "block" associated with this protobuf 2940 // dump 2941 // 2942 pNewErrorBlock = portMemAllocNonPaged(sizeof(RMCD_ERROR_BLOCK)); 2943 if (pNewErrorBlock == NULL) 2944 { 2945 status = NV_ERR_NO_MEMORY; 2946 goto done; 2947 } 2948 2949 portMemSet(pNewErrorBlock, 0, sizeof(RMCD_ERROR_BLOCK)); 2950 pNewErrorBlock->pBlock = pBuf; 2951 pNewErrorBlock->blockSize = bufferUsed; 2952 pNewErrorBlock->pNext = NULL; 2953 pErrorHeader->pErrorBlock = pNewErrorBlock; 2954 2955 // Add the error element to the Journal list 2956 if (pSysErrorInfo->pErrorList != NULL) 2957 { 2958 pErrorList = (RMPRBERRORELEMENT_V2*)pSysErrorInfo->pErrorList; 2959 while (pErrorList->ErrorHeader.pNextError != NULL) 2960 { 2961 pErrorList = (RMPRBERRORELEMENT_V2*)pErrorList->ErrorHeader.pNextError; 2962 } 2963 2964 pErrorList->ErrorHeader.pNextError = (RMFIFOERRORELEMENT_V3*)pPrbErrorInfo; 2965 } 2966 else 2967 { 2968 pSysErrorInfo->pErrorList = pPrbErrorInfo; 2969 } 2970 2971 pSysErrorInfo->ErrorCount++; 2972 2973 done: 2974 if (status != NV_OK) 
2975 { 2976 if (pBuf != NULL) 2977 { 2978 portMemFree(pPrbErrorInfo); 2979 portMemFree(pBuf); 2980 } 2981 } 2982 2983 pNvDumpState->bDumpInProcess = NV_FALSE; 2984 return status; 2985 } 2986 2987 #if (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS) 2988 #if !defined(DEBUG) && !defined(QA_BUILD) 2989 /* 2990 */ 2991 NvBool 2992 rcdProbeGpuPresent( 2993 OBJGPU *pGpu, 2994 NvU64 ip 2995 ) 2996 { 2997 NvU32 testValue; 2998 NvBool bFoundLostGpu = NV_FALSE; 2999 3000 // protect against recursion when probing the GPU. 3001 if (portAtomicIncrementS32(&probeGpuRecursion) == 1) 3002 { 3003 if (NULL != pGpu) 3004 { 3005 // is the GPU we are checking allready reported lost? 3006 if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH) && 3007 !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) 3008 { 3009 testValue = GPU_CHECK_REG_RD32(pGpu, NV_PMC_BOOT_0, (~(pGpu->chipId0))); 3010 if (testValue == GPU_REG_VALUE_INVALID) 3011 { 3012 // there shouldn't be a need to make a journal entry, 3013 // as that should have been done by GPU_CHECK_REG_RD32 3014 3015 // Add GPU lost detection to to NvLog. 3016 // But skip when nvLog asserts to avoid stack overflow. 
3017 #if defined(DEBUG) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)) 3018 if (portAtomicIncrementS32(&nvLogRecursion) == 1) 3019 #endif 3020 { 3021 NV_PRINTF(LEVEL_ERROR, 3022 "found GPU %d (0x%p) inaccessible After assert\n", 3023 pGpu->gpuInstance, pGpu); 3024 } 3025 #if defined(DEBUG) || defined(QA_BUILD) || ((defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS)) 3026 portAtomicDecrementS32(&nvLogRecursion); 3027 #endif 3028 bFoundLostGpu = NV_TRUE; 3029 } 3030 } 3031 } 3032 } 3033 portAtomicDecrementS32(&probeGpuRecursion); 3034 return bFoundLostGpu; 3035 } 3036 3037 NvBool 3038 rcdProbeAllGpusPresent( 3039 NvU64 ip 3040 ) 3041 { 3042 NvBool bFoundLostGpu = NV_FALSE; 3043 OBJGPU *pGpu; 3044 NvU32 gpuMask; 3045 NvU32 gpuIndex = 0; 3046 3047 gpumgrGetGpuAttachInfo(NULL, &gpuMask); 3048 pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex); 3049 while (pGpu) 3050 { 3051 bFoundLostGpu = bFoundLostGpu || rcdProbeGpuPresent(pGpu, ip); 3052 pGpu = gpumgrGetNextGpu(gpuMask, &gpuIndex); 3053 } 3054 return bFoundLostGpu; 3055 } 3056 #endif // !defined(DEBUG) && !defined(QA_BUILD) 3057 #endif // (defined(_WIN32) || defined(_WIN64) || defined(NV_UNIX)) && !defined(NV_MODS) 3058 3059 void 3060 rcdbAddCrashedFalcon 3061 ( 3062 Falcon *pFlcn 3063 ) 3064 { 3065 OBJSYS *pSys = SYS_GET_INSTANCE(); 3066 Journal *pRcDB = SYS_GET_RCDB(pSys); 3067 3068 pRcDB->pCrashedFlcn = pFlcn; 3069 } 3070 3071 3072 /* 3073 ** _rcdbNocatCollectContext records the context of the GPU at the time the error is reported. 3074 ** 3075 ** parameters: 3076 ** pGpu pointer to GPU to be reported on. 3077 ** pContext pointer to context structure to be filled in. 
**
** returns:
**   NV_ERR_INVALID_ARGUMENT -- pRcdb is NULL (a NULL pContext is tolerated;
**                              only the context cache is refreshed then)
*/
NV_STATUS
_rcdbNocatCollectContext(OBJGPU *pGpu, Journal* pRcdb, NV2080_NOCAT_JOURNAL_GPU_STATE* pContext)
{
    NV2080_NOCAT_JOURNAL_GPU_STATE* pContextCache = NULL;
    const char *pTag;

    if (pRcdb == NULL)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    // determine which tag to use: the configured journal tag, else "prod".
    if (pRcdb->nocatJournalDescriptor.tag[0] != '\0')
    {
        pTag = (char *)pRcdb->nocatJournalDescriptor.tag;
    }
    else
    {
        pTag = NOCAT_DEFAULT_TAG_VALUE_STR;
    }
    if (pGpu == NULL)
    {
        // w/o a GPU the only thing we can do is set the tag.
        if (pContext != NULL)
        {
            portMemSet(pContext, 0, sizeof(*pContext));

            portStringCopy((char *)pContext->tag,
                NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
                pTag,
                portStringLength(pTag) + 1);
        }
        return NV_OK;
    }
#if NOCAT_COLLECT_PERF
    // NOTE(review): pGpuCache has no visible declaration here; presumably it is
    // only declared when NOCAT_COLLECT_PERF is enabled -- TODO confirm.
    pGpuCache = &(pGpu->nocatGpuCache);
#endif
    pContextCache = &(pRcdb->nocatJournalDescriptor.nocatGpuState);

    // insert tag if we have one.
    portStringCopy((char *)pContextCache->tag,
        NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
        pTag,
        portStringLength(pTag) + 1);

    // Populate the cached, mostly-static GPU identity once.
    if (!pContextCache->bValid)
    {
        pContextCache->deviceId = (NvU16)(DRF_VAL(_PCI, _DEVID, _DEVICE, pGpu->idInfo.PCIDeviceID));
        // NOTE(review): vendorId is extracted with the _SUBID_VENDOR field from
        // PCIDeviceID -- looks inconsistent with deviceId above; confirm intent.
        pContextCache->vendorId = (NvU16)(DRF_VAL(_PCI, _SUBID, _VENDOR, pGpu->idInfo.PCIDeviceID));
        pContextCache->subsystemVendor = (NvU16)(DRF_VAL(_PCI, _SUBID, _VENDOR, pGpu->idInfo.PCISubDeviceID));
        pContextCache->subsystemId = (NvU16)(DRF_VAL(_PCI, _SUBID, _DEVICE, pGpu->idInfo.PCISubDeviceID));
        pContextCache->revision = pGpu->idInfo.PCIRevisionID;
        pContextCache->type = pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_MOBILE);
        pContextCache->bMsHybrid = FLD_TEST_DRF(_JT_FUNC, _CAPS, _MSHYB_ENABLED, _TRUE,
            pGpu->acpiMethodData.jtMethodData.jtCaps);

        // VBIOS project string is not collected here; record it as unknown.
        portStringCopy((char *)pContextCache->vbiosProject, NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
            NOCAT_UNKNOWN_STR, portStringLength(NOCAT_UNKNOWN_STR) + 1);

        // ACPI-derived data is only read at low IRQL; the cache stays invalid
        // (and will be retried) if we are at raised IRQL now.
        if (!osIsRaisedIRQL())
        {
            NV_STATUS status = pGpu->acpiMethodData.capsMethodData.status;
            if (status == NV_OK)
            {
                pContextCache->bOptimus =
                    FLD_TEST_DRF(OP_FUNC, _OPTIMUSCAPS, _OPTIMUS_CAPABILITIES,
                        _DYNAMIC_POWER_CONTROL, pGpu->acpiMethodData.capsMethodData.optimusCaps);
            }

            pContextCache->bValid = NV_TRUE;
        }
    }
    if (pContext != NULL)
    {
        portMemSet(pContext, 0, sizeof(*pContext));

        // Start from the cached identity, then add the volatile power/reset state.
        *pContext = *pContextCache;

        pContext->bFullPower = gpuIsGpuFullPower(pGpu);
        pContext->bInGc6Reset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_GC6_RESET);
        pContext->bInFullchipReset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_FULLCHIP_RESET);
        pContext->bInSecBusReset = pGpu->getProperty(pGpu, PDB_PROP_GPU_IN_SECONDARY_BUS_RESET);
    }
    return NV_OK;
}

/*
** _rcdbSetTdrReason translates the reason code to a string & puts that string
** in the provided
buffer. 3171 ** 3172 ** parameters: 3173 ** tdrReason the reason code for the TDR 3174 ** pTdrReasonStr pointer to the place to copy the reason string to 3175 ** maxLen the size of the buffer pointed to in pTdrReasonStr. 3176 ** 3177 */ 3178 void _rcdbSetTdrReason 3179 ( 3180 Journal *pRcdb, 3181 NvU32 tdrReason, 3182 char *pTdrReasonStr, 3183 NvU32 maxLen 3184 ) 3185 { 3186 const char *pTmpStr; 3187 3188 // validate inputs. 3189 if (pRcdb == NULL) 3190 { 3191 return; 3192 } 3193 3194 // is there a string buffer & is it large enough to hold more than a NULL string 3195 if ((pTdrReasonStr == NULL) || (maxLen < 2)) 3196 { 3197 pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++; 3198 return; 3199 } 3200 switch (tdrReason) 3201 { 3202 case NV2080_CTRL_NOCAT_TDR_TYPE_NONE: 3203 pTmpStr = NOCAT_NA_STR; 3204 break; 3205 case NV2080_CTRL_NOCAT_TDR_TYPE_LEGACY: 3206 pTmpStr = NOCAT_LEGACY_STR; 3207 break; 3208 case NV2080_CTRL_NOCAT_TDR_TYPE_FULLCHIP: 3209 pTmpStr = NOCAT_FULLCHIP_TDR_STR; 3210 break; 3211 case NV2080_CTRL_NOCAT_TDR_TYPE_BUSRESET: 3212 pTmpStr = NOCAT_BUS_RESET_TDR_STR; 3213 break; 3214 case NV2080_CTRL_NOCAT_TDR_TYPE_GC6_RESET: 3215 pTmpStr = NOCAT_GC6_RESET_TDR_STR; 3216 break; 3217 case NV2080_CTRL_NOCAT_TDR_TYPE_SURPRISE_REMOVAL: 3218 pTmpStr = NOCAT_SURPRISE_REMOVAL_TDR_STR; 3219 break; 3220 case NV2080_CTRL_NOCAT_TDR_TYPE_UCODE_RESET: 3221 pTmpStr = NOCAT_UCODE_RESET_TDR_STR; 3222 break; 3223 default: 3224 pTmpStr = NOCAT_UNKNOWN_STR; 3225 break; 3226 } 3227 portStringCopy(pTdrReasonStr, maxLen, 3228 pTmpStr, portStringLength(pTmpStr) + 1); 3229 } 3230 3231 /* 3232 ** _rcdbAllocNocatJournalRecord allocates a buffer entry from the Journal ring buffer 3233 ** for the specified type 3234 ** 3235 ** parameters: 3236 ** pGpu a pointer to the GPU object associated with the entry. 3237 ** pRcdb a pointer toe the Journal that contains the ring buffers 3238 ** type the record type to locate a buffer for. 
**
** returns a pointer to a record in the ring buffer, or NULL if a record could not be allocated.
**
** notes:
**      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
**      the lock should be held until access the buffer is completed.
**      failure to do so can result in concurrency issues.
**
**      if successful, the buffer that is returned is cleared & an id assigned,
**      and its inUse count is incremented; release it with
**      _rcdbReleaseNocatJournalRecord().
*/
RM_NOCAT_JOURNAL_ENTRY* _rcdbAllocNocatJournalRecord
(
    OBJGPU *pGpu,
    OBJRCDB *pRcdb,
    RmRCCommonJournal_RECORD **ppCommon
)
{
    nocatQueueDescriptor *pDesc = NULL;
    RmRCCommonJournal_RECORD* pCommon;
    RM_NOCAT_JOURNAL_ENTRY *pNocatEntry = NULL;

    // make sure someone has the lock.
    // (a zero count means the caller did not take concurrentRingBufferAccess.)
    if (concurrentRingBufferAccess == 0)
    {
        return NULL;
    }

    pDesc = &pRcdb->nocatJournalDescriptor;

    // Get the next record from the appropriate nocat ring buffer.
    pCommon = _rcdbAllocRecFromRingBuffer(
        pGpu,
        pRcdb,
        RmNocatReport);

    if (pCommon != NULL)
    {
        // advance the pointer past the common header.
        pNocatEntry = (RM_NOCAT_JOURNAL_ENTRY*)(((NvU8*)pCommon) + sizeof(RmRCCommonJournal_RECORD));

        // clear the record & assign an id.
        portMemSet(pNocatEntry, 0, sizeof(*pNocatEntry));
        pNocatEntry->id = pDesc->nextRecordId++;
        pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_ALLOCATED_IDX]++;

        // mark the record busy until the caller releases it.
        portAtomicIncrementS32(&pNocatEntry->inUse);
    }
    else
    {
        pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_ALLOC_FAILED_IDX]++;
    }
    if (ppCommon != NULL)
    {
        *ppCommon = pCommon;
    }
    return pNocatEntry;
}

/*
** _rcdbGetNocatJournalRecord returns a pointer to the requested record,
** or optionally the oldest record if the requested one is not available.
**
** parameters:
**      pRcdb           a pointer toe the Journal that contains the ring buffers
**      reqId           id of the record we are looking for
**      bExactMatch     indicates if we want an exact match, or the closest record.
**      ppReturnedCommon
**                      a pointer to a pointer that will hold the pointer to
**                      the common part of the record.
**                      this can be NULL
**      ppReturnedNocatEntry
**                      a pointer to a pointer that will hold the pointer to
**                      the nocat part of the record
**                      this can be NULL
**
** notes:
**      it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock.
**      the lock should be held until access the buffer is completed.
**      failure to do so can result in concurrency issues.
*/
NV_STATUS
_rcdbGetNocatJournalRecord
(
    OBJRCDB *pRcdb,
    NvU32 reqId,
    NvBool bExactMatch,
    RmRCCommonJournal_RECORD
            **ppReturnedCommon,
    RM_NOCAT_JOURNAL_ENTRY
            **ppReturnedNocatEntry
)
{
    nocatQueueDescriptor *pDesc;
    RmRCCommonJournal_RECORD *pCommon = NULL;
    RM_NOCAT_JOURNAL_ENTRY *pNocatEntry = NULL;
    RING_BUFFER_LOG *pRingBuffer = NULL;
    NvS32 offset;
    NvS32 idx;

    // make sure someone has the lock.
    if (concurrentRingBufferAccess == 0)
    {
        return NV_ERR_BUSY_RETRY;
    }

    // is there anything to do
    if ((ppReturnedCommon == NULL) && (ppReturnedNocatEntry == NULL))
    {
        return NV_OK;
    }

    // validate inputs.
    if (pRcdb == NULL)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }
    pDesc = &pRcdb->nocatJournalDescriptor;

    // assume we will fail
    if (ppReturnedCommon != NULL)
    {
        *ppReturnedCommon = NULL;
    }
    if (ppReturnedNocatEntry != NULL)
    {
        *ppReturnedNocatEntry = NULL;
    }

    // if there is nothing in the buffer,
    // we can't return a record.
    if ((pDesc->nextRecordId - pDesc->nextReportedId) == 0)
    {
        pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NO_RECORDS_IDX]++;
        return NV_ERR_OBJECT_NOT_FOUND;
    }

    // Find the ring buffer for the diag reports
    rcdbFindRingBufferForType(pRcdb, RmNocatReport, &pRingBuffer);
    if (pRingBuffer == NULL)
    {
        pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_BUFFER_IDX]++;
        return NV_ERR_OBJECT_NOT_FOUND;
    }
    // determine how far back from the head our record should be.
    offset = pDesc->nextRecordId - reqId;

    // start of from the next record we will replace.
    // this will be the oldest buffer in the record,
    // or the next empty record, either way, we will wrap to the right one
    idx = pRingBuffer->headIndex;

    // is the requested record in the buffer?
    if ((0 <= offset) && ((NvU16)offset <= pRingBuffer->numEntries))
    {
        // back out the offset from the newest/empty record.
        idx += pRingBuffer->numEntries - offset;
        pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_MATCH_FOUND_IDX]++;
    }
    else if (bExactMatch)
    {
        // the record is not in the buffer, & we weren't asked for the closest match.
        pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NO_MATCH_IDX]++;
        return NV_ERR_OBJECT_NOT_FOUND;
    }
    else
    {
        // closest match requested: fall through with idx == headIndex,
        // i.e. the oldest record still in the buffer.
        pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_CLOSEST_FOUND_IDX]++;
    }
    // wrap the idx to the current size of the buffer.
    idx %= pRingBuffer->numEntries;

    // get a pointer to the common record & the record from the buffer.
    pCommon = (RmRCCommonJournal_RECORD*)(((NvU8*)pRingBuffer->pBuffer) + (rcdbGetOcaRecordSizeWithHeader(pRcdb, RmNocatReport) * idx));

    // get a pointer to the data that follows the common header, that is the record data.
    pNocatEntry = (RM_NOCAT_JOURNAL_ENTRY*)(((NvU8*)pCommon) + sizeof(RmRCCommonJournal_RECORD));

    // mark the record busy; the caller must release it with
    // _rcdbReleaseNocatJournalRecord().
    portAtomicIncrementS32(&pNocatEntry->inUse);

    // pass the record along
    if (ppReturnedCommon != NULL)
    {
        *ppReturnedCommon = pCommon;
    }
    if (ppReturnedNocatEntry != NULL)
    {
        *ppReturnedNocatEntry = pNocatEntry;
    }
    return NV_OK;
}
/*
** _rcdbReleaseNocatJournalRecord releases a record previously marked in-use by
** _rcdbAllocNocatJournalRecord or _rcdbGetNocatJournalRecord, by decrementing
** its inUse count.
**
** parameters:
**      pNocatEntry     a pointer to the nocat record to release.
**
** returns:
**      NV_OK -- the inUse count dropped to zero.
**      NV_ERR_BUSY_RETRY -- the record is still held elsewhere.
**      NV_ERR_INVALID_ARGUMENT -- pNocatEntry is NULL.
3446 */ 3447 NV_STATUS 3448 _rcdbReleaseNocatJournalRecord 3449 ( 3450 RM_NOCAT_JOURNAL_ENTRY *pNocatEntry 3451 ) 3452 { 3453 if (pNocatEntry == NULL) 3454 { 3455 return NV_ERR_INVALID_ARGUMENT; 3456 } 3457 if (portAtomicDecrementS32(&pNocatEntry->inUse) != 0) 3458 { 3459 return NV_ERR_BUSY_RETRY; 3460 } 3461 return NV_OK; 3462 } 3463 3464 /* 3465 ** _rcdbGetNewestNocatJournalRecordForType returns a pointer to the newest record for the 3466 ** specified type if there is one. 3467 ** 3468 ** parameters: 3469 ** pRcdb a pointer toe the Journal that contains the ring buffers 3470 ** type type of record we want. 3471 ** ppCommon a pointer to a pointer that will hold the pointer to 3472 ** the common part of the record. 3473 ** this can be NULL 3474 ** ppCommon a pointer to a pointer that will hold the pointer to 3475 ** the nocat part of the record 3476 ** this can be NULL 3477 ** 3478 ** notes: 3479 ** it is assumed the caller has successfully acquired the concurrentRingBufferAccess lock. 3480 ** the lock should be held until access the buffer is completed. 3481 ** failure to do so can result in concurrency issues. 3482 */ 3483 NV_STATUS 3484 _rcdbGetNewestNocatJournalRecordForType 3485 ( 3486 OBJRCDB *pRcdb, 3487 NvU32 type, 3488 RmRCCommonJournal_RECORD 3489 **ppReturnedCommon, 3490 RM_NOCAT_JOURNAL_ENTRY 3491 **ppReturnedNocatEntry 3492 ) 3493 { 3494 if (type >= NV2080_NOCAT_JOURNAL_REC_TYPE_COUNT) 3495 { 3496 // we failed 3497 if (ppReturnedCommon != NULL) 3498 { 3499 *ppReturnedCommon = NULL; 3500 } 3501 if (ppReturnedNocatEntry != NULL) 3502 { 3503 *ppReturnedNocatEntry = NULL; 3504 } 3505 return NV_ERR_OBJECT_NOT_FOUND; 3506 } 3507 return _rcdbGetNocatJournalRecord(pRcdb, pRcdb->nocatJournalDescriptor.lastRecordId[type], NV_TRUE, 3508 ppReturnedCommon, ppReturnedNocatEntry); 3509 } 3510 3511 /* 3512 ** rcdbReportNextNocatJournalEntry fills in the provided Nocat Journal record with the next record 3513 ** to be reported, then updates the last reported id. 
**
** parameters:
**      pReturnedNocatEntry     a pointer to the buffer where the journal record will be transferred to
**
** returns:
**      NV_OK -- the record was successfully updated with the next record to report.
**      NV_ERR_INVALID_ARGUMENT -- the provided pointer is NULL
**      NV_ERR_OBJECT_NOT_FOUND -- we could not locate a record to report.
**      NV_ERR_BUSY_RETRY -- the ring buffer lock was contended.
**
** notes:
**      we are transferring the record to the target location here instead of passing a pointer
**      to insure the data is transferred while we hold the concurrentRingBufferAccess lock.
**      failure to do so can result in concurrency issues.
**
**      priority is determined by the record journal queue values. the lower value has
**      higher priority.
**
**      now that we have moved from a single entry, to a queue, we need to
**      consume the entry once we report it
**
*/
NV_STATUS
rcdbReportNextNocatJournalEntry
(
    NV2080_NOCAT_JOURNAL_RECORD
            *pReturnedNocatEntry
)
{
    OBJSYS *pSys = SYS_GET_INSTANCE();
    Journal *pRcdb = SYS_GET_RCDB(pSys);
    NV_STATUS status = NV_ERR_OBJECT_NOT_FOUND;
    nocatQueueDescriptor *pDesc;
    RmRCCommonJournal_RECORD *pCommon = NULL;
    RM_NOCAT_JOURNAL_ENTRY *pNocatEntry = NULL;

    // validate inputs.
    if (pRcdb == NULL)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }
    pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_REQUESTED_IDX]++;

    if (pReturnedNocatEntry == NULL)
    {
        pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++;
        return NV_ERR_INVALID_ARGUMENT;
    }
    portMemSet(pReturnedNocatEntry, 0, sizeof(*pReturnedNocatEntry));

    // take the ring buffer lock; a resulting count of 1 means we are the sole holder.
    if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
    {
        pDesc = &pRcdb->nocatJournalDescriptor;

        // fetch the next unreported record, or the closest (oldest) one if it
        // has already been overwritten (bExactMatch == NV_FALSE).
        _rcdbGetNocatJournalRecord(pRcdb,
            pDesc->nextReportedId, NV_FALSE,
            &pCommon, &pNocatEntry);
        if ((pCommon != NULL) && (pNocatEntry != NULL))
        {
            // we have a record, push it into the return buffer
            pReturnedNocatEntry->GPUTag = pCommon->GPUTag;

            // copy over the data into the supplied buffer.
            pReturnedNocatEntry->loadAddress = pDesc->loadAddress;
            pReturnedNocatEntry->timeStamp = pCommon->timeStamp;
            pReturnedNocatEntry->stateMask = pCommon->stateMask;
            pReturnedNocatEntry->nocatGpuState = pNocatEntry->nocatGpuState;
            pReturnedNocatEntry->nocatJournalEntry = pNocatEntry->nocatJournalEntry;

            // check if we lost any records.
            // (any gap between the returned record's id & the id we expected
            // next is records that wrapped away before being reported.)
            pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_DROPPED_IDX] +=
                pNocatEntry->id - pDesc->nextReportedId;

            // update the NocatJournalNextReportedId
            pDesc->nextReportedId = pNocatEntry->id + 1;
            pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_REPORTED_IDX]++;

            _rcdbReleaseNocatJournalRecord(pNocatEntry);
            status = NV_OK;

        }
    }
    else
    {
        pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
        status = NV_ERR_BUSY_RETRY;
    }
    portAtomicDecrementS32(&concurrentRingBufferAccess);

    // if the journal had been locked (TDR/bugcheck) & everything outstanding
    // has now been reported, release the lock.
    if ((pRcdb->nocatJournalDescriptor.lockTimestamp != 0) && (rcdbGetNocatOutstandingCount(pRcdb) == 0))
    {
        pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_UNLOCKED_IDX]++;
        pRcdb->nocatJournalDescriptor.lockTimestamp = 0;
    }
    return status;
}

/*
** rcdbGetNocatOutstandingCount returns the number of NOCAT events that have
** been recorded since the last reported record.
**
** parameters:
**      pRcdb -- a pointer to the Journal object.
**
** returns:
**      number of NOCAT events that have been recorded since the last reported record.
**      or NV_U32_MAX if a NULL journal object pointer is provided.
**
** notes:
**      the returned count includes records that have been dropped due to wrapping.
**
*/
NvU32
rcdbGetNocatOutstandingCount(Journal *pRcdb)
{
    NvU32 count = NV_U32_MAX;
    if (pRcdb != NULL)
    {
        // sample both ids under the ring buffer lock so they are consistent.
        // NOTE(review): if the lock is contended, count remains NV_U32_MAX --
        // the same value returned for a NULL pRcdb; callers cannot tell these
        // two cases apart. confirm this is intended.
        if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
        {
            count = pRcdb->nocatJournalDescriptor.nextRecordId
                - pRcdb->nocatJournalDescriptor.nextReportedId;
        }
        portAtomicDecrementS32(&concurrentRingBufferAccess);
    }
    return count;
}

/*
** _rcdbSendNocatJournalNotification sends an ETW Notification that a NOCAT Journal record has been posted.
**
** parameters:
**      pGpu    -- a pointer to the GPU object associated with the new entry
**                 (may be NULL)
**      pRcdb   -- a pointer to the Journal object NOCAT is using.
**      posted  -- the number of records posted since the last record that was retrieved.
**      pCommon -- a pointer to the common record header associated with the record.
**      type    -- the record type
**
** returns:
**      NV_OK -- the call to post the record was made.
**              note that the call to post the record does not return a status,
**              so we do not know if the call was successful.
**      NV_ERR_INVALID_ARGUMENT -- one of the required pointers is NULL
**
*/
NV_STATUS
_rcdbSendNocatJournalNotification
(
    OBJGPU *pGpu,
    Journal *pRcdb,
    NvU32 posted,
    RmRCCommonJournal_RECORD *pCommon, // todo: pass in timestamp instead of common.
    NvU32 type
)
{
    if ((pCommon == NULL) || (pRcdb == NULL))
    {
        return NV_ERR_INVALID_ARGUMENT;
    }
    RMTRACE_NOCAT(_REPORT_PENDING, (pGpu ? pGpu->gpuId : RMTRACE_UNKNOWN_GPUID),
        RmNocatReport,
        posted,
        type,
        rcdbGetNocatOutstandingCount(pRcdb),
        pCommon->timeStamp);

    // count the number of notifications.
    pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_NOTIFICATIONS_IDX]++;
    return NV_OK;
}

/*
** rcdbInitNocatGpuCache_IMPL initializes a per GPU cache held in the GPU object to be used by NOCAT
**
** parameters:
**      pGpu    -- a pointer to the GPU Object the containing the cache
**
** notes:
**      this function:
**      * caches the driver load address
**      * allocates a small block of memory in the frame buffer for testing
**      * initializes the GPU context cache
**
*/
void rcdbInitNocatGpuCache_IMPL(OBJGPU *pGpu)
{
    OS_DRIVER_BLOCK driverBlock;
    OBJSYS *pSys = SYS_GET_INSTANCE();
    Journal *pRcdb = SYS_GET_RCDB(pSys);
#if NOCAT_PROBE_FB_MEMORY
    NvU8 *pCpuPtr;
    NV_STATUS status;
#endif

    if (pGpu == NULL)
    {
        return;
    }
    // cache the driver load address; it is later used to rebase code
    // addresses recorded in NOCAT reports.
    portMemSet(&driverBlock, 0x00, sizeof(driverBlock));
    if (osGetDriverBlock(pGpu->pOsGpuInfo, &driverBlock) == NV_OK)
    {
        pRcdb->nocatJournalDescriptor.loadAddress = (NvU64)driverBlock.driverStart;
    }

#if NOCAT_PROBE_FB_MEMORY
    // Allocate some memory for virtual BAR2 testing
    if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_ALL_INST_IN_SYSMEM) && !IsAMODEL(pGpu))
    {
        memdescCreateExisting(&pGpu->nocatGpuCache.fbTestMemDesc,
            pGpu, NOCAT_FBSIZETESTED, ADDR_FBMEM, NV_MEMORY_UNCACHED, MEMDESC_FLAGS_NONE);
        memdescTagAlloc(status, NV_FB_ALLOC_RM_INTERNAL_OWNER_UNNAMED_TAG_102,
                        (&pGpu->nocatGpuCache.fbTestMemDesc));
        if (status != NV_OK)
        {
            NV_PRINTF(LEVEL_ERROR, "Could not allocate vidmem for NOCAT bar2 testing\n");
            return;
        }
        pCpuPtr = kbusMapRmAperture_HAL(pGpu, &pGpu->nocatGpuCache.fbTestMemDesc);
        if (pCpuPtr == NULL)
        {
            // mapping failed -- release the vidmem & leave the FB probe disabled.
            memdescFree(&pGpu->nocatGpuCache.fbTestMemDesc);
            memdescDestroy(&pGpu->nocatGpuCache.fbTestMemDesc);
            pGpu->nocatGpuCache.pCpuPtr = NULL;
            return;
        }

        pGpu->nocatGpuCache.pCpuPtr = pCpuPtr;
    }
#endif
    // initialize the context cache
    if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
    {
        _rcdbNocatCollectContext(pGpu, pRcdb, NULL);
    }
    portAtomicDecrementS32(&concurrentRingBufferAccess);

    return;
}

/*
** rcdbCleanupNocatGpuCache_IMPL returns per GPU resources used by NOCAT.
**
** parameters:
**      pGpu    -- a pointer to the GPU Object the containing the cache
**
** notes:
**      This will free up the FB test window if allocated, and clear out the cache
**
*/
void rcdbCleanupNocatGpuCache_IMPL(OBJGPU *pGpu)
{
#if NOCAT_PROBE_FB_MEMORY
    if (pGpu == NULL)
    {
        return;
    }
    if (pGpu->nocatGpuCache.pCpuPtr != NULL)
    {
        kbusUnmapRmApertureWithFlags_HAL(pGpu, &pGpu->nocatGpuCache.fbTestMemDesc,
            &pGpu->nocatGpuCache.pCpuPtr, TRANSFER_FLAGS_NONE);
        memdescFree(&pGpu->nocatGpuCache.fbTestMemDesc);
        memdescDestroy(&pGpu->nocatGpuCache.fbTestMemDesc);
    }
    // clearing the cache also resets pCpuPtr & the cached context's bValid.
    portMemSet(&pGpu->nocatGpuCache, 0, sizeof(pGpu->nocatGpuCache));
#endif

    return;
}



/*
** rcdbNocatInsertNocatError records a reported NOCAT error
**
** parameters:
**      pGpu        Pointer to GPU associated with the error
**                  may be NULL if there is no GPU associated with the error
**                  if NULL the primary GPU is used
**      pNewEntry   A pointer to a structure that contains all the available data for the report
**
** returns:
**      NOTE(review): `id` is initialized to INVALID_RCDB_NOCAT_ID & never
**      reassigned, so every path reaching the end returns that value rather
**      than the allocated record's id -- confirm whether callers rely on this.
*/
NvU32
rcdbNocatInsertNocatError(
    OBJGPU *pGpu,
    NOCAT_JOURNAL_PARAMS *pNewEntry
)
{
    OBJSYS *pSys = SYS_GET_INSTANCE();
    Journal *pRcdb = SYS_GET_RCDB(pSys);
#if(NOCAT_PROBE_FB_MEMORY)
    NvBool bCheckFBState = NV_FALSE;
#endif
    RmRCCommonJournal_RECORD *pCommon = NULL;
    RM_NOCAT_JOURNAL_ENTRY *pNocatEntry = NULL;
    NvU32 id = INVALID_RCDB_NOCAT_ID;
    const char *pSource = NULL;
    NvU32 diagBufferLen = 0;
    const char *pFaultingEngine = NULL;
    NvBool postRecord;

    // validate inputs.
    if (pRcdb == NULL)
    {
        // NOTE(review): this returns a status code from a function whose NvU32
        // return is otherwise a record id (other failure paths return 0) --
        // confirm whether 0 was intended here.
        return NV_ERR_INVALID_ARGUMENT;
    }
    pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_REQ_IDX]++;
    if (pNewEntry == NULL)
    {
        pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BAD_PARAM_IDX]++;
        return 0;
    }
    // assign a timestamp if none was provided
    if (pNewEntry->timestamp == 0)
    {
        pNewEntry->timestamp = osGetTimestamp();
    }

    // initially set postRecord based on the current state of the lock;
    postRecord = pRcdb->nocatJournalDescriptor.lockTimestamp == 0;

    // perform any record type specific setup
    switch (pNewEntry->recType)
    {
    case NV2080_NOCAT_JOURNAL_REC_TYPE_BUGCHECK:
#if(NOCAT_PROBE_FB_MEMORY)
        bCheckFBState = NV_TRUE;
#endif
        // fall thru

    case NV2080_NOCAT_JOURNAL_REC_TYPE_TDR:
        // lock the journal so we don't wrap over the record we are inserting.
        if (pRcdb->nocatJournalDescriptor.lockTimestamp == 0)
        {
            pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_LOCKED_IDX]++;
        }
        else
        {
            pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_JOURNAL_LOCK_UPDATED_IDX]++;
        }

        pRcdb->nocatJournalDescriptor.lockTimestamp = pNewEntry->timestamp;
        postRecord = NV_TRUE;
        break;

    case NV2080_NOCAT_JOURNAL_REC_TYPE_RC:
#if(NOCAT_PROBE_FB_MEMORY)
        bCheckFBState = NV_TRUE;
#endif
        // set the source
        pSource = "RC Error";
        break;

    case NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT:
        // set the source
        pSource = "ASSERT";
        break;

    case NV2080_NOCAT_JOURNAL_REC_TYPE_ENGINE:
        break;

    case NV2080_NOCAT_JOURNAL_REC_TYPE_UNKNOWN:
    default:
        return 0;
        break;
    }
    // check if we should post the record when locked.
    if (!postRecord)
    {
        // signed wrap-safe comparison of the two timestamps.
        if ((NvS64)(pNewEntry->timestamp - pRcdb->nocatJournalDescriptor.lockTimestamp) < 0)
        {
            // the record predates the lock, so it's Grandfathered in.
            postRecord = NV_TRUE;
            pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_GRANDFATHERED_RECORD_IDX]++;
        }
        else
        {
            // we are dropping the record, count that.
            pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_LOCKED_OUT_IDX]++;
        }
    }
    if (postRecord)
    {
        // is the buffer available?
        if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
        {
            // start recording this new record by allocating a record from the buffer.
            pNocatEntry = _rcdbAllocNocatJournalRecord(pGpu, pRcdb, &pCommon);
            if (pNocatEntry != NULL)
            {
                pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECTED_IDX]++;

                // update the time stamp to the one supplied.
                pCommon->timeStamp = pNewEntry->timestamp;

                // save the record Id for the type.
                pRcdb->nocatJournalDescriptor.lastRecordId[pNewEntry->recType] =
                    pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ANY] =
                    pRcdb->nocatJournalDescriptor.nextRecordId - 1;

                // set the type.
                pNocatEntry->nocatJournalEntry.recType = pNewEntry->recType;

                // set bugcheck
                pNocatEntry->nocatJournalEntry.bugcheck = pNewEntry->bugcheck;

                // get context
                _rcdbNocatCollectContext(pGpu, pRcdb, &(pNocatEntry->nocatGpuState));

#if(NOCAT_PROBE_FB_MEMORY)
                if ((bCheckFBState)
                    && (pGpu != NULL)
                    && (pGpu->nocatGpuCache.pCpuPtr != NULL)
                    // If using Coherent CPU mapping instead of BAR2 do not call VerifyBar2
                    && !pGpu->getProperty(pGpu, PDB_PROP_GPU_COHERENT_CPU_MAPPING))
                {
                    switch (kbusVerifyBar2_HAL(pGpu, GPU_GET_KERNEL_BUS(pGpu),
                        &pGpu->nocatGpuCache.fbTestMemDesc, pGpu->nocatGpuCache.pCpuPtr, 0, NOCAT_FBSIZETESTED))
                    {
                    case NV_OK:             // everything passed
                        break;

                    case NV_ERR_MEMORY_ERROR:   // BAR 0 failed & BAR 2 was not checked, or BAR 2 failed
                        // for now we don't know which BAR failed, so mark both.
                        // but only one BAR failed.
                        // (if BAR 0 Failed, BAR 2 was not checked)
                        pCommon->stateMask |=
                            NV_RM_JOURNAL_STATE_MASK_VIDMEM_FAILED_BAR0
                            | NV_RM_JOURNAL_STATE_MASK_VIDMEM_FAILED_BAR2;
                        break;

                    default:                // some other processing error cause us to not test the BAR
                        break;
                    }
                }
#endif
                // is there a valid string for source?
                // (non NULL ptr & more than just a termination)
                if ((pNewEntry->pSource != NULL) && (pNewEntry->pSource[0] != '\0'))
                {
                    // yes, use that.
                    pSource = pNewEntry->pSource;
                }
                // the caller did not supply a source,
                // did we set a default source based on record type?
                else if (pSource == NULL)
                {
                    // no, supply the unknown string for source.
                    pSource = NOCAT_UNKNOWN_STR;
                }
                portStringCopy((char*)pNocatEntry->nocatJournalEntry.source,
                    NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
                    pSource,
                    portStringLength(pSource) + 1);

                pNocatEntry->nocatJournalEntry.subsystem = pNewEntry->subsystem;
                pNocatEntry->nocatJournalEntry.errorCode = pNewEntry->errorCode;

                if ((pNewEntry->pDiagBuffer != NULL) && (pNewEntry->diagBufferLen != 0))
                {
                    // checking length here as we don't want portMemCopy to assert
                    if (pNewEntry->diagBufferLen < NV_ARRAY_ELEMENTS(pNocatEntry->nocatJournalEntry.diagBuffer))
                    {
                        diagBufferLen = pNewEntry->diagBufferLen;
                    }
                    else
                    {
                        // make best effort
                        diagBufferLen = NV_ARRAY_ELEMENTS(pNocatEntry->nocatJournalEntry.diagBuffer);
                    }
                    portMemCopy(pNocatEntry->nocatJournalEntry.diagBuffer,
                        sizeof(pNocatEntry->nocatJournalEntry.diagBuffer),
                        pNewEntry->pDiagBuffer, diagBufferLen);
                }
                pNocatEntry->nocatJournalEntry.diagBufferLen = diagBufferLen;

                pFaultingEngine = pNewEntry->pFaultingEngine != NULL ?
                    pNewEntry->pFaultingEngine : NOCAT_UNKNOWN_STR;

                portStringCopy((char*)pNocatEntry->nocatJournalEntry.faultingEngine,
                    NV2080_NOCAT_JOURNAL_MAX_STR_LEN,
                    pFaultingEngine, portStringLength(pFaultingEngine) + 1);

                // translate the reason code into its display string.
                _rcdbSetTdrReason(pRcdb, pNewEntry->tdrReason,
                    (char*)pNocatEntry->nocatJournalEntry.tdrReason,
                    sizeof(pNocatEntry->nocatJournalEntry.tdrReason));

                _rcdbReleaseNocatJournalRecord(pNocatEntry);
            }
            else
            {
                // record was not allocated, bail.
                postRecord = NV_FALSE;
                pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_COLLECT_FAILED_IDX]++;
            }
        }
        else
        {
            // we are busy, so we can't insert the record, count the record as dropped & count the busy.
            postRecord = NV_FALSE;
            pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
        }
        portAtomicDecrementS32(&concurrentRingBufferAccess);
    }

    // no matter what happened, trigger the event to indicate a record was processed.
    _rcdbSendNocatJournalNotification(pGpu, pRcdb, postRecord, pCommon, pNewEntry->recType);

    return id;
}
/*
** rcdbNocatInsertBugcheck is the interface to record a bugcheck NOCAT report
**
** parameters:
**      deviceInstance  The instance of the GPU associated with the bugcheck.
**      bugCheckCode    The bugcheck number
*/
NvU32
rcdbNocatInsertBugcheck
(
    NvU32 deviceInstance,
    NvU32 bugCheckCode)
{
    NOCAT_JOURNAL_PARAMS newEntry;

    portMemSet(&newEntry, 0, sizeof(newEntry));
    newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_BUGCHECK;
    newEntry.bugcheck = bugCheckCode;
    newEntry.pSource = "OS";
    // the bugcheck code doubles as the error code.
    newEntry.errorCode = bugCheckCode;
    return rcdbNocatInsertNocatError(gpumgrGetGpu(deviceInstance), &newEntry);
}

/*
** rcdbNocatInitEngineErrorEvent initializes a parameter structure for an engine error event
**
** parameters:
**      pNewEntry   Pointer to event parameter structure to be initialized
**
** returns:
**      NV_OK -- the structure was initialized
**      NV_ERR_INVALID_ARGUMENT -- pNewEntry is NULL
*/
NV_STATUS
rcdbNocatInitEngineErrorEvent
(
    NOCAT_JOURNAL_PARAMS *pNewEntry
)
{
    if (pNewEntry == NULL)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }
    portMemSet(pNewEntry, 0, sizeof(*pNewEntry));
    pNewEntry->recType = NV2080_NOCAT_JOURNAL_REC_TYPE_ENGINE;
    return NV_OK;
}

/*
** rcdbNocatInsertEngineError records a reported NOCAT error from an engine,
**
** parameters:
**      pGpu        Pointer to GPU associated with the error
**                  may be NULL if there is no GPU associated with the error
**                  if NULL the primary GPU is used
**      pSource     A string indicating the reporting
source of the error. 4070 ** if NULL, a default values will be used 4071 ** subsystem The optional subsystem ID used by the source to identify the error 4072 ** errorCode The error code 4073 ** pDiagBuffer A pointer to the diagnostic buffer associated with the error 4074 ** may be NULL 4075 ** diagBufferLen The size of the diagnostic buffer 4076 ** if the size exceeds the supported diagBuffer size, the buffer contents will be truncated to fit. 4077 */ 4078 NvU32 4079 rcdbNocatInsertEngineError( 4080 OBJGPU *pGpu, 4081 const char *pSource, 4082 NvU32 subsystem, 4083 NvU64 errorCode, 4084 NvU8 *pDiagBuffer, 4085 NvU32 diagBufferLen 4086 ) 4087 { 4088 NOCAT_JOURNAL_PARAMS newEntry; 4089 4090 rcdbNocatInitEngineErrorEvent(&newEntry); 4091 newEntry.pSource = pSource; 4092 newEntry.subsystem = subsystem; 4093 newEntry.errorCode = errorCode; 4094 newEntry.pDiagBuffer = pDiagBuffer; 4095 newEntry.diagBufferLen = diagBufferLen; 4096 return rcdbNocatInsertNocatError(pGpu, &newEntry); 4097 } 4098 4099 /* 4100 ** rcdbNocatInsertTDRError records an TDR error, 4101 ** 4102 ** parameters: 4103 ** pGpu Pointer to GPU associated with the error 4104 ** may be NULL if there is no GPU associated with the error 4105 ** if NULL the primary GPU is used 4106 ** pSource A string indicating the reporting source of the error. 4107 ** if NULL, a default values will be used 4108 ** subsystem The optional subsystem ID used by the source to identify the error 4109 ** errorCode The error code 4110 ** TDRBucket The TDR bucket 4111 ** pDiagBuffer A pointer to the diagnostic buffer associated with the error 4112 ** may be NULL 4113 ** diagBufferLen The size of the diagnostic buffer 4114 ** if the size exceeds the supported diagBuffer size, 4115 ** the buffer contents will be truncated to fit. 
4116 ** tdrReason A reason code for the TDR 4117 ** pFaultingApp A pointer to the faulting app name if known 4118 */ 4119 NvU32 4120 rcdbNocatInsertTDRError 4121 ( 4122 OBJGPU *pGpu, 4123 const char *pSource, 4124 NvU32 subsystem, 4125 NvU64 errorCode, 4126 NvU32 TdrBucket, 4127 NvU8 *pDiagBuffer, 4128 NvU32 diagBufferLen, 4129 NvU32 tdrReason, 4130 const char *pFaultingEngine 4131 ) 4132 { 4133 NOCAT_JOURNAL_PARAMS newEntry; 4134 4135 portMemSet(&newEntry, 0, sizeof(newEntry)); 4136 newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_TDR; 4137 newEntry.pSource = pSource; 4138 newEntry.subsystem = subsystem; 4139 newEntry.errorCode = errorCode; 4140 newEntry.pDiagBuffer = pDiagBuffer; 4141 newEntry.diagBufferLen = diagBufferLen; 4142 newEntry.pFaultingEngine = pFaultingEngine; 4143 return rcdbNocatInsertNocatError(pGpu, &newEntry); 4144 } 4145 NV_STATUS 4146 rcdbNocatInitRCErrorEvent 4147 ( 4148 NOCAT_JOURNAL_PARAMS *pNewEntry 4149 ) 4150 { 4151 if (pNewEntry == NULL) 4152 { 4153 return NV_ERR_INVALID_ARGUMENT; 4154 } 4155 portMemSet(pNewEntry, 0, sizeof(*pNewEntry)); 4156 pNewEntry->recType = NV2080_NOCAT_JOURNAL_REC_TYPE_RC; 4157 pNewEntry->pSource = "RC ERROR"; 4158 return NV_OK; 4159 } 4160 4161 /* 4162 ** _rcdbNocatReportAssert adds an assert record. 4163 ** 4164 ** parameters: 4165 ** pGpu Pointer to GPU associated with the error 4166 ** may be NULL 4167 ** pAssertRec A pointer to the assert to report 4168 */ 4169 NV_STATUS 4170 _rcdbNocatReportAssert 4171 ( 4172 OBJGPU *pGpu, 4173 RmRCCommonAssert_RECORD *pAssertRec 4174 ) 4175 { 4176 OBJSYS *pSys = SYS_GET_INSTANCE(); 4177 Journal *pRcdb = SYS_GET_RCDB(pSys); 4178 NOCAT_JOURNAL_PARAMS newEntry; 4179 RM_NOCAT_ASSERT_DIAG_BUFFER diagBuffer; 4180 RM_NOCAT_ASSERT_DIAG_BUFFER *pDiagData; 4181 NvU32 idx; 4182 RM_NOCAT_JOURNAL_ENTRY *pNocatEntry = NULL; 4183 NvU32 gpuCnt= 0; 4184 OBJGPU *pTmpGpu = gpumgrGetGpu(0); 4185 4186 // validate inputs. 
    if ((pRcdb == NULL) || (pAssertRec == NULL))
    {
        return NV_ERR_INVALID_ARGUMENT;
    }
    if (pGpu == NULL)
    {
        // we don't have a GPU, if there is only 1 GPU,
        // we can safely use it for logging this assert
        gpumgrGetGpuAttachInfo(&gpuCnt, NULL);
        if (gpuCnt == 1)
        {
            pGpu = pTmpGpu;
        }
    }

    // start off assuming we will be recording a report
    portMemSet(&newEntry, 0, sizeof(newEntry));
    newEntry.timestamp = pAssertRec->common.timeStamp;
    newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT;
    newEntry.pSource = "ASSERT";

    // save the assert point as the error code.
    // The breakpoint address is rebased against the driver load address so
    // the value is stable across loads; only the low 32 bits are kept.
    newEntry.errorCode =
        (NvU32)((pAssertRec->breakpointAddrHint - pRcdb->nocatJournalDescriptor.loadAddress)
        & 0xffffffff);

    // put the line number in the upper 32 bits.
    newEntry.errorCode |= ((NvU64)pAssertRec->lineNum) << 32;

    // Rebase every captured return address against the load address as well,
    // building the diagnostic payload for this assert.
    for (idx = 0; idx < NV_ARRAY_ELEMENTS(pAssertRec->callStack); idx++)
    {
        diagBuffer.callStack[idx] =
            (NvU32)((pAssertRec->callStack[idx] - pRcdb->nocatJournalDescriptor.loadAddress)
            & 0xffffffff);
    }
    // initialize count
    diagBuffer.count = 1;

    // setup the pointer to our diag buffer & its length
    newEntry.pDiagBuffer = (NvU8 *)&diagBuffer;

    newEntry.diagBufferLen = sizeof(diagBuffer);

    // is the last thing we logged an assert, & is this the same assert?
    // (same record id for ASSERT and ANY means the newest record is an assert;
    // matching saved stack means it is the same assert site.)
    if ((pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT]
        == pRcdb->nocatJournalDescriptor.lastRecordId[NV2080_NOCAT_JOURNAL_REC_TYPE_ANY])
        && (0 == portMemCmp(&pRcdb->nocatJournalDescriptor.lastAssertData,
            diagBuffer.callStack,                   // same stack
            sizeof(diagBuffer.callStack))))
    {
        // it is the same as the last assert we logged. so don't log it again.
        // but see if we can increment the counter in an unreported assert.
        // check if the last record is also an assert
        //
        // NOTE(review): the increment-and-compare on concurrentRingBufferAccess
        // appears to act as a non-blocking "am I the only accessor" guard;
        // on contention we skip the update rather than wait.
        if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
        {
            // get the last record from the buffer
            _rcdbGetNewestNocatJournalRecordForType(pRcdb,
                NV2080_NOCAT_JOURNAL_REC_TYPE_ANY,
                NULL, &pNocatEntry);
            if (pNocatEntry != NULL)
            {
                // is it an assert?
                if (pNocatEntry->nocatJournalEntry.recType == (NV2080_NOCAT_JOURNAL_REC_TYPE_ASSERT))
                {
                    // increment the count in the already-queued record instead
                    // of logging a duplicate entry.
                    pDiagData = (RM_NOCAT_ASSERT_DIAG_BUFFER*)&pNocatEntry->nocatJournalEntry.diagBuffer;
                    pDiagData->count++;
                }
                _rcdbReleaseNocatJournalRecord(pNocatEntry);

            }
        }
        else
        {
            // ring buffer was busy; count the missed update.
            pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_BUSY_IDX]++;
        }
        portAtomicDecrementS32(&concurrentRingBufferAccess);
    }
    else
    {
        // we are logging this assert, save off the stack so we can use it to
        // compare against future asserts.
        portMemCopy(&pRcdb->nocatJournalDescriptor.lastAssertData,
            sizeof(pRcdb->nocatJournalDescriptor.lastAssertData),
            &diagBuffer, sizeof(diagBuffer));
        rcdbNocatInsertNocatError(pGpu, &newEntry);
    }

    return NV_OK;
}

/*
** rcdbNocatInsertRMCDErrorEvent creates an event from an RMCD error block
**
** parameters:
**  pGpu                pointer to GPU object associated with the error
**  recType             the type of event to create
**  pSource             a pointer to the source string
**  subsystem           the subsystem associated with the event.
**  errorCode           error code for the event
**  pFault              pointer to a faulting component identifier associated with the error
*/
NvU32 rcdbNocatInsertRMCDErrorEvent(OBJGPU *pGpu, NvU32 recType,
    const char *pSource, NvU32 subsystem, NvU64 errorCode, const char *pFault,
    RMCD_ERROR_BLOCK *pRcdError)
{
    NOCAT_JOURNAL_PARAMS newEntry;

    portMemSet(&newEntry, 0, sizeof(newEntry));
    newEntry.recType = recType;
    newEntry.pSource = pSource;
    newEntry.subsystem = subsystem;
    newEntry.errorCode = errorCode;
    newEntry.pFaultingEngine = pFault;

    // attach the RMCD error block payload as the diag buffer when present.
    if (pRcdError != NULL)
    {
        newEntry.pDiagBuffer = (NvU8 * )pRcdError->pBlock;
        newEntry.diagBufferLen = pRcdError->blockSize;
    }
    return rcdbNocatInsertNocatError(pGpu, &newEntry);
}

/*
** rcdbSetNocatTdrReason sets the TDR reason code in the most recent TDR record if there is one,
** otherwise, it creates one with the reason code.
**
** parameters:
**  pReasonData         the data supplied with including the reason code.
**                      if a TDR record exists, the reason will be added to the existing record,
**                      otherwise the rest of the data will be used to create a new TDR event.
*/
NV_STATUS rcdbSetNocatTdrReason
(
    NV2080CtrlNocatJournalDataTdrReason *pReasonData
)
{
    OBJSYS     *pSys = SYS_GET_INSTANCE();
    Journal    *pRcdb = SYS_GET_RCDB(pSys);
    RM_NOCAT_JOURNAL_ENTRY* pNocatEntry = NULL;

    // validate inputs.
    if ((pRcdb == NULL) || (pReasonData == NULL))
    {
        return NV_ERR_INVALID_ARGUMENT;
    }
    pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_UPDATE_REQ_IDX]++;

    // NOTE(review): increment-and-compare appears to be a non-blocking
    // single-accessor guard on the ring buffers; on contention we fall
    // through without updating an existing record.
    if (portAtomicIncrementS32(&concurrentRingBufferAccess) == 1)
    {
        // see if there is a TDR record.
        _rcdbGetNewestNocatJournalRecordForType(pRcdb,
            NV2080_NOCAT_JOURNAL_REC_TYPE_TDR,
            NULL, &pNocatEntry);
        if (pNocatEntry != NULL)
        {
            // there is, set the reason.
            _rcdbSetTdrReason(pRcdb, pReasonData->reasonCode,
                (char *)pNocatEntry->nocatJournalEntry.tdrReason,
                sizeof(pNocatEntry->nocatJournalEntry.tdrReason));
            pRcdb->nocatJournalDescriptor.nocatEventCounters[NV2080_NOCAT_JOURNAL_REPORT_ACTIVITY_UPDATED_IDX]++;
            _rcdbReleaseNocatJournalRecord(pNocatEntry);
        }
    }
    // the increment above always ran, so the decrement is unconditional.
    portAtomicDecrementS32(&concurrentRingBufferAccess);

    // if we did not get a TDR record, create one.
    // we need to do it after the ring buffers are released.
    // (pNocatEntry is only used as a found/not-found flag here; the record
    // itself was already released above.)
    if (pNocatEntry == NULL)
    {
        NOCAT_JOURNAL_PARAMS newEntry;

        portMemSet(&newEntry, 0, sizeof(newEntry));
        newEntry.recType = NV2080_NOCAT_JOURNAL_REC_TYPE_TDR;
        newEntry.pSource = (char *)pReasonData->source;
        newEntry.subsystem = pReasonData->subsystem;
        newEntry.errorCode = pReasonData->errorCode;
        newEntry.tdrReason = pReasonData->reasonCode;
        return rcdbNocatInsertNocatError(NULL, &newEntry);
    }
    return NV_OK;
}