1 /* 2 * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 * SPDX-License-Identifier: MIT 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 */ 23 24 /*! 25 * @file error_cont.h 26 * @brief Holds data structures, defines and policy table required by the 27 * Ampere Error Containment feature / code. 28 */ 29 30 #ifndef _ERROR_CONT_H_ 31 #define _ERROR_CONT_H_ 32 33 /* ------------------------ Includes ---------------------------------------- */ 34 #include "core/core.h" 35 #include "kernel/gpu/gpu_engine_type.h" 36 #include "nverror.h" 37 38 /* ------------------------ Forward Definitions ----------------------------- */ 39 40 typedef struct Device Device; 41 42 /* ------------------------ Datatypes --------------------------------------- */ 43 44 /*! 45 * Error Containment error id enum 46 */ 47 typedef enum _NV_ERROR_CONT_ERR_ID 48 { 49 NV_ERROR_CONT_ERR_ID_E01_FB_ECC_DED = 0, // FD Error ID: E01: FB ECC DED 50 NV_ERROR_CONT_ERR_ID_E02_FB_ECC_DED_IN_CBC_STORE = 1, // FD Error ID: E02: FB ECC DED in CBC 51 NV_ERROR_CONT_ERR_ID_E05_LTC_ECC_DSTG = 2, // FD Error ID: E05: LTC ECC in data region 52 NV_ERROR_CONT_ERR_ID_E06_LTC_UNSUPPORTED_CLIENT_POISON = 3, // FD Error ID: E06: LTC Unsupported client poison 53 NV_ERROR_CONT_ERR_ID_E07_LTC_ECC_TSTG = 4, // FD Error ID: E07: LTC Tag Parity error 54 NV_ERROR_CONT_ERR_ID_E08_LTC_ECC_RSTG = 5, // FD Error ID: E08: LTC CBC Parity error 55 NV_ERROR_CONT_ERR_ID_E09_FBHUB_POISON = 6, // FD Error ID: E09: FBHUB Poison error 56 NV_ERROR_CONT_ERR_ID_E10_SM_POISON = 7, // FD Error ID: E10: SM Poison error 57 NV_ERROR_CONT_ERR_ID_E12A_CE_POISON_IN_USER_CHANNEL = 8, // FD Error ID: E12: CE Poison error in user channel 58 NV_ERROR_CONT_ERR_ID_E12B_CE_POISON_IN_KERNEL_CHANNEL = 9, // FD Error ID: E12: CE Poison error in kernel channel 59 NV_ERROR_CONT_ERR_ID_E13_MMU_POISON = 10, // FD Error ID: E13: MMU Poison error 60 NV_ERROR_CONT_ERR_ID_E16_GCC_POISON = 11, // FD Error ID: E16: GCC Poison error 61 NV_ERROR_CONT_ERR_ID_E17_CTXSW_POISON = 12, // FD Error ID: E17: GPCCS/TPCCS Poison error 62 NV_ERROR_CONT_ERR_ID_E20_XALEP_POISON = 13, // FD Error ID: E20: XALEP_EGRESS Poison error 63 } NV_ERROR_CONT_ERR_ID; 64 65 /*! 66 * Error Containment settings per error id, when SMC memory partitioning is disable or enabled. 67 */ 68 typedef struct _NV_ERROR_CONT_SMC_DIS_EN_SETTING 69 { 70 NvU32 rcErrorCode; 71 NvBool bGpuResetReqd; 72 NvBool bGpuDrainAndResetReqd; 73 NvBool bPrintSmcPartitionInfo; 74 NvU32 nv2080Notifier; 75 } NV_ERROR_CONT_SMC_DIS_EN_SETTING; 76 77 /*! 78 * Error Containment state table tracking policy settings for each error id 79 */ 80 typedef struct _NV_ERROR_CONT_STATE_TABLE 81 { 82 NV_ERROR_CONT_ERR_ID errorCode; 83 NV_ERROR_CONT_SMC_DIS_EN_SETTING smcDisEnSetting[2]; // 0: SMC memory partitioning disabled, 84 // 1: SMC memory partitioning enabled 85 } NV_ERROR_CONT_STATE_TABLE; 86 87 /*! 88 * Struct for LTC location 89 */ 90 typedef struct _NV_ERROR_CONT_LOCATION_LTC 91 { 92 NvU32 partition; 93 NvU32 slice; 94 } NV_ERROR_CONT_LOCATION_LTC; 95 96 /*! 97 * Struct for DRAM location 98 */ 99 typedef struct _NV_ERROR_CONT_LOCATION_DRAM 100 { 101 NvU32 partition; 102 NvU32 subPartition; 103 NvU64 physicalAddress; 104 } NV_ERROR_CONT_LOCATION_DRAM; 105 106 /*! 107 * Struct for Engine id 108 */ 109 typedef struct _NV_ERROR_CONT_LOCATION_ENG_ID 110 { 111 RM_ENGINE_TYPE rmEngineId; 112 Device *pDevice; 113 } NV_ERROR_CONT_LOCATION_ENG_ID; 114 115 /*! 116 * Error Containment location type 117 */ 118 typedef enum _NV_ERROR_CONT_LOCATION_TYPE 119 { 120 NV_ERROR_CONT_LOCATION_TYPE_NONE = 0, // No location information available 121 NV_ERROR_CONT_LOCATION_TYPE_DRAM = 1, // DRAM location 122 NV_ERROR_CONT_LOCATION_TYPE_LTC = 2, // LTC location 123 NV_ERROR_CONT_LOCATION_TYPE_ENGINE = 3 // Engine location 124 } NV_ERROR_CONT_LOCATION_TYPE; 125 126 /*! 127 * Union for Error Containment location information 128 */ 129 typedef union _NV_ERROR_CONT_LOCATION_INFO 130 { 131 NV_ERROR_CONT_LOCATION_DRAM dramLoc; // DRAM location 132 NV_ERROR_CONT_LOCATION_LTC ltcLoc; // LTC location 133 NV_ERROR_CONT_LOCATION_ENG_ID engineLoc; // Engine location 134 } NV_ERROR_CONT_LOCATION_INFO; 135 136 typedef struct _NV_ERROR_CONT_LOCATION 137 { 138 NV_ERROR_CONT_LOCATION_TYPE locType; 139 NV_ERROR_CONT_LOCATION_INFO locInfo; 140 } NV_ERROR_CONT_LOCATION; 141 142 /* ------------------------ Macros ------------------------------------------ */ 143 144 #define ROBUST_CHANNEL_CONTAINED_ERROR_STR "Contained" 145 #define ROBUST_CHANNEL_UNCONTAINED_ERROR_STR "Uncontained" 146 #define NO_XID NV_U32_MAX 147 #define NO_NV2080_NOTIFIER NV2080_NOTIFIERS_MAXCOUNT 148 #define NV_ERR_CONT_LOCATION_STRING_SIZE_MAX 64 149 150 /*! 151 * Error Containment error types string 152 */ 153 #define NV_ERROR_CONT_ERR_ID_STRING_PUBLIC {"FB DED", \ 154 "DED CBC", \ 155 "LTC Data", \ 156 "LTC GPC", \ 157 "LTC TAG", \ 158 "LTC CBC", \ 159 "FBHUB", \ 160 "SM", \ 161 "CE User Channel", \ 162 "CE Kernel Channel", \ 163 "MMU", \ 164 "GCC", \ 165 "CTXSW", \ 166 "PCIE"}; 167 168 /*! 169 * Error Containment state table showing policy settings for each error id 170 * 171 * Where: 172 * RC_Recovery_Type: 173 * Type of RC recovery handling in response to a given error. Possible values: 174 * 175 * - NO_RC : No RC Recovery performed. Subsequent 2nd interrupt by engine 176 * consuming poison will determine the RC Recovery type. 177 * - RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE : RC Recovery compute channels of only the processes whose 178 * channels were loaded on halted TSG when _E10_SM_POISON or 179 * _E12A_CE_POISON_IN_USER_CHANNEL occurs. 180 * - RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION : RC Recovery compute channels of only specific MIG partition 181 * as that error can be attributed to a specific MIG partition. 182 * - RC_ALL_COMPUTE_CHANNELS : RC Recovery ALL compute channels on a GPU that saw this interrupt. 183 * (If MIG is enabled, then RC Recovery compute channels in all MIG partitions) 184 * - CE_TSG_RESET : Reset the halted CE Engine. Impacts the CE channels loaded on the TSG when the CE Halted. 185 * This is used in NV_ERROR_CONT_ERR_ID_E12A_CE_POISON_IN_USER_CHANNEL & 186 * NV_ERROR_CONT_ERR_ID_E12B_CE_POISON_IN_KERNEL_CHANNEL along with additional 187 * Compute Channels RC policy (either RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE or 188 * RC_ALL_COMPUTE_CHANNELS). 189 */ 190 #define NV_ERROR_CONT_STATE_TABLE_SETTINGS \ 191 { \ 192 /* errorCode , rcErrorCode , bGpuResetReqd, bGpuDrainAndResetReqd, bPrintSmcPartitionInfo, nv2080Notifier , Dynamic Page Blacklisting , RC_Recovery_Type */ \ 193 { NV_ERROR_CONT_ERR_ID_E01_FB_ECC_DED , {{NO_XID , NV_FALSE , NV_FALSE , NV_FALSE , NO_NV2080_NOTIFIER /* , Yes(PMA but not subheap) , NO_RC */ }, \ 194 {NO_XID , NV_FALSE , NV_FALSE , NV_FALSE , NO_NV2080_NOTIFIER /* , Yes(PMA but not subheap) , NO_RC */ }}}, \ 195 { NV_ERROR_CONT_ERR_ID_E02_FB_ECC_DED_IN_CBC_STORE , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , Yes(PMA but not subheap) , RC_ALL_COMPUTE_CHANNELS */ }, \ 196 {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_TRUE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , Yes(PMA but not subheap) , RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION */ }}}, \ 197 { NV_ERROR_CONT_ERR_ID_E05_LTC_ECC_DSTG , {{NO_XID , NV_FALSE , NV_FALSE , NV_FALSE , NO_NV2080_NOTIFIER /* , No , NO_RC */ }, \ 198 {NO_XID , NV_FALSE , NV_FALSE , NV_FALSE , NO_NV2080_NOTIFIER /* , No , NO_RC */ }}}, \ 199 { NV_ERROR_CONT_ERR_ID_E06_LTC_UNSUPPORTED_CLIENT_POISON, {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }, \ 200 {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_TRUE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION */ }}}, \ 201 { NV_ERROR_CONT_ERR_ID_E07_LTC_ECC_TSTG , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }, \ 202 {ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }}}, \ 203 { NV_ERROR_CONT_ERR_ID_E08_LTC_ECC_RSTG , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }, \ 204 {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_TRUE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION */ }}}, \ 205 { NV_ERROR_CONT_ERR_ID_E09_FBHUB_POISON , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }, \ 206 {ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }}}, \ 207 { NV_ERROR_CONT_ERR_ID_E10_SM_POISON , {{ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }, \ 208 {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }}}, \ 209 { NV_ERROR_CONT_ERR_ID_E12A_CE_POISON_IN_USER_CHANNEL , {{ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE + CE_TSG_RESET */ }, \ 210 {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE + CE_TSG_RESET */ }}}, \ 211 { NV_ERROR_CONT_ERR_ID_E12B_CE_POISON_IN_KERNEL_CHANNEL , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS + CE_TSG_RESET */ }, \ 212 {ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS + CE_TSG_RESET */ }}}, \ 213 { NV_ERROR_CONT_ERR_ID_E13_MMU_POISON , {{ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , NO_RC */ }, \ 214 {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , NO_RC */ }}}, \ 215 { NV_ERROR_CONT_ERR_ID_E16_GCC_POISON , {{ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }, \ 216 {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }}}, \ 217 { NV_ERROR_CONT_ERR_ID_E17_CTXSW_POISON , {{ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }, \ 218 {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }}}, \ 219 { NV_ERROR_CONT_ERR_ID_E20_XALEP_POISON , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }, \ 220 {ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }}} \ 221 } 222 223 /* ------------------------ Function Prototypes ----------------------------- */ 224 225 #endif // _ERROR_CONT_H_ 226