/* * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ /*! * @file error_cont.h * @brief Holds data structures, defines and policy table required by the * Ampere Error Containment feature / code. */ #ifndef _ERROR_CONT_H_ #define _ERROR_CONT_H_ /* ------------------------ Includes ---------------------------------------- */ #include "core/core.h" #include "kernel/gpu/gpu_engine_type.h" #include "nverror.h" /* ------------------------ Forward Definitions ----------------------------- */ typedef struct Device Device; /* ------------------------ Datatypes --------------------------------------- */ /*! * Error Containment error id enum */ typedef enum _NV_ERROR_CONT_ERR_ID { NV_ERROR_CONT_ERR_ID_E01_FB_ECC_DED = 0, // FD Error ID: E01: FB ECC DED NV_ERROR_CONT_ERR_ID_E02_FB_ECC_DED_IN_CBC_STORE = 1, // FD Error ID: E02: FB ECC DED in CBC NV_ERROR_CONT_ERR_ID_E05_LTC_ECC_DSTG = 2, // FD Error ID: E05: LTC ECC in data region NV_ERROR_CONT_ERR_ID_E06_LTC_UNSUPPORTED_CLIENT_POISON = 3, // FD Error ID: E06: LTC Unsupported client poison NV_ERROR_CONT_ERR_ID_E07_LTC_ECC_TSTG = 4, // FD Error ID: E07: LTC Tag Parity error NV_ERROR_CONT_ERR_ID_E08_LTC_ECC_RSTG = 5, // FD Error ID: E08: LTC CBC Parity error NV_ERROR_CONT_ERR_ID_E09_FBHUB_POISON = 6, // FD Error ID: E09: FBHUB Poison error NV_ERROR_CONT_ERR_ID_E10_SM_POISON = 7, // FD Error ID: E10: SM Poison error NV_ERROR_CONT_ERR_ID_E12A_CE_POISON_IN_USER_CHANNEL = 8, // FD Error ID: E12: CE Poison error in user channel NV_ERROR_CONT_ERR_ID_E12B_CE_POISON_IN_KERNEL_CHANNEL = 9, // FD Error ID: E12: CE Poison error in kernel channel NV_ERROR_CONT_ERR_ID_E13_MMU_POISON = 10, // FD Error ID: E13: MMU Poison error NV_ERROR_CONT_ERR_ID_E16_GCC_POISON = 11, // FD Error ID: E16: GCC Poison error NV_ERROR_CONT_ERR_ID_E17_CTXSW_POISON = 12, // FD Error ID: E17: GPCCS/TPCCS Poison error NV_ERROR_CONT_ERR_ID_E20_XALEP_POISON = 13, // FD Error ID: E20: XALEP_EGRESS Poison error } NV_ERROR_CONT_ERR_ID; /*! * Error Containment settings per error id, when SMC memory partitioning is disable or enabled. */ typedef struct _NV_ERROR_CONT_SMC_DIS_EN_SETTING { NvU32 rcErrorCode; NvBool bGpuResetReqd; NvBool bGpuDrainAndResetReqd; NvBool bPrintSmcPartitionInfo; NvU32 nv2080Notifier; } NV_ERROR_CONT_SMC_DIS_EN_SETTING; /*! * Error Containment state table tracking policy settings for each error id */ typedef struct _NV_ERROR_CONT_STATE_TABLE { NV_ERROR_CONT_ERR_ID errorCode; NV_ERROR_CONT_SMC_DIS_EN_SETTING smcDisEnSetting[2]; // 0: SMC memory partitioning disabled, // 1: SMC memory partitioning enabled } NV_ERROR_CONT_STATE_TABLE; /*! * Struct for LTC location */ typedef struct _NV_ERROR_CONT_LOCATION_LTC { NvU32 partition; NvU32 slice; } NV_ERROR_CONT_LOCATION_LTC; /*! * Struct for DRAM location */ typedef struct _NV_ERROR_CONT_LOCATION_DRAM { NvU32 partition; NvU32 subPartition; NvU64 physicalAddress; } NV_ERROR_CONT_LOCATION_DRAM; /*! * Struct for Engine id */ typedef struct _NV_ERROR_CONT_LOCATION_ENG_ID { RM_ENGINE_TYPE rmEngineId; Device *pDevice; } NV_ERROR_CONT_LOCATION_ENG_ID; /*! * Error Containment location type */ typedef enum _NV_ERROR_CONT_LOCATION_TYPE { NV_ERROR_CONT_LOCATION_TYPE_NONE = 0, // No location information available NV_ERROR_CONT_LOCATION_TYPE_DRAM = 1, // DRAM location NV_ERROR_CONT_LOCATION_TYPE_LTC = 2, // LTC location NV_ERROR_CONT_LOCATION_TYPE_ENGINE = 3 // Engine location } NV_ERROR_CONT_LOCATION_TYPE; /*! * Union for Error Containment location information */ typedef union _NV_ERROR_CONT_LOCATION_INFO { NV_ERROR_CONT_LOCATION_DRAM dramLoc; // DRAM location NV_ERROR_CONT_LOCATION_LTC ltcLoc; // LTC location NV_ERROR_CONT_LOCATION_ENG_ID engineLoc; // Engine location } NV_ERROR_CONT_LOCATION_INFO; typedef struct _NV_ERROR_CONT_LOCATION { NV_ERROR_CONT_LOCATION_TYPE locType; NV_ERROR_CONT_LOCATION_INFO locInfo; } NV_ERROR_CONT_LOCATION; /* ------------------------ Macros ------------------------------------------ */ #define ROBUST_CHANNEL_CONTAINED_ERROR_STR "Contained" #define ROBUST_CHANNEL_UNCONTAINED_ERROR_STR "Uncontained" #define NO_XID NV_U32_MAX #define NO_NV2080_NOTIFIER NV2080_NOTIFIERS_MAXCOUNT #define NV_ERR_CONT_LOCATION_STRING_SIZE_MAX 64 /*! * Error Containment error types string */ #define NV_ERROR_CONT_ERR_ID_STRING_PUBLIC {"FB DED", \ "DED CBC", \ "LTC Data", \ "LTC GPC", \ "LTC TAG", \ "LTC CBC", \ "FBHUB", \ "SM", \ "CE User Channel", \ "CE Kernel Channel", \ "MMU", \ "GCC", \ "CTXSW", \ "PCIE"}; /*! * Error Containment state table showing policy settings for each error id * * Where: * RC_Recovery_Type: * Type of RC recovery handling in response to a given error. Possible values: * * - NO_RC : No RC Recovery performed. Subsequent 2nd interrupt by engine * consuming poison will determine the RC Recovery type. * - RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE : RC Recovery compute channels of only the processes whose * channels were loaded on halted TSG when _E10_SM_POISON or * _E12A_CE_POISON_IN_USER_CHANNEL occurs. * - RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION : RC Recovery compute channels of only specific MIG partition * as that error can be attributed to a specific MIG partition. * - RC_ALL_COMPUTE_CHANNELS : RC Recovery ALL compute channels on a GPU that saw this interrupt. * (If MIG is enabled, then RC Recovery compute channels in all MIG partitions) * - CE_TSG_RESET : Reset the halted CE Engine. Impacts the CE channels loaded on the TSG when the CE Halted. * This is used in NV_ERROR_CONT_ERR_ID_E12A_CE_POISON_IN_USER_CHANNEL & * NV_ERROR_CONT_ERR_ID_E12B_CE_POISON_IN_KERNEL_CHANNEL along with additional * Compute Channels RC policy (either RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE or * RC_ALL_COMPUTE_CHANNELS). */ #define NV_ERROR_CONT_STATE_TABLE_SETTINGS \ { \ /* errorCode , rcErrorCode , bGpuResetReqd, bGpuDrainAndResetReqd, bPrintSmcPartitionInfo, nv2080Notifier , Dynamic Page Blacklisting , RC_Recovery_Type */ \ { NV_ERROR_CONT_ERR_ID_E01_FB_ECC_DED , {{NO_XID , NV_FALSE , NV_FALSE , NV_FALSE , NO_NV2080_NOTIFIER /* , Yes(PMA but not subheap) , NO_RC */ }, \ {NO_XID , NV_FALSE , NV_FALSE , NV_FALSE , NO_NV2080_NOTIFIER /* , Yes(PMA but not subheap) , NO_RC */ }}}, \ { NV_ERROR_CONT_ERR_ID_E02_FB_ECC_DED_IN_CBC_STORE , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , Yes(PMA but not subheap) , RC_ALL_COMPUTE_CHANNELS */ }, \ {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_TRUE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , Yes(PMA but not subheap) , RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION */ }}}, \ { NV_ERROR_CONT_ERR_ID_E05_LTC_ECC_DSTG , {{NO_XID , NV_FALSE , NV_FALSE , NV_FALSE , NO_NV2080_NOTIFIER /* , No , NO_RC */ }, \ {NO_XID , NV_FALSE , NV_FALSE , NV_FALSE , NO_NV2080_NOTIFIER /* , No , NO_RC */ }}}, \ { NV_ERROR_CONT_ERR_ID_E06_LTC_UNSUPPORTED_CLIENT_POISON, {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }, \ {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_TRUE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION */ }}}, \ { NV_ERROR_CONT_ERR_ID_E07_LTC_ECC_TSTG , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }, \ {ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }}}, \ { NV_ERROR_CONT_ERR_ID_E08_LTC_ECC_RSTG , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }, \ {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_TRUE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION */ }}}, \ { NV_ERROR_CONT_ERR_ID_E09_FBHUB_POISON , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }, \ {ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }}}, \ { NV_ERROR_CONT_ERR_ID_E10_SM_POISON , {{ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }, \ {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }}}, \ { NV_ERROR_CONT_ERR_ID_E12A_CE_POISON_IN_USER_CHANNEL , {{ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE + CE_TSG_RESET */ }, \ {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE + CE_TSG_RESET */ }}}, \ { NV_ERROR_CONT_ERR_ID_E12B_CE_POISON_IN_KERNEL_CHANNEL , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS + CE_TSG_RESET */ }, \ {ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS + CE_TSG_RESET */ }}}, \ { NV_ERROR_CONT_ERR_ID_E13_MMU_POISON , {{ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , NO_RC */ }, \ {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , NO_RC */ }}}, \ { NV_ERROR_CONT_ERR_ID_E16_GCC_POISON , {{ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }, \ {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }}}, \ { NV_ERROR_CONT_ERR_ID_E17_CTXSW_POISON , {{ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }, \ {ROBUST_CHANNEL_CONTAINED_ERROR , NV_FALSE , NV_FALSE , NV_TRUE , NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL /* , No , RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ }}}, \ { NV_ERROR_CONT_ERR_ID_E20_XALEP_POISON , {{ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }, \ {ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE , NV_FALSE , NV_FALSE , NV2080_NOTIFIERS_POISON_ERROR_FATAL /* , No , RC_ALL_COMPUTE_CHANNELS */ }}} \ } /* ------------------------ Function Prototypes ----------------------------- */ #endif // _ERROR_CONT_H_