1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 /*!
25  * @file    error_cont.h
26  * @brief   Holds data structures, defines and policy table required by the
27  *          Ampere Error Containment feature / code.
28  */
29 
30 #ifndef _ERROR_CONT_H_
31 #define _ERROR_CONT_H_
32 
33 /* ------------------------ Includes ---------------------------------------- */
34 #include "core/core.h"
35 #include "kernel/gpu/gpu_engine_type.h"
36 #include "nverror.h"
37 
38 /* ------------------------ Forward Definitions ----------------------------- */
39 
40 typedef struct Device Device;
41 
42 /* ------------------------ Datatypes --------------------------------------- */
43 
/*!
 * Identifiers of the errors handled by Error Containment.
 *
 * Every enumerator embeds its FD error id (E01, E02, ...) in its name. E12 is
 * represented by two enumerators, E12A and E12B, depending on whether the CE
 * poison error was seen in a user channel or in a kernel channel.
 *
 * The numeric values are assigned explicitly and run contiguously from 0 to 13.
 */
typedef enum _NV_ERROR_CONT_ERR_ID
{
    NV_ERROR_CONT_ERR_ID_E01_FB_ECC_DED                    =  0, /*!< E01: FB ECC DED                        */
    NV_ERROR_CONT_ERR_ID_E02_FB_ECC_DED_IN_CBC_STORE       =  1, /*!< E02: FB ECC DED in CBC                 */
    NV_ERROR_CONT_ERR_ID_E05_LTC_ECC_DSTG                  =  2, /*!< E05: LTC ECC in data region            */
    NV_ERROR_CONT_ERR_ID_E06_LTC_UNSUPPORTED_CLIENT_POISON =  3, /*!< E06: LTC unsupported client poison     */
    NV_ERROR_CONT_ERR_ID_E07_LTC_ECC_TSTG                  =  4, /*!< E07: LTC tag parity error              */
    NV_ERROR_CONT_ERR_ID_E08_LTC_ECC_RSTG                  =  5, /*!< E08: LTC CBC parity error              */
    NV_ERROR_CONT_ERR_ID_E09_FBHUB_POISON                  =  6, /*!< E09: FBHUB poison error                */
    NV_ERROR_CONT_ERR_ID_E10_SM_POISON                     =  7, /*!< E10: SM poison error                   */
    NV_ERROR_CONT_ERR_ID_E12A_CE_POISON_IN_USER_CHANNEL    =  8, /*!< E12: CE poison error in user channel   */
    NV_ERROR_CONT_ERR_ID_E12B_CE_POISON_IN_KERNEL_CHANNEL  =  9, /*!< E12: CE poison error in kernel channel */
    NV_ERROR_CONT_ERR_ID_E13_MMU_POISON                    = 10, /*!< E13: MMU poison error                  */
    NV_ERROR_CONT_ERR_ID_E16_GCC_POISON                    = 11, /*!< E16: GCC poison error                  */
    NV_ERROR_CONT_ERR_ID_E17_CTXSW_POISON                  = 12, /*!< E17: GPCCS/TPCCS poison error          */
    NV_ERROR_CONT_ERR_ID_E20_XALEP_POISON                  = 13, /*!< E20: XALEP_EGRESS poison error         */
} NV_ERROR_CONT_ERR_ID;
64 
65 /*!
66  * Error Containment settings per error id, when SMC memory partitioning is disable or enabled.
67  */
68 typedef struct _NV_ERROR_CONT_SMC_DIS_EN_SETTING
69 {
70     NvU32     rcErrorCode;
71     NvBool    bGpuResetReqd;
72     NvBool    bGpuDrainAndResetReqd;
73     NvBool    bPrintSmcPartitionInfo;
74     NvU32     nv2080Notifier;
75 } NV_ERROR_CONT_SMC_DIS_EN_SETTING;
76 
77 /*!
78  * Error Containment state table tracking policy settings for each error id
79  */
80 typedef struct _NV_ERROR_CONT_STATE_TABLE
81 {
82     NV_ERROR_CONT_ERR_ID                    errorCode;
83     NV_ERROR_CONT_SMC_DIS_EN_SETTING        smcDisEnSetting[2]; // 0: SMC memory partitioning disabled,
84                                                                 // 1: SMC memory partitioning enabled
85 } NV_ERROR_CONT_STATE_TABLE;
86 
87 /*!
88  * Struct for LTC location
89  */
90 typedef struct _NV_ERROR_CONT_LOCATION_LTC
91 {
92     NvU32 partition;
93     NvU32 slice;
94 } NV_ERROR_CONT_LOCATION_LTC;
95 
96 /*!
97  * Struct for DRAM location
98  */
99 typedef struct _NV_ERROR_CONT_LOCATION_DRAM
100 {
101     NvU32 partition;
102     NvU32 subPartition;
103     NvU64 physicalAddress;
104 } NV_ERROR_CONT_LOCATION_DRAM;
105 
106 /*!
107  * Struct for Engine id
108  */
109 typedef struct _NV_ERROR_CONT_LOCATION_ENG_ID
110 {
111     RM_ENGINE_TYPE rmEngineId;
112     Device *pDevice;
113 } NV_ERROR_CONT_LOCATION_ENG_ID;
114 
/*!
 * Kind of location information attached to an Error Containment error.
 */
typedef enum _NV_ERROR_CONT_LOCATION_TYPE
{
    NV_ERROR_CONT_LOCATION_TYPE_NONE   = 0,  /*!< No location information available */
    NV_ERROR_CONT_LOCATION_TYPE_DRAM   = 1,  /*!< DRAM location                      */
    NV_ERROR_CONT_LOCATION_TYPE_LTC    = 2,  /*!< LTC location                       */
    NV_ERROR_CONT_LOCATION_TYPE_ENGINE = 3   /*!< Engine location                    */
} NV_ERROR_CONT_LOCATION_TYPE;
125 
126 /*!
127  * Union for Error Containment location information
128  */
129 typedef union _NV_ERROR_CONT_LOCATION_INFO
130 {
131     NV_ERROR_CONT_LOCATION_DRAM dramLoc;      // DRAM location
132     NV_ERROR_CONT_LOCATION_LTC ltcLoc;        // LTC location
133     NV_ERROR_CONT_LOCATION_ENG_ID engineLoc;  // Engine location
134 } NV_ERROR_CONT_LOCATION_INFO;
135 
136 typedef struct _NV_ERROR_CONT_LOCATION
137 {
138     NV_ERROR_CONT_LOCATION_TYPE locType;
139     NV_ERROR_CONT_LOCATION_INFO locInfo;
140 } NV_ERROR_CONT_LOCATION;
141 
142 /* ------------------------ Macros ------------------------------------------ */
143 
/*! Text labels for contained / uncontained errors. */
#define ROBUST_CHANNEL_CONTAINED_ERROR_STR   "Contained"
#define ROBUST_CHANNEL_UNCONTAINED_ERROR_STR "Uncontained"

/*! Placeholder used in the policy table when an entry has no XID. */
#define NO_XID NV_U32_MAX

/*! Placeholder used in the policy table when an entry has no NV2080 notifier. */
#define NO_NV2080_NOTIFIER NV2080_NOTIFIERS_MAXCOUNT

/*! Maximum size of an Error Containment location string. */
#define NV_ERR_CONT_LOCATION_STRING_SIZE_MAX 64
149 
/*!
 * Public names of the Error Containment error types.
 *
 * Expands to a brace-enclosed initializer holding 14 string literals, listed
 * in the same order as the enumerators of NV_ERROR_CONT_ERR_ID (the bracketed
 * number next to each entry is its position). Presumably the resulting array
 * is indexed by error id -- confirm at the use sites.
 *
 * NOTE: the expansion already ends with a ';'.
 */
#define NV_ERROR_CONT_ERR_ID_STRING_PUBLIC \
    {                                      \
        "FB DED",            /* [ 0] */    \
        "DED CBC",           /* [ 1] */    \
        "LTC Data",          /* [ 2] */    \
        "LTC GPC",           /* [ 3] */    \
        "LTC TAG",           /* [ 4] */    \
        "LTC CBC",           /* [ 5] */    \
        "FBHUB",             /* [ 6] */    \
        "SM",                /* [ 7] */    \
        "CE User Channel",   /* [ 8] */    \
        "CE Kernel Channel", /* [ 9] */    \
        "MMU",               /* [10] */    \
        "GCC",               /* [11] */    \
        "CTXSW",             /* [12] */    \
        "PCIE"               /* [13] */    \
    };
167 
/*
 * Policy tuples shared by the rows of NV_ERROR_CONT_STATE_TABLE_SETTINGS.
 *
 * Each tuple is a positional initializer for NV_ERROR_CONT_SMC_DIS_EN_SETTING,
 * i.e. the values appear in this order:
 *   rcErrorCode, bGpuResetReqd, bGpuDrainAndResetReqd, bPrintSmcPartitionInfo, nv2080Notifier
 */

/*! No XID, no reset of any kind, no SMC partition info, no NV2080 notifier. */
#define NV_ERROR_CONT_SETTING_NO_ACTION \
    { NO_XID, NV_FALSE, NV_FALSE, NV_FALSE, NO_NV2080_NOTIFIER }

/*! Uncontained error XID, GPU reset required, fatal poison notifier. */
#define NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET \
    { ROBUST_CHANNEL_UNCONTAINED_ERROR, NV_TRUE, NV_FALSE, NV_FALSE, NV2080_NOTIFIERS_POISON_ERROR_FATAL }

/*! Contained error XID, GPU drain-and-reset required, SMC partition info printed, non-fatal poison notifier. */
#define NV_ERROR_CONT_SETTING_CONTAINED_DRAIN_AND_RESET \
    { ROBUST_CHANNEL_CONTAINED_ERROR, NV_FALSE, NV_TRUE, NV_TRUE, NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL }

/*! Contained error XID, no reset, SMC partition info printed, non-fatal poison notifier. */
#define NV_ERROR_CONT_SETTING_CONTAINED_SMC_INFO \
    { ROBUST_CHANNEL_CONTAINED_ERROR, NV_FALSE, NV_FALSE, NV_TRUE, NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL }

/*! Contained error XID, no reset, no SMC partition info, non-fatal poison notifier. */
#define NV_ERROR_CONT_SETTING_CONTAINED_NOTIFY_ONLY \
    { ROBUST_CHANNEL_CONTAINED_ERROR, NV_FALSE, NV_FALSE, NV_FALSE, NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL }

/*!
 * Error Containment state table: the policy settings for every error id.
 *
 * Expands to a brace-enclosed list of NV_ERROR_CONT_STATE_TABLE initializers,
 * one per NV_ERROR_CONT_ERR_ID value and in the same order as the enum. Each
 * row holds the error id followed by two policy tuples: the first applies when
 * SMC memory partitioning is disabled, the second when it is enabled.
 *
 * Two further properties of each row are informational only (they are not part
 * of the initializer) and are recorded in the comments inside the table:
 *
 * Dynamic Page Blacklisting:
 *      Noted once per row.
 *
 * RC_Recovery_Type:
 *      Type of RC recovery handling in response to a given error, noted next
 *      to each policy tuple. Possible values:
 *
 * - NO_RC                                         : No RC Recovery performed. Subsequent 2nd interrupt by engine
 *                                                   consuming poison will determine the RC Recovery type.
 * - RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE          : RC Recovery compute channels of only the processes whose
 *                                                   channels were loaded on halted TSG when _E10_SM_POISON or
 *                                                   _E12A_CE_POISON_IN_USER_CHANNEL occurs.
 * - RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION : RC Recovery compute channels of only a specific MIG partition,
 *                                                   as that error can be attributed to a specific MIG partition.
 * - RC_ALL_COMPUTE_CHANNELS                       : RC Recovery ALL compute channels on a GPU that saw this interrupt.
 *                                                   (If MIG is enabled, then RC Recovery compute channels in all MIG partitions)
 * - CE_TSG_RESET                                  : Reset the halted CE Engine. Impacts the CE channels loaded on the TSG
 *                                                   when the CE halted. This is used in
 *                                                   NV_ERROR_CONT_ERR_ID_E12A_CE_POISON_IN_USER_CHANNEL &
 *                                                   NV_ERROR_CONT_ERR_ID_E12B_CE_POISON_IN_KERNEL_CHANNEL along with an
 *                                                   additional Compute Channels RC policy (either
 *                                                   RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE or RC_ALL_COMPUTE_CHANNELS).
 */
#define NV_ERROR_CONT_STATE_TABLE_SETTINGS \
{ \
    /* Dynamic Page Blacklisting: Yes(PMA but not subheap) */ \
    { NV_ERROR_CONT_ERR_ID_E01_FB_ECC_DED, \
      { NV_ERROR_CONT_SETTING_NO_ACTION,                     /* NO_RC */ \
        NV_ERROR_CONT_SETTING_NO_ACTION } },                 /* NO_RC */ \
    /* Dynamic Page Blacklisting: Yes(PMA but not subheap) */ \
    { NV_ERROR_CONT_ERR_ID_E02_FB_ECC_DED_IN_CBC_STORE, \
      { NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET,         /* RC_ALL_COMPUTE_CHANNELS */ \
        NV_ERROR_CONT_SETTING_CONTAINED_DRAIN_AND_RESET } }, /* RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E05_LTC_ECC_DSTG, \
      { NV_ERROR_CONT_SETTING_NO_ACTION,                     /* NO_RC */ \
        NV_ERROR_CONT_SETTING_NO_ACTION } },                 /* NO_RC */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E06_LTC_UNSUPPORTED_CLIENT_POISON, \
      { NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET,         /* RC_ALL_COMPUTE_CHANNELS */ \
        NV_ERROR_CONT_SETTING_CONTAINED_DRAIN_AND_RESET } }, /* RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E07_LTC_ECC_TSTG, \
      { NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET,         /* RC_ALL_COMPUTE_CHANNELS */ \
        NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET } },     /* RC_ALL_COMPUTE_CHANNELS */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E08_LTC_ECC_RSTG, \
      { NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET,         /* RC_ALL_COMPUTE_CHANNELS */ \
        NV_ERROR_CONT_SETTING_CONTAINED_DRAIN_AND_RESET } }, /* RC_ALL_COMPUTE_CHANNELS_IN_SPECIFIC_PARTITION */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E09_FBHUB_POISON, \
      { NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET,         /* RC_ALL_COMPUTE_CHANNELS */ \
        NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET } },     /* RC_ALL_COMPUTE_CHANNELS */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E10_SM_POISON, \
      { NV_ERROR_CONT_SETTING_CONTAINED_SMC_INFO,            /* RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ \
        NV_ERROR_CONT_SETTING_CONTAINED_SMC_INFO } },        /* RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E12A_CE_POISON_IN_USER_CHANNEL, \
      { NV_ERROR_CONT_SETTING_CONTAINED_SMC_INFO,            /* RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE + CE_TSG_RESET */ \
        NV_ERROR_CONT_SETTING_CONTAINED_SMC_INFO } },        /* RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE + CE_TSG_RESET */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E12B_CE_POISON_IN_KERNEL_CHANNEL, \
      { NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET,         /* RC_ALL_COMPUTE_CHANNELS + CE_TSG_RESET */ \
        NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET } },     /* RC_ALL_COMPUTE_CHANNELS + CE_TSG_RESET */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E13_MMU_POISON, \
      { NV_ERROR_CONT_SETTING_CONTAINED_NOTIFY_ONLY,         /* NO_RC */ \
        NV_ERROR_CONT_SETTING_CONTAINED_NOTIFY_ONLY } },     /* NO_RC */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E16_GCC_POISON, \
      { NV_ERROR_CONT_SETTING_CONTAINED_SMC_INFO,            /* RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ \
        NV_ERROR_CONT_SETTING_CONTAINED_SMC_INFO } },        /* RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E17_CTXSW_POISON, \
      { NV_ERROR_CONT_SETTING_CONTAINED_SMC_INFO,            /* RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ \
        NV_ERROR_CONT_SETTING_CONTAINED_SMC_INFO } },        /* RC_COMPUTE_CHANNELS_IN_ADDRESS_SPACE */ \
    /* Dynamic Page Blacklisting: No */ \
    { NV_ERROR_CONT_ERR_ID_E20_XALEP_POISON, \
      { NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET,         /* RC_ALL_COMPUTE_CHANNELS */ \
        NV_ERROR_CONT_SETTING_UNCONTAINED_GPU_RESET } }      /* RC_ALL_COMPUTE_CHANNELS */ \
}
222 
223 /* ------------------------ Function Prototypes ----------------------------- */
224 
225 #endif // _ERROR_CONT_H_
226