/* $NetBSD: amdgpu_umc_v6_1.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $ */

/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_umc_v6_1.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $");

#include "umc_v6_1.h"
#include "amdgpu_ras.h"
#include "amdgpu.h"

#include "rsmu/rsmu_0_0_2_offset.h"
#include "rsmu/rsmu_0_0_2_sh_mask.h"
#include "umc/umc_6_1_1_offset.h"
#include "umc/umc_6_1_1_sh_mask.h"
#include "umc/umc_6_1_2_offset.h"

#define UMC_6_INST_DIST	0x40000

/*
 * (addr / 256) * 8192: the high 26 bits of ErrorAddr
 * are the index of the 8KB block
 */
#define ADDR_OF_8KB_BLOCK(addr)			(((addr) & ~0xffULL) << 5)
/* channel index is the index of the 256B block */
#define ADDR_OF_256B_BLOCK(channel_index)	((channel_index) << 8)
/* offset within the 256B block */
#define OFFSET_IN_256B_BLOCK(addr)		((addr) & 0xffULL)
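/*
 * Worked example (hypothetical values, for illustration only): with
 * err_addr = 0x12345678 and channel_index = 11, the retired page
 * address assembled in umc_v6_1_query_error_address() below would be
 *
 *	ADDR_OF_8KB_BLOCK(0x12345678)  = 0x12345600 << 5 = 0x2468ac000
 *	ADDR_OF_256B_BLOCK(11)         = 11 << 8         = 0xb00
 *	OFFSET_IN_256B_BLOCK(0x12345678)                 = 0x78
 *
 * which OR together into the soc physical address 0x2468acb78.
 */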

#define LOOP_UMC_INST(umc_inst) for ((umc_inst) = 0; (umc_inst) < adev->umc.umc_inst_num; (umc_inst)++)
#define LOOP_UMC_CH_INST(ch_inst) for ((ch_inst) = 0; (ch_inst) < adev->umc.channel_inst_num; (ch_inst)++)
#define LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) LOOP_UMC_INST((umc_inst)) LOOP_UMC_CH_INST((ch_inst))

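/*
 * Map of (umc instance, channel instance) to physical memory channel
 * index.  The 8x4 table below is a permutation of 0..31: each of the
 * 32 physical channels appears exactly once.
 */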
const uint32_t
	umc_v6_1_channel_idx_tbl[UMC_V6_1_UMC_INSTANCE_NUM][UMC_V6_1_CHANNEL_INSTANCE_NUM] = {
		{2, 18, 11, 27},	{4, 20, 13, 29},
		{1, 17, 8, 24},		{7, 23, 14, 30},
		{10, 26, 3, 19},	{12, 28, 5, 21},
		{9, 25, 0, 16},		{15, 31, 6, 22}
};

static void umc_v6_1_enable_umc_index_mode(struct amdgpu_device *adev)
{
	WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN, 1);
}

static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
{
	WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN, 0);
}

static uint32_t umc_v6_1_get_umc_index_mode_state(struct amdgpu_device *adev)
{
	uint32_t rsmu_umc_index;

	rsmu_umc_index = RREG32_SOC15(RSMU, 0,
			mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);

	return REG_GET_FIELD(rsmu_umc_index,
			RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN);
}

static inline uint32_t get_umc_6_reg_offset(struct amdgpu_device *adev,
					    uint32_t umc_inst,
					    uint32_t ch_inst)
{
	return adev->umc.channel_offs * ch_inst + UMC_6_INST_DIST * umc_inst;
}
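
/*
 * Worked example (assuming channel_offs == 0x800, the per-channel
 * register stride on UMC 6.1.1 parts): umc_inst = 1, ch_inst = 2
 * gives 0x800 * 2 + 0x40000 * 1 = 0x41000.
 */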

static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	}

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_1_CE_CNT_INIT);
	/* clear the lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/* select the higher chip and check the error counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_1_CE_CNT_INIT);
	/* clear the higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/*
	 * check for SRAM correctable error;
	 * MCUMC_STATUS is a 64-bit register
	 */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}

static void umc_v6_1_query_uncorrectable_error_count(struct amdgpu_device *adev,
						     uint32_t umc_reg_offset,
						     unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	}

	/* check MCUMC_STATUS for an uncorrectable or deferred error */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}
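
/*
 * The query/init entry points below share a pattern: read the RSMU UMC
 * index mode state, disable index mode while walking every (umc_inst,
 * ch_inst) pair via get_umc_6_reg_offset(), and restore the previous
 * mode afterwards.
 */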

static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_query_correctable_error_count(adev,
						       umc_reg_offset,
						       &(err_data->ce_count));
		umc_v6_1_query_uncorrectable_error_count(adev,
							 umc_reg_offset,
							 &(err_data->ue_count));
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t lsb, mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
	struct eeprom_table_record *err_rec;
	uint32_t channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
		mc_umc_addrt0 =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
		mc_umc_addrt0 =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);
	}

	/* skip error address processing if the err_addr buffer was not allocated (-ENOMEM) */
	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		/* the lowest lsb bits should be ignored */
		lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
		err_addr &= ~((0x1ULL << lsb) - 1);
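		/*
		 * e.g. lsb == 8 masks off the low 8 bits above, rounding
		 * err_addr down to a 256-byte boundary; lsb == 0 leaves
		 * the address unchanged.
		 */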

		/* translate umc channel address to soc pa, 3 parts are included */
		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->cu = 0;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev,
					      uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
	}

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	/* set ce error interrupt type to APIC based interrupt */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrInt, 0x1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	/* set error count to initial value */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/* select the higher chip and set its error count to the initial value */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);
}

static void umc_v6_1_err_cnt_init(struct amdgpu_device *adev)
{
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_err_cnt_init_per_channel(adev, umc_reg_offset);
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

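/*
 * Entry points exported to the rest of the driver.  The GMC v9 setup
 * code is expected to point adev->umc.funcs at this table; the RAS core
 * then invokes the hooks roughly as in this minimal sketch (assuming a
 * caller that has already allocated err_data):
 *
 *	if (adev->umc.funcs && adev->umc.funcs->query_ras_error_count)
 *		adev->umc.funcs->query_ras_error_count(adev, err_data);
 */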
const struct amdgpu_umc_funcs umc_v6_1_funcs = {
	.err_cnt_init = umc_v6_1_err_cnt_init,
	.ras_late_init = amdgpu_umc_ras_late_init,
	.query_ras_error_count = umc_v6_1_query_ras_error_count,
	.query_ras_error_address = umc_v6_1_query_ras_error_address,
};