/*	$NetBSD: amdgpu_umc_v6_1.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $	*/

/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_umc_v6_1.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $");

#include "umc_v6_1.h"
#include "amdgpu_ras.h"
#include "amdgpu.h"

#include "rsmu/rsmu_0_0_2_offset.h"
#include "rsmu/rsmu_0_0_2_sh_mask.h"
#include "umc/umc_6_1_1_offset.h"
#include "umc/umc_6_1_1_sh_mask.h"
#include "umc/umc_6_1_2_offset.h"

#define UMC_6_INST_DIST			0x40000

/*
 * (addr / 256) * 8192, the higher 26 bits in ErrorAddr
 * are the index of the 8KB block
 */
#define ADDR_OF_8KB_BLOCK(addr)			(((addr) & ~0xffULL) << 5)
/* channel index is the index of 256B block */
#define ADDR_OF_256B_BLOCK(channel_index)	((channel_index) << 8)
/* offset in 256B block */
#define OFFSET_IN_256B_BLOCK(addr)		((addr) & 0xffULL)
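
/*
 * Illustrative example of how the three parts combine (values chosen
 * arbitrarily): for err_addr = 0x12345678 and channel_index = 5,
 *   ADDR_OF_8KB_BLOCK(0x12345678)    = 0x12345600 << 5 = 0x2468ac000
 *   ADDR_OF_256B_BLOCK(5)            = 0x500
 *   OFFSET_IN_256B_BLOCK(0x12345678) = 0x78
 * giving a SOC physical address of 0x2468ac578.
 */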

#define LOOP_UMC_INST(umc_inst) for ((umc_inst) = 0; (umc_inst) < adev->umc.umc_inst_num; (umc_inst)++)
#define LOOP_UMC_CH_INST(ch_inst) for ((ch_inst) = 0; (ch_inst) < adev->umc.channel_inst_num; (ch_inst)++)
#define LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) LOOP_UMC_INST((umc_inst)) LOOP_UMC_CH_INST((ch_inst))

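/*
 * Physical memory channel index for each (UMC instance, channel instance)
 * pair.  umc_v6_1_query_error_address() looks the index up through
 * adev->umc.channel_idx_tbl (which the SOC setup code points at this
 * table) when reconstructing the SOC physical address of a bad page.
 */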
const uint32_t
	umc_v6_1_channel_idx_tbl[UMC_V6_1_UMC_INSTANCE_NUM][UMC_V6_1_CHANNEL_INSTANCE_NUM] = {
		{2, 18, 11, 27},	{4, 20, 13, 29},
		{1, 17, 8, 24},		{7, 23, 14, 30},
		{10, 26, 3, 19},	{12, 28, 5, 21},
		{9, 25, 0, 16},		{15, 31, 6, 22}
};

static void umc_v6_1_enable_umc_index_mode(struct amdgpu_device *adev)
{
	WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN, 1);
}

static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
{
	WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN, 0);
}

static uint32_t umc_v6_1_get_umc_index_mode_state(struct amdgpu_device *adev)
{
	uint32_t rsmu_umc_index;

	rsmu_umc_index = RREG32_SOC15(RSMU, 0,
			mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);

	return REG_GET_FIELD(rsmu_umc_index,
			RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN);
}

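/*
 * Per-channel registers are laid out at a fixed stride:
 * adev->umc.channel_offs between channel instances and UMC_6_INST_DIST
 * between UMC instances.  The returned value is a register (dword)
 * offset; callers multiply it by 4 to form the byte offset passed to
 * the PCIE register accessors.
 */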
static inline uint32_t get_umc_6_reg_offset(struct amdgpu_device *adev,
					    uint32_t umc_inst,
					    uint32_t ch_inst)
{
	return adev->umc.channel_offs * ch_inst + UMC_6_INST_DIST * umc_inst;
}

static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	}

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
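	/*
	 * The hardware counters are preloaded with UMC_V6_1_CE_CNT_INIT by
	 * umc_v6_1_err_cnt_init_per_channel(), so the number of new
	 * correctable errors is the difference from that initial value.
	 */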
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_1_CE_CNT_INIT);
	/* clear the lower chip err count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/* select the higher chip and check the err counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_1_CE_CNT_INIT);
	/* clear the higher chip err count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/* check for SRAM correctable error,
	   MCUMC_STATUS is a 64-bit register */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}

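/*
 * An uncorrectable error is counted when MCUMC_STATUS is valid and any
 * of the Deferred, UECC, PCC, UC or TCC bits is set.
 */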
static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev,
						      uint32_t umc_reg_offset,
						      unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	}

	/* check the MCUMC_STATUS */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}

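/*
 * RAS callback: walk every UMC/channel instance and accumulate the
 * correctable and uncorrectable error counts into the ras_err_data
 * passed in by the RAS core.  RSMU index mode is temporarily disabled
 * so the per-channel registers can be addressed directly through the
 * offsets computed by get_umc_6_reg_offset().
 */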
static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data* err_data = (struct ras_err_data*)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_query_correctable_error_count(adev,
						       umc_reg_offset,
						       &(err_data->ce_count));
		umc_v6_1_querry_uncorrectable_error_count(adev,
							  umc_reg_offset,
							  &(err_data->ue_count));
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

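/*
 * For one channel: read the MCA status/address registers, strip the low
 * bits below the reported LSB, rebuild the SOC physical address from the
 * 8KB-block index, the physical channel index and the in-block offset,
 * and record the page in err_data->err_addr (only for uncorrectable
 * errors).  The status register is cleared before returning.
 */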
static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t lsb, mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
	struct eeprom_table_record *err_rec;
	uint32_t channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
		mc_umc_addrt0 =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
		mc_umc_addrt0 =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);
	}

	/* skip error address processing if the err_addr buffer allocation failed (-ENOMEM) */
	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		/* the lowest lsb bits should be ignored */
		lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
		err_addr &= ~((0x1ULL << lsb) - 1);

		/* translate the UMC channel address to a SOC physical address assembled from 3 parts */
		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->cu = 0;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data* err_data = (struct ras_err_data*)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

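/*
 * Arm correctable-error counting for one channel: select each chip
 * select in turn, set the CE error interrupt type, and preload the
 * error counter with UMC_V6_1_CE_CNT_INIT so later reads can report the
 * number of new errors as a delta from that value.
 */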
static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev,
					      uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
	}

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	/* set ce error interrupt type to APIC based interrupt */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrInt, 0x1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	/* set error count to initial value */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/* select the higher chip and check the err counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);
}

static void umc_v6_1_err_cnt_init(struct amdgpu_device *adev)
{
	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_err_cnt_init_per_channel(adev, umc_reg_offset);
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

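/* UMC 6.1 callbacks exported to the common amdgpu UMC RAS code */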
const struct amdgpu_umc_funcs umc_v6_1_funcs = {
	.err_cnt_init = umc_v6_1_err_cnt_init,
	.ras_late_init = amdgpu_umc_ras_late_init,
	.query_ras_error_count = umc_v6_1_query_ras_error_count,
	.query_ras_error_address = umc_v6_1_query_ras_error_address,
};