1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 * SPDX-License-Identifier: MIT
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "common_nvswitch.h"
25 #include "error_nvswitch.h"
26
27 #include "inforom/inforom_nvswitch.h"
28
29 NvlStatus
nvswitch_inforom_nvlink_flush(struct nvswitch_device * device)30 nvswitch_inforom_nvlink_flush
31 (
32 struct nvswitch_device *device
33 )
34 {
35 NvlStatus status = NVL_SUCCESS;
36 struct inforom *pInforom = device->pInforom;
37 PINFOROM_NVLINK_STATE pNvlinkState;
38
39 if (pInforom == NULL)
40 {
41 return -NVL_ERR_NOT_SUPPORTED;
42 }
43
44 pNvlinkState = pInforom->pNvlinkState;
45
46 if (pNvlinkState != NULL && pNvlinkState->bDirty)
47 {
48 status = nvswitch_inforom_write_object(device, "NVL",
49 pNvlinkState->pFmt, pNvlinkState->pNvl,
50 pNvlinkState->pPackedObject);
51 if (status != NVL_SUCCESS)
52 {
53 NVSWITCH_PRINT(device, ERROR,
54 "Failed to flush NVL object to InfoROM, rc: %d\n", status);
55 }
56 else
57 {
58 pNvlinkState->bDirty = NV_FALSE;
59 }
60 }
61
62 return status;
63 }
64
65 static void
_inforom_nvlink_get_correctable_error_counts(nvswitch_device * device,NvU32 linkId,INFOROM_NVLINK_CORRECTABLE_ERROR_COUNTS * pErrorCounts)66 _inforom_nvlink_get_correctable_error_counts
67 (
68 nvswitch_device *device,
69 NvU32 linkId,
70 INFOROM_NVLINK_CORRECTABLE_ERROR_COUNTS *pErrorCounts
71 )
72 {
73 NvlStatus status;
74 NvU32 lane, idx;
75 NVSWITCH_NVLINK_GET_COUNTERS_PARAMS p = { 0 };
76
77 ct_assert(NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE__SIZE <=
78 INFOROM_NVL_OBJECT_MAX_SUBLINK_WIDTH);
79
80 nvswitch_os_memset(pErrorCounts, 0, sizeof(*pErrorCounts));
81
82 p.linkId = linkId;
83 p.counterMask = NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_FLIT
84 | NVSWITCH_NVLINK_COUNTER_DL_TX_ERR_REPLAY
85 | NVSWITCH_NVLINK_COUNTER_DL_TX_ERR_RECOVERY
86 | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_REPLAY
87 | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L0
88 | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L1
89 | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L2
90 | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L3
91 | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L4
92 | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L5
93 | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L6
94 | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L7
95 | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L7;
96
97 status = device->hal.nvswitch_ctrl_get_counters(device, &p);
98 if (status != NVL_SUCCESS)
99 {
100 return;
101 }
102
103 pErrorCounts->flitCrc =
104 p.nvlinkCounters[BIT_IDX_32(NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_FLIT)];
105
106 pErrorCounts->txLinkReplay =
107 p.nvlinkCounters[BIT_IDX_32(NVSWITCH_NVLINK_COUNTER_DL_TX_ERR_REPLAY)];
108
109 pErrorCounts->rxLinkReplay =
110 p.nvlinkCounters[BIT_IDX_32(NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_REPLAY)];
111
112 pErrorCounts->linkRecovery =
113 p.nvlinkCounters[BIT_IDX_32(NVSWITCH_NVLINK_COUNTER_DL_TX_ERR_RECOVERY)];
114
115 for (lane = 0; lane < NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE__SIZE; lane++)
116 {
117 idx = BIT_IDX_32(NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L(lane));
118 pErrorCounts->laneCrc[lane] = p.nvlinkCounters[idx];
119 }
120 }
121
122 static void
_inforom_nvlink_update_correctable_error_rates(nvswitch_device * device,struct inforom * pInforom)123 _inforom_nvlink_update_correctable_error_rates
124 (
125 nvswitch_device *device,
126 struct inforom *pInforom
127
128 )
129 {
130 PINFOROM_NVLINK_STATE pNvlinkState = pInforom->pNvlinkState;
131 NvU64 enabledLinkMask;
132 NvU32 linkId, publicId, localLinkIdx;
133 NvBool bDirty = NV_FALSE;
134 NvBool bDirtyTemp;
135 INFOROM_NVLINK_CORRECTABLE_ERROR_COUNTS errorCounts = { 0 };
136
137 if (pNvlinkState == NULL)
138 {
139 return;
140 }
141
142 enabledLinkMask = nvswitch_get_enabled_link_mask(device);
143
144 FOR_EACH_INDEX_IN_MASK(64, linkId, enabledLinkMask)
145 {
146 if (device->hal.nvswitch_get_link_public_id(device, linkId, &publicId) != NVL_SUCCESS)
147 {
148 continue;
149 }
150
151 if (device->hal.nvswitch_get_link_local_idx(device, linkId, &localLinkIdx) != NVL_SUCCESS)
152 {
153 continue;
154 }
155
156 _inforom_nvlink_get_correctable_error_counts(device, linkId, &errorCounts);
157
158 if (device->hal.nvswitch_inforom_nvl_update_link_correctable_error_info(device,
159 pNvlinkState->pNvl, &pNvlinkState->correctableErrorRateState, linkId,
160 publicId, localLinkIdx, &errorCounts, &bDirtyTemp) != NVL_SUCCESS)
161 {
162 continue;
163 }
164
165 bDirty |= bDirtyTemp;
166 }
167 FOR_EACH_INDEX_IN_MASK_END;
168
169 pNvlinkState->bDirty |= bDirty;
170 }
171
/*
 * Periodic task body: update the correctable error rates for all links.
 *
 * Returns without doing anything when there is no InfoROM or NVL state, or
 * when bCallbackPending shows a previous invocation has not finished yet.
 */
static void _nvswitch_nvlink_1hz_callback
(
    nvswitch_device *device
)
{
    struct inforom *pInforom = device->pInforom;
    PINFOROM_NVLINK_STATE pState;

    if (pInforom == NULL)
    {
        return;
    }

    pState = pInforom->pNvlinkState;
    if (pState == NULL)
    {
        return;
    }

    if (pState->bCallbackPending)
    {
        return;
    }

    /* Flag the update as in progress for its whole duration. */
    pState->bCallbackPending = NV_TRUE;
    _inforom_nvlink_update_correctable_error_rates(device, pInforom);
    pState->bCallbackPending = NV_FALSE;
}
189
190 static void
_inforom_nvlink_start_correctable_error_recording(nvswitch_device * device,struct inforom * pInforom)191 _inforom_nvlink_start_correctable_error_recording
192 (
193 nvswitch_device *device,
194 struct inforom *pInforom
195 )
196 {
197 PINFOROM_NVLINK_STATE pNvlinkState = pInforom->pNvlinkState;
198
199 if (pNvlinkState == NULL)
200 {
201 return;
202 }
203
204 if (pNvlinkState->bDisableCorrectableErrorLogging)
205 {
206
207 NVSWITCH_PRINT(device, INFO,
208 "%s: Correctable error recording disabled by regkey or unsupported\n",
209 __FUNCTION__);
210 return;
211 }
212
213 pNvlinkState->bCallbackPending = NV_FALSE;
214
215 nvswitch_task_create(device, &_nvswitch_nvlink_1hz_callback,
216 NVSWITCH_INTERVAL_1SEC_IN_NS, 0);
217 }
218
219 NvlStatus
nvswitch_inforom_nvlink_load(nvswitch_device * device)220 nvswitch_inforom_nvlink_load
221 (
222 nvswitch_device *device
223 )
224 {
225 NvlStatus status;
226 NvU8 version = 0;
227 NvU8 subversion = 0;
228 INFOROM_NVLINK_STATE *pNvlinkState = NULL;
229 struct inforom *pInforom = device->pInforom;
230
231 if (pInforom == NULL)
232 {
233 return -NVL_ERR_NOT_SUPPORTED;
234 }
235
236 status = nvswitch_inforom_get_object_version_info(device, "NVL", &version,
237 &subversion);
238 if (status != NVL_SUCCESS)
239 {
240 NVSWITCH_PRINT(device, WARN, "no NVL object found, rc:%d\n", status);
241 return NVL_SUCCESS;
242 }
243
244 if (!INFOROM_OBJECT_SUBVERSION_SUPPORTS_NVSWITCH(subversion))
245 {
246 NVSWITCH_PRINT(device, WARN, "NVL v%u.%u not supported\n",
247 version, subversion);
248 return -NVL_ERR_NOT_SUPPORTED;
249 }
250
251 NVSWITCH_PRINT(device, INFO, "NVL v%u.%u found\n", version, subversion);
252
253 pNvlinkState = nvswitch_os_malloc(sizeof(INFOROM_NVLINK_STATE));
254 if (pNvlinkState == NULL)
255 {
256 return -NVL_NO_MEM;
257 }
258 nvswitch_os_memset(pNvlinkState, 0, sizeof(INFOROM_NVLINK_STATE));
259
260 pNvlinkState->bDirty = NV_FALSE;
261 pNvlinkState->bDisableFatalErrorLogging = NV_FALSE;
262 pNvlinkState->bDisableCorrectableErrorLogging = NV_TRUE;
263
264 status = device->hal.nvswitch_inforom_nvl_setup_nvlink_state(device, pNvlinkState, version);
265 if (status != NVL_SUCCESS)
266 {
267 NVSWITCH_PRINT(device, ERROR, "Failed to set up NVL object, rc:%d\n", status);
268 goto nvswitch_inforom_nvlink_version_fail;
269 }
270
271 status = nvswitch_inforom_read_object(device, "NVL", pNvlinkState->pFmt,
272 pNvlinkState->pPackedObject,
273 pNvlinkState->pNvl);
274 if (status != NVL_SUCCESS)
275 {
276 NVSWITCH_PRINT(device, ERROR, "Failed to read NVL object, rc:%d\n", status);
277 goto nvswitch_inforom_read_fail;
278 }
279
280 status = nvswitch_inforom_add_object(pInforom, &pNvlinkState->pNvl->header);
281 if (status != NVL_SUCCESS)
282 {
283 NVSWITCH_PRINT(device, ERROR, "Failed to cache NVL object header, rc:%d\n",
284 status);
285 goto nvswitch_inforom_read_fail;
286 }
287
288 pInforom->pNvlinkState = pNvlinkState;
289
290 _inforom_nvlink_start_correctable_error_recording(device, pInforom);
291
292 return NVL_SUCCESS;
293
294 nvswitch_inforom_read_fail:
295 nvswitch_os_free(pNvlinkState->pPackedObject);
296 nvswitch_os_free(pNvlinkState->pNvl);
297 nvswitch_inforom_nvlink_version_fail:
298 nvswitch_os_free(pNvlinkState);
299
300 return status;
301 }
302
303 void
nvswitch_inforom_nvlink_unload(nvswitch_device * device)304 nvswitch_inforom_nvlink_unload
305 (
306 nvswitch_device *device
307 )
308 {
309 INFOROM_NVLINK_STATE *pNvlinkState;
310 struct inforom *pInforom = device->pInforom;
311
312 if (pInforom == NULL)
313 {
314 return;
315 }
316
317 pNvlinkState = pInforom->pNvlinkState;
318 if (pNvlinkState == NULL)
319 {
320 return;
321 }
322
323 if (nvswitch_inforom_nvlink_flush(device) != NVL_SUCCESS)
324 {
325 NVSWITCH_PRINT(device, ERROR, "Failed to flush NVL object on object unload\n");
326 }
327
328 nvswitch_os_free(pNvlinkState->pPackedObject);
329 nvswitch_os_free(pNvlinkState->pNvl);
330 nvswitch_os_free(pNvlinkState);
331 pInforom->pNvlinkState = NULL;
332 }
333
334 NvlStatus
nvswitch_inforom_nvlink_log_error_event(nvswitch_device * device,void * error_event)335 nvswitch_inforom_nvlink_log_error_event
336 (
337 nvswitch_device *device,
338 void *error_event
339 )
340 {
341 NvlStatus status;
342 NvBool bDirty = NV_FALSE;
343 struct inforom *pInforom = device->pInforom;
344 INFOROM_NVLINK_STATE *pNvlinkState;
345
346 if (pInforom == NULL)
347 {
348 return -NVL_ERR_NOT_SUPPORTED;
349 }
350
351 pNvlinkState = pInforom->pNvlinkState;
352 if (pNvlinkState == NULL)
353 {
354 return -NVL_ERR_NOT_SUPPORTED;
355 }
356
357 status = device->hal.nvswitch_inforom_nvl_log_error_event(device,
358 pNvlinkState->pNvl,
359 (INFOROM_NVLINK_ERROR_EVENT *)error_event,
360 &bDirty);
361 if (status != NVL_SUCCESS)
362 {
363 NVSWITCH_PRINT(device, ERROR, "Failed to log error to inforom, rc:%d\n",
364 status);
365 }
366
367 pNvlinkState->bDirty |= bDirty;
368
369 return status;
370 }
371
372 NvlStatus
nvswitch_inforom_nvlink_get_max_correctable_error_rate(nvswitch_device * device,NVSWITCH_GET_NVLINK_MAX_CORRECTABLE_ERROR_RATES_PARAMS * params)373 nvswitch_inforom_nvlink_get_max_correctable_error_rate
374 (
375 nvswitch_device *device,
376 NVSWITCH_GET_NVLINK_MAX_CORRECTABLE_ERROR_RATES_PARAMS *params
377 )
378 {
379 struct inforom *pInforom = device->pInforom;
380
381 if ((pInforom == NULL) || (pInforom->pNvlinkState == NULL))
382 {
383 return -NVL_ERR_NOT_SUPPORTED;
384 }
385
386 return device->hal.nvswitch_inforom_nvl_get_max_correctable_error_rate(device, params);
387 }
388
389 NvlStatus
nvswitch_inforom_nvlink_get_errors(nvswitch_device * device,NVSWITCH_GET_NVLINK_ERROR_COUNTS_PARAMS * params)390 nvswitch_inforom_nvlink_get_errors
391 (
392 nvswitch_device *device,
393 NVSWITCH_GET_NVLINK_ERROR_COUNTS_PARAMS *params
394 )
395 {
396 struct inforom *pInforom = device->pInforom;
397
398 if ((pInforom == NULL) || (pInforom->pNvlinkState == NULL))
399 {
400 return -NVL_ERR_NOT_SUPPORTED;
401 }
402
403 return device->hal.nvswitch_inforom_nvl_get_errors(device, params);
404 }
405
nvswitch_inforom_nvlink_setL1Threshold(nvswitch_device * device,NvU32 word1,NvU32 word2)406 NvlStatus nvswitch_inforom_nvlink_setL1Threshold
407 (
408 nvswitch_device *device,
409 NvU32 word1,
410 NvU32 word2
411 )
412 {
413 struct inforom *pInforom = device->pInforom;
414 INFOROM_NVLINK_STATE *pNvlinkState;
415
416 if (pInforom == NULL)
417 {
418 return -NVL_ERR_NOT_SUPPORTED;
419 }
420
421 pNvlinkState = pInforom->pNvlinkState;
422 if (pNvlinkState == NULL)
423 {
424 return -NVL_ERR_NOT_SUPPORTED;
425 }
426
427 return device->hal.nvswitch_inforom_nvl_setL1Threshold(device,
428 pNvlinkState->pNvl,
429 word1,
430 word2);
431 }
432
nvswitch_inforom_nvlink_getL1Threshold(nvswitch_device * device,NvU32 * word1,NvU32 * word2)433 NvlStatus nvswitch_inforom_nvlink_getL1Threshold
434 (
435 nvswitch_device *device,
436 NvU32 *word1,
437 NvU32 *word2
438 )
439 {
440 struct inforom *pInforom = device->pInforom;
441 INFOROM_NVLINK_STATE *pNvlinkState;
442
443 if (pInforom == NULL)
444 {
445 return -NVL_ERR_NOT_SUPPORTED;
446 }
447
448 pNvlinkState = pInforom->pNvlinkState;
449 if (pNvlinkState == NULL)
450 {
451 return -NVL_ERR_NOT_SUPPORTED;
452 }
453
454 return device->hal.nvswitch_inforom_nvl_getL1Threshold(device,
455 pNvlinkState->pNvl,
456 word1,
457 word2);
458 }
459
460