1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "common_nvswitch.h"
25 #include "error_nvswitch.h"
26 
27 #include "inforom/inforom_nvswitch.h"
28 
29 NvlStatus
nvswitch_inforom_nvlink_flush(struct nvswitch_device * device)30 nvswitch_inforom_nvlink_flush
31 (
32     struct nvswitch_device *device
33 )
34 {
35     NvlStatus status = NVL_SUCCESS;
36     struct inforom *pInforom = device->pInforom;
37     PINFOROM_NVLINK_STATE pNvlinkState;
38 
39     if (pInforom == NULL)
40     {
41         return -NVL_ERR_NOT_SUPPORTED;
42     }
43 
44     pNvlinkState = pInforom->pNvlinkState;
45 
46     if (pNvlinkState != NULL && pNvlinkState->bDirty)
47     {
48         status = nvswitch_inforom_write_object(device, "NVL",
49                                         pNvlinkState->pFmt, pNvlinkState->pNvl,
50                                         pNvlinkState->pPackedObject);
51         if (status != NVL_SUCCESS)
52         {
53             NVSWITCH_PRINT(device, ERROR,
54                 "Failed to flush NVL object to InfoROM, rc: %d\n", status);
55         }
56         else
57         {
58             pNvlinkState->bDirty = NV_FALSE;
59         }
60     }
61 
62     return status;
63 }
64 
65 static void
_inforom_nvlink_get_correctable_error_counts(nvswitch_device * device,NvU32 linkId,INFOROM_NVLINK_CORRECTABLE_ERROR_COUNTS * pErrorCounts)66 _inforom_nvlink_get_correctable_error_counts
67 (
68     nvswitch_device                         *device,
69     NvU32                                    linkId,
70     INFOROM_NVLINK_CORRECTABLE_ERROR_COUNTS *pErrorCounts
71 )
72 {
73     NvlStatus status;
74     NvU32 lane, idx;
75     NVSWITCH_NVLINK_GET_COUNTERS_PARAMS p = { 0 };
76 
77     ct_assert(NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE__SIZE <=
78               INFOROM_NVL_OBJECT_MAX_SUBLINK_WIDTH);
79 
80     nvswitch_os_memset(pErrorCounts, 0, sizeof(*pErrorCounts));
81 
82     p.linkId = linkId;
83     p.counterMask = NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_FLIT
84                   | NVSWITCH_NVLINK_COUNTER_DL_TX_ERR_REPLAY
85                   | NVSWITCH_NVLINK_COUNTER_DL_TX_ERR_RECOVERY
86                   | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_REPLAY
87                   | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L0
88                   | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L1
89                   | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L2
90                   | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L3
91                   | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L4
92                   | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L5
93                   | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L6
94                   | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L7
95                   | NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L7;
96 
97     status = device->hal.nvswitch_ctrl_get_counters(device, &p);
98     if (status != NVL_SUCCESS)
99     {
100         return;
101     }
102 
103     pErrorCounts->flitCrc =
104         p.nvlinkCounters[BIT_IDX_32(NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_FLIT)];
105 
106     pErrorCounts->txLinkReplay =
107         p.nvlinkCounters[BIT_IDX_32(NVSWITCH_NVLINK_COUNTER_DL_TX_ERR_REPLAY)];
108 
109     pErrorCounts->rxLinkReplay =
110         p.nvlinkCounters[BIT_IDX_32(NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_REPLAY)];
111 
112     pErrorCounts->linkRecovery =
113         p.nvlinkCounters[BIT_IDX_32(NVSWITCH_NVLINK_COUNTER_DL_TX_ERR_RECOVERY)];
114 
115     for (lane = 0; lane < NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE__SIZE; lane++)
116     {
117         idx = BIT_IDX_32(NVSWITCH_NVLINK_COUNTER_DL_RX_ERR_CRC_LANE_L(lane));
118         pErrorCounts->laneCrc[lane] = p.nvlinkCounters[idx];
119     }
120 }
121 
122 static void
_inforom_nvlink_update_correctable_error_rates(nvswitch_device * device,struct inforom * pInforom)123 _inforom_nvlink_update_correctable_error_rates
124 (
125     nvswitch_device  *device,
126     struct inforom   *pInforom
127 
128 )
129 {
130     PINFOROM_NVLINK_STATE pNvlinkState = pInforom->pNvlinkState;
131     NvU64                 enabledLinkMask;
132     NvU32                 linkId, publicId, localLinkIdx;
133     NvBool                bDirty = NV_FALSE;
134     NvBool                bDirtyTemp;
135     INFOROM_NVLINK_CORRECTABLE_ERROR_COUNTS errorCounts = { 0 };
136 
137     if (pNvlinkState == NULL)
138     {
139         return;
140     }
141 
142     enabledLinkMask = nvswitch_get_enabled_link_mask(device);
143 
144     FOR_EACH_INDEX_IN_MASK(64, linkId, enabledLinkMask)
145     {
146         if (device->hal.nvswitch_get_link_public_id(device, linkId, &publicId) != NVL_SUCCESS)
147         {
148             continue;
149         }
150 
151         if (device->hal.nvswitch_get_link_local_idx(device, linkId, &localLinkIdx) != NVL_SUCCESS)
152         {
153             continue;
154         }
155 
156         _inforom_nvlink_get_correctable_error_counts(device, linkId, &errorCounts);
157 
158         if (device->hal.nvswitch_inforom_nvl_update_link_correctable_error_info(device,
159                 pNvlinkState->pNvl, &pNvlinkState->correctableErrorRateState, linkId,
160                 publicId, localLinkIdx, &errorCounts, &bDirtyTemp) != NVL_SUCCESS)
161         {
162             continue;
163         }
164 
165         bDirty |= bDirtyTemp;
166     }
167     FOR_EACH_INDEX_IN_MASK_END;
168 
169     pNvlinkState->bDirty |= bDirty;
170 }
171 
/*
 * Periodic task body: refresh the correctable-error rate information.
 *
 * Does nothing when the device has no InfoROM or NVL state, or when
 * bCallbackPending is already set (i.e. a previous invocation has not yet
 * cleared it).
 */
static void _nvswitch_nvlink_1hz_callback
(
    nvswitch_device *device
)
{
    struct inforom *pInforom = device->pInforom;
    PINFOROM_NVLINK_STATE pState;

    if (pInforom == NULL)
    {
        return;
    }

    pState = pInforom->pNvlinkState;
    if (pState == NULL)
    {
        return;
    }

    if (pState->bCallbackPending)
    {
        return;
    }

    /* Bracket the update with the pending flag. */
    pState->bCallbackPending = NV_TRUE;
    _inforom_nvlink_update_correctable_error_rates(device, pInforom);
    pState->bCallbackPending = NV_FALSE;
}
189 
190 static void
_inforom_nvlink_start_correctable_error_recording(nvswitch_device * device,struct inforom * pInforom)191 _inforom_nvlink_start_correctable_error_recording
192 (
193     nvswitch_device *device,
194     struct inforom  *pInforom
195 )
196 {
197     PINFOROM_NVLINK_STATE pNvlinkState = pInforom->pNvlinkState;
198 
199     if (pNvlinkState == NULL)
200     {
201         return;
202     }
203 
204     if (pNvlinkState->bDisableCorrectableErrorLogging)
205     {
206 
207         NVSWITCH_PRINT(device, INFO,
208                 "%s: Correctable error recording disabled by regkey or unsupported\n",
209                 __FUNCTION__);
210         return;
211     }
212 
213     pNvlinkState->bCallbackPending = NV_FALSE;
214 
215     nvswitch_task_create(device, &_nvswitch_nvlink_1hz_callback,
216                          NVSWITCH_INTERVAL_1SEC_IN_NS, 0);
217 }
218 
219 NvlStatus
nvswitch_inforom_nvlink_load(nvswitch_device * device)220 nvswitch_inforom_nvlink_load
221 (
222     nvswitch_device *device
223 )
224 {
225     NvlStatus status;
226     NvU8 version = 0;
227     NvU8 subversion = 0;
228     INFOROM_NVLINK_STATE *pNvlinkState = NULL;
229     struct inforom *pInforom = device->pInforom;
230 
231     if (pInforom == NULL)
232     {
233         return -NVL_ERR_NOT_SUPPORTED;
234     }
235 
236     status = nvswitch_inforom_get_object_version_info(device, "NVL", &version,
237                                                     &subversion);
238     if (status != NVL_SUCCESS)
239     {
240         NVSWITCH_PRINT(device, WARN, "no NVL object found, rc:%d\n", status);
241         return NVL_SUCCESS;
242     }
243 
244     if (!INFOROM_OBJECT_SUBVERSION_SUPPORTS_NVSWITCH(subversion))
245     {
246         NVSWITCH_PRINT(device, WARN, "NVL v%u.%u not supported\n",
247                     version, subversion);
248         return -NVL_ERR_NOT_SUPPORTED;
249     }
250 
251     NVSWITCH_PRINT(device, INFO, "NVL v%u.%u found\n", version, subversion);
252 
253     pNvlinkState = nvswitch_os_malloc(sizeof(INFOROM_NVLINK_STATE));
254     if (pNvlinkState == NULL)
255     {
256         return -NVL_NO_MEM;
257     }
258     nvswitch_os_memset(pNvlinkState, 0, sizeof(INFOROM_NVLINK_STATE));
259 
260     pNvlinkState->bDirty = NV_FALSE;
261     pNvlinkState->bDisableFatalErrorLogging = NV_FALSE;
262     pNvlinkState->bDisableCorrectableErrorLogging = NV_TRUE;
263 
264     status = device->hal.nvswitch_inforom_nvl_setup_nvlink_state(device, pNvlinkState, version);
265     if (status != NVL_SUCCESS)
266     {
267         NVSWITCH_PRINT(device, ERROR, "Failed to set up NVL object, rc:%d\n", status);
268         goto nvswitch_inforom_nvlink_version_fail;
269     }
270 
271     status = nvswitch_inforom_read_object(device, "NVL", pNvlinkState->pFmt,
272                                         pNvlinkState->pPackedObject,
273                                         pNvlinkState->pNvl);
274     if (status != NVL_SUCCESS)
275     {
276         NVSWITCH_PRINT(device, ERROR, "Failed to read NVL object, rc:%d\n", status);
277         goto nvswitch_inforom_read_fail;
278     }
279 
280     status = nvswitch_inforom_add_object(pInforom, &pNvlinkState->pNvl->header);
281     if (status != NVL_SUCCESS)
282     {
283         NVSWITCH_PRINT(device, ERROR, "Failed to cache NVL object header, rc:%d\n",
284                     status);
285         goto nvswitch_inforom_read_fail;
286     }
287 
288     pInforom->pNvlinkState = pNvlinkState;
289 
290     _inforom_nvlink_start_correctable_error_recording(device, pInforom);
291 
292     return NVL_SUCCESS;
293 
294 nvswitch_inforom_read_fail:
295     nvswitch_os_free(pNvlinkState->pPackedObject);
296     nvswitch_os_free(pNvlinkState->pNvl);
297 nvswitch_inforom_nvlink_version_fail:
298     nvswitch_os_free(pNvlinkState);
299 
300     return status;
301 }
302 
303 void
nvswitch_inforom_nvlink_unload(nvswitch_device * device)304 nvswitch_inforom_nvlink_unload
305 (
306     nvswitch_device *device
307 )
308 {
309     INFOROM_NVLINK_STATE *pNvlinkState;
310     struct inforom *pInforom = device->pInforom;
311 
312     if (pInforom == NULL)
313     {
314         return;
315     }
316 
317     pNvlinkState = pInforom->pNvlinkState;
318     if (pNvlinkState == NULL)
319     {
320         return;
321     }
322 
323     if (nvswitch_inforom_nvlink_flush(device) != NVL_SUCCESS)
324     {
325         NVSWITCH_PRINT(device, ERROR, "Failed to flush NVL object on object unload\n");
326     }
327 
328     nvswitch_os_free(pNvlinkState->pPackedObject);
329     nvswitch_os_free(pNvlinkState->pNvl);
330     nvswitch_os_free(pNvlinkState);
331     pInforom->pNvlinkState = NULL;
332 }
333 
334 NvlStatus
nvswitch_inforom_nvlink_log_error_event(nvswitch_device * device,void * error_event)335 nvswitch_inforom_nvlink_log_error_event
336 (
337     nvswitch_device            *device,
338     void                       *error_event
339 )
340 {
341     NvlStatus status;
342     NvBool bDirty = NV_FALSE;
343     struct inforom *pInforom = device->pInforom;
344     INFOROM_NVLINK_STATE *pNvlinkState;
345 
346     if (pInforom == NULL)
347     {
348         return -NVL_ERR_NOT_SUPPORTED;
349     }
350 
351     pNvlinkState = pInforom->pNvlinkState;
352     if (pNvlinkState == NULL)
353     {
354         return -NVL_ERR_NOT_SUPPORTED;
355     }
356 
357     status = device->hal.nvswitch_inforom_nvl_log_error_event(device,
358                                                         pNvlinkState->pNvl,
359                                                         (INFOROM_NVLINK_ERROR_EVENT *)error_event,
360                                                         &bDirty);
361     if (status != NVL_SUCCESS)
362     {
363         NVSWITCH_PRINT(device, ERROR, "Failed to log error to inforom, rc:%d\n",
364                     status);
365     }
366 
367     pNvlinkState->bDirty |= bDirty;
368 
369     return status;
370 }
371 
372 NvlStatus
nvswitch_inforom_nvlink_get_max_correctable_error_rate(nvswitch_device * device,NVSWITCH_GET_NVLINK_MAX_CORRECTABLE_ERROR_RATES_PARAMS * params)373 nvswitch_inforom_nvlink_get_max_correctable_error_rate
374 (
375     nvswitch_device *device,
376     NVSWITCH_GET_NVLINK_MAX_CORRECTABLE_ERROR_RATES_PARAMS *params
377 )
378 {
379     struct inforom *pInforom = device->pInforom;
380 
381     if ((pInforom == NULL) || (pInforom->pNvlinkState == NULL))
382     {
383         return -NVL_ERR_NOT_SUPPORTED;
384     }
385 
386     return device->hal.nvswitch_inforom_nvl_get_max_correctable_error_rate(device, params);
387 }
388 
389 NvlStatus
nvswitch_inforom_nvlink_get_errors(nvswitch_device * device,NVSWITCH_GET_NVLINK_ERROR_COUNTS_PARAMS * params)390 nvswitch_inforom_nvlink_get_errors
391 (
392     nvswitch_device *device,
393     NVSWITCH_GET_NVLINK_ERROR_COUNTS_PARAMS *params
394 )
395 {
396     struct inforom *pInforom = device->pInforom;
397 
398     if ((pInforom == NULL) || (pInforom->pNvlinkState == NULL))
399     {
400         return -NVL_ERR_NOT_SUPPORTED;
401     }
402 
403     return device->hal.nvswitch_inforom_nvl_get_errors(device, params);
404 }
405 
nvswitch_inforom_nvlink_setL1Threshold(nvswitch_device * device,NvU32 word1,NvU32 word2)406 NvlStatus nvswitch_inforom_nvlink_setL1Threshold
407 (
408     nvswitch_device *device,
409     NvU32 word1,
410     NvU32 word2
411 )
412 {
413     struct inforom *pInforom = device->pInforom;
414     INFOROM_NVLINK_STATE *pNvlinkState;
415 
416     if (pInforom == NULL)
417     {
418         return -NVL_ERR_NOT_SUPPORTED;
419     }
420 
421     pNvlinkState = pInforom->pNvlinkState;
422     if (pNvlinkState == NULL)
423     {
424         return -NVL_ERR_NOT_SUPPORTED;
425     }
426 
427     return device->hal.nvswitch_inforom_nvl_setL1Threshold(device,
428                                                         pNvlinkState->pNvl,
429                                                         word1,
430                                                         word2);
431 }
432 
nvswitch_inforom_nvlink_getL1Threshold(nvswitch_device * device,NvU32 * word1,NvU32 * word2)433 NvlStatus nvswitch_inforom_nvlink_getL1Threshold
434 (
435     nvswitch_device *device,
436     NvU32 *word1,
437     NvU32 *word2
438 )
439 {
440     struct inforom *pInforom = device->pInforom;
441     INFOROM_NVLINK_STATE *pNvlinkState;
442 
443     if (pInforom == NULL)
444     {
445         return -NVL_ERR_NOT_SUPPORTED;
446     }
447 
448     pNvlinkState = pInforom->pNvlinkState;
449     if (pNvlinkState == NULL)
450     {
451         return -NVL_ERR_NOT_SUPPORTED;
452     }
453 
454     return device->hal.nvswitch_inforom_nvl_getL1Threshold(device,
455                                                         pNvlinkState->pNvl,
456                                                         word1,
457                                                         word2);
458 }
459 
460