1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2017-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "nvlink.h"
25 #include "nvVer.h"
26 #include "nvlink_os.h"
27 #include "nvlink_lib_ctrl.h"
28 #include "../nvlink_ctx.h"
29 #include "../nvlink_helper.h"
30 #include "nvlink_lock.h"
31 #include "nvctassert.h"
32 
//
// Return a typed pointer to the IOCTL payload, or NULL when the supplied
// user buffer is too small to hold the request type. Fully parenthesized
// (condition, result arms, and whole expansion) so the macro is safe to
// embed in any surrounding expression.
//
#define NVLINK_IOC_GET_BUF(ctrlParams, type) \
    (((ctrlParams)->size >= sizeof(type)) ? ((type *)(ctrlParams)->buf) : NULL)
34 
35 /**
36  * List of static functions
37  */
38 static NvlStatus nvlink_lib_ioctl_ctrl_helper(nvlink_ioctrl_params *);
39 static NvlStatus nvlink_lib_ctrl_prologue(nvlink_ioctrl_params *);
40 static NvlStatus nvlink_lib_ctrl_check_version(nvlink_check_version *);
41 static NvlStatus nvlink_lib_ctrl_set_node_id(nvlink_set_node_id *);
42 static NvlStatus nvlink_lib_ctrl_all_links(nvlink_ioctrl_params *);
43 static NvlStatus nvlink_lib_ctrl_device_link_init_status(nvlink_device_link_init_status *);
44 static NvlStatus nvlink_lib_ctrl_device_write_discovery_tokens(nvlink_device_write_discovery_tokens *);
45 static NvlStatus nvlink_lib_ctrl_device_read_discovery_tokens(nvlink_device_read_discovery_tokens *);
46 static NvlStatus nvlink_lib_ctrl_device_read_sids(nvlink_device_read_sids *);
47 static NvlStatus nvlink_lib_ctrl_discover_intranode_conns(nvlink_discover_intranode_conns *);
48 static NvlStatus nvlink_lib_ctrl_device_get_intranode_conns(nvlink_device_get_intranode_conns *);
49 static NvlStatus nvlink_lib_ctrl_add_internode_conn(nvlink_add_internode_conn *);
50 static NvlStatus nvlink_lib_ctrl_remove_internode_conn(nvlink_remove_internode_conn *);
51 static NvlStatus nvlink_lib_ctrl_train_intranode_conn(nvlink_train_intranode_conn *);
52 static NvlStatus nvlink_lib_ctrl_train_intranode_conns_parallel(nvlink_train_intranode_conns_parallel *);
53 static NvlStatus nvlink_lib_ctrl_train_internode_conn_link(nvlink_train_internode_conn_link *);
54 static NvlStatus nvlink_lib_ctrl_train_internode_conn_sublink(nvlink_train_internode_conn_sublink *);
55 static NvlStatus nvlink_lib_ctrl_train_internode_links_initoptimize(nvlink_train_internode_links_initoptimize *);
56 static NvlStatus nvlink_lib_ctrl_train_internode_links_post_initoptimize(nvlink_train_internode_links_post_initoptimize *);
57 static NvlStatus nvlink_lib_ctrl_train_internode_conns_parallel(nvlink_train_internode_conns_parallel *);
58 static NvlStatus nvlink_lib_ctrl_get_devices_info(nvlink_get_devices_info *);
59 static NvlStatus nvlink_lib_ctrl_acquire_capability(nvlink_ioctrl_params *, nvlink_acquire_capability *);
60 static NvlStatus nvlink_lib_ctrl_get_link_state(nvlink_get_link_state *);
61 static NvlStatus nvlink_lib_ctrl_get_device_link_states(nvlink_get_device_link_states *);
62 
63 /**
64  * Entry point for IOCTLs into the NVLink core library
65  *
66  * @param[in]  ctrlParams  IOCTL params
67  *
68  * return NvlStatus
69  */
70 NvlStatus
nvlink_lib_ioctl_ctrl(nvlink_ioctrl_params * ctrlParams)71 nvlink_lib_ioctl_ctrl
72 (
73     nvlink_ioctrl_params *ctrlParams
74 )
75 {
76     NvlStatus status = NVL_SUCCESS;
77 
78     status = nvlink_lib_ioctl_ctrl_helper(ctrlParams);
79 
80     return status;
81 }
82 
83 /**
84  * Helper function for routing the IOCTL to the respective handlers
85  *
86  * Note: The handlers acquire the required core library locks before
87  *       calling the core library code
88  *
89  * @param[in]  ctrlParams  IOCTL params
90  *
91  * return NvlStatus
92  */
93 static NvlStatus
nvlink_lib_ioctl_ctrl_helper(nvlink_ioctrl_params * ctrlParams)94 nvlink_lib_ioctl_ctrl_helper
95 (
96     nvlink_ioctrl_params *ctrlParams
97 )
98 {
99     NvlStatus status;
100 
101     status = nvlink_lib_ctrl_prologue(ctrlParams);
102     if (status != NVL_SUCCESS)
103     {
104         return status;
105     }
106 
107     switch (ctrlParams->cmd)
108     {
109         case CTRL_NVLINK_CHECK_VERSION:
110         {
111             nvlink_check_version *iocReq;
112 
113             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_check_version);
114             if (!iocReq)
115             {
116                 return NVL_BAD_ARGS;
117             }
118 
119             iocReq->status = nvlink_lib_ctrl_check_version(iocReq);
120             break;
121         }
122 
123         case CTRL_NVLINK_SET_NODE_ID:
124         {
125             nvlink_set_node_id *iocReq;
126 
127             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_set_node_id);
128             if (!iocReq)
129             {
130                 return NVL_BAD_ARGS;
131             }
132 
133             iocReq->status = nvlink_lib_ctrl_set_node_id(iocReq);
134             break;
135         }
136 
137         //
138         // The following commands operate on all the links registered in the
139         // core library. Hence, clubbing them into a group so, we don't have
140         // to duplicate the lock acquire/release for each of them
141         //
142         case CTRL_NVLINK_INITPHASE1:
143         case CTRL_NVLINK_RX_INIT_TERM:
144         case CTRL_NVLINK_SET_RX_DETECT:
145         case CTRL_NVLINK_GET_RX_DETECT:
146         case CTRL_NVLINK_SET_TX_COMMON_MODE:
147         case CTRL_NVLINK_CALIBRATE:
148         case CTRL_NVLINK_ENABLE_DATA:
149         case CTRL_NVLINK_LINK_INIT_ASYNC:
150         case CTRL_NVLINK_INITNEGOTIATE:
151         case CTRL_NVLINK_INITPHASE5:
152         {
153             nvlink_lib_ctrl_all_links(ctrlParams);
154             break;
155         }
156 
157         case CTRL_NVLINK_DEVICE_LINK_INIT_STATUS:
158         {
159             nvlink_device_link_init_status *iocReq;
160 
161             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_device_link_init_status);
162             if (!iocReq)
163             {
164                 return NVL_BAD_ARGS;
165             }
166 
167             iocReq->status = nvlink_lib_ctrl_device_link_init_status(iocReq);
168             break;
169         }
170 
171         case CTRL_NVLINK_DEVICE_WRITE_DISCOVERY_TOKENS:
172         {
173             nvlink_device_write_discovery_tokens *iocReq;
174 
175             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_device_write_discovery_tokens);
176             if (!iocReq)
177             {
178                 return NVL_BAD_ARGS;
179             }
180 
181             iocReq->status = nvlink_lib_ctrl_device_write_discovery_tokens(iocReq);
182             break;
183         }
184 
185         case CTRL_NVLINK_DEVICE_READ_DISCOVERY_TOKENS:
186         {
187             nvlink_device_read_discovery_tokens *iocReq;
188 
189             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_device_read_discovery_tokens);
190             if (!iocReq)
191             {
192                 return NVL_BAD_ARGS;
193             }
194 
195             iocReq->status = nvlink_lib_ctrl_device_read_discovery_tokens(iocReq);
196             break;
197         }
198 
199         case CTRL_NVLINK_DEVICE_READ_SIDS:
200         {
201             nvlink_device_read_sids *iocReq;
202 
203             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_device_read_sids);
204             if (!iocReq)
205             {
206                 return NVL_BAD_ARGS;
207             }
208 
209             iocReq->status = nvlink_lib_ctrl_device_read_sids(iocReq);
210             break;
211         }
212 
213         case CTRL_NVLINK_DISCOVER_INTRANODE_CONNS:
214         {
215             nvlink_discover_intranode_conns *iocReq;
216 
217             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_discover_intranode_conns);
218             if (!iocReq)
219             {
220                 return NVL_BAD_ARGS;
221             }
222 
223             iocReq->status = nvlink_lib_ctrl_discover_intranode_conns(iocReq);
224             break;
225         }
226 
227         case CTRL_NVLINK_DEVICE_GET_INTRANODE_CONNS:
228         {
229             nvlink_device_get_intranode_conns *iocReq;
230 
231             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_device_get_intranode_conns);
232             if (!iocReq)
233             {
234                 return NVL_BAD_ARGS;
235             }
236 
237             iocReq->status = nvlink_lib_ctrl_device_get_intranode_conns(iocReq);
238             break;
239         }
240 
241         case CTRL_NVLINK_ADD_INTERNODE_CONN:
242         {
243             nvlink_add_internode_conn *iocReq;
244 
245             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_add_internode_conn);
246             if (!iocReq)
247             {
248                 return NVL_BAD_ARGS;
249             }
250 
251             iocReq->status = nvlink_lib_ctrl_add_internode_conn(iocReq);
252             break;
253         }
254 
255         case CTRL_NVLINK_REMOVE_INTERNODE_CONN:
256         {
257             nvlink_remove_internode_conn *iocReq;
258 
259             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_remove_internode_conn);
260             if (!iocReq)
261             {
262                 return NVL_BAD_ARGS;
263             }
264 
265             iocReq->status = nvlink_lib_ctrl_remove_internode_conn(iocReq);
266             break;
267         }
268 
269         case CTRL_NVLINK_TRAIN_INTRANODE_CONN:
270         {
271             nvlink_train_intranode_conn *iocReq;
272 
273             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_intranode_conn);
274             if (!iocReq)
275             {
276                 return NVL_BAD_ARGS;
277             }
278 
279             iocReq->status = nvlink_lib_ctrl_train_intranode_conn(iocReq);
280             break;
281         }
282 
283         case CTRL_NVLINK_TRAIN_INTRANODE_CONNS_PARALLEL:
284         {
285             nvlink_train_intranode_conns_parallel *iocReq;
286 
287             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_intranode_conns_parallel);
288             if (!iocReq)
289             {
290                 return NVL_BAD_ARGS;
291             }
292 
293             iocReq->status = nvlink_lib_ctrl_train_intranode_conns_parallel(iocReq);
294             break;
295         }
296 
297         case CTRL_NVLINK_TRAIN_INTERNODE_CONN_LINK:
298         {
299             nvlink_train_internode_conn_link *iocReq;
300 
301             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_internode_conn_link);
302             if (!iocReq)
303             {
304                 return NVL_BAD_ARGS;
305             }
306 
307             iocReq->status = nvlink_lib_ctrl_train_internode_conn_link(iocReq);
308             break;
309         }
310 
311         case CTRL_NVLINK_TRAIN_INTERNODE_CONN_SUBLINK:
312         {
313             nvlink_train_internode_conn_sublink *iocReq;
314 
315             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_internode_conn_sublink);
316             if (!iocReq)
317             {
318                 return NVL_BAD_ARGS;
319             }
320 
321             iocReq->status = nvlink_lib_ctrl_train_internode_conn_sublink(iocReq);
322             break;
323         }
324 
325         case CTRL_NVLINK_TRAIN_INTERNODE_LINKS_INITOPTIMIZE:
326         {
327             nvlink_train_internode_links_initoptimize *iocReq;
328 
329             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_internode_links_initoptimize);
330             if (!iocReq)
331             {
332                 return NVL_BAD_ARGS;
333             }
334             iocReq->status = nvlink_lib_ctrl_train_internode_links_initoptimize(iocReq);
335             break;
336         }
337 
338         case CTRL_NVLINK_TRAIN_INTERNODE_LINKS_POST_INITOPTIMIZE:
339         {
340             nvlink_train_internode_links_post_initoptimize *iocReq;
341 
342             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_internode_links_post_initoptimize);
343             if (!iocReq)
344             {
345                 return NVL_BAD_ARGS;
346             }
347             iocReq->status = nvlink_lib_ctrl_train_internode_links_post_initoptimize(iocReq);
348             break;
349         }
350 
351         case CTRL_NVLINK_TRAIN_INTERNODE_CONNS_PARALLEL:
352         {
353             nvlink_train_internode_conns_parallel *iocReq;
354 
355             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_internode_conns_parallel);
356             if (!iocReq)
357             {
358                 return NVL_BAD_ARGS;
359             }
360 
361             iocReq->status = nvlink_lib_ctrl_train_internode_conns_parallel(iocReq);
362             break;
363         }
364 
365         case CTRL_NVLINK_GET_DEVICES_INFO:
366         {
367             nvlink_get_devices_info *iocReq;
368 
369             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_get_devices_info);
370             if (!iocReq)
371             {
372                 return NVL_BAD_ARGS;
373             }
374 
375             iocReq->status = nvlink_lib_ctrl_get_devices_info(iocReq);
376             break;
377         }
378 
379         case CTRL_NVLINK_ACQUIRE_CAPABILITY:
380         {
381             nvlink_acquire_capability *iocReq;
382 
383             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_acquire_capability);
384             if (!iocReq)
385             {
386                 return NVL_BAD_ARGS;
387             }
388 
389             iocReq->status = nvlink_lib_ctrl_acquire_capability(ctrlParams, iocReq);
390             break;
391         }
392 
393         case CTRL_NVLINK_GET_LINK_STATE:
394         {
395             nvlink_get_link_state *iocReq;
396 
397             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_get_link_state);
398             if (!iocReq)
399             {
400                 return NVL_BAD_ARGS;
401             }
402 
403             iocReq->status = nvlink_lib_ctrl_get_link_state(iocReq);
404             break;
405         }
406         case CTRL_NVLINK_GET_DEVICE_LINK_STATES:
407         {
408             nvlink_get_device_link_states *iocReq;
409 
410             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_get_device_link_states);
411             if (!iocReq)
412             {
413                 return NVL_BAD_ARGS;
414             }
415 
416             iocReq->status = nvlink_lib_ctrl_get_device_link_states(iocReq);
417             break;
418         }
419 
420         case CTRL_NVLINK_RESERVED_0:
421         case CTRL_NVLINK_RESERVED_1:
422         case CTRL_NVLINK_RESERVED_2:
423         case CTRL_NVLINK_RESERVED_3:
424         case CTRL_NVLINK_RESERVED_4:
425         case CTRL_NVLINK_RESERVED_5:
426         case CTRL_NVLINK_RESERVED_6:
427         case CTRL_NVLINK_RESERVED_7:
428         case CTRL_NVLINK_RESERVED_8:
429         case CTRL_NVLINK_RESERVED_9:
430         case CTRL_NVLINK_RESERVED_10:
431         case CTRL_NVLINK_RESERVED_11:
432         {
433             return NVL_SUCCESS;
434             break;
435         }
436 
437         default:
438         {
439             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
440                 "%s: unknown ioctl command 0x%08X specified.\n",
441                 __FUNCTION__, ctrlParams->cmd));
442             return NVL_BAD_ARGS;
443         }
444     }
445 
446     //
447     // the IOCTL call is success. However, status of the individual IOCTL is
448     // indicated in their corresponding embedded status field.
449     //
450     return NVL_SUCCESS;
451 }
452 
453 /**
454  * Preliminary check before passing the IOCTL to the respective handler
455  *
456  * @param[in]  ctrlParams  IOCTL params
457  *
458  * return NvlStatus
459  */
460 static NvlStatus
nvlink_lib_ctrl_prologue(nvlink_ioctrl_params * ctrlParams)461 nvlink_lib_ctrl_prologue
462 (
463     nvlink_ioctrl_params *ctrlParams
464 )
465 {
466     NvlStatus status = NVL_SUCCESS;
467 
468     if (ctrlParams == NULL)
469     {
470         return NVL_BAD_ARGS;
471     }
472 
473     switch (ctrlParams->cmd)
474     {
475         //
476         // These control calls are aren't privileged. So, skip the capability
477         // check.
478         //
479         case CTRL_NVLINK_CHECK_VERSION:
480         case CTRL_NVLINK_ACQUIRE_CAPABILITY:
481         {
482             break;
483         }
484         default:
485         {
486             if (!nvlink_is_admin() &&
487                 !nvlink_is_fabric_manager(ctrlParams->osPrivate))
488             {
489                 status = NVL_ERR_INSUFFICIENT_PERMISSIONS;
490             }
491             break;
492         }
493     }
494 
495     return status;
496 }
497 
498 /**
499  * Check if the user and kernel versions mismatch
500  *
501  * @param[in]  versionParams  IOCTL params
502  *
503  * return NvlStatus
504  */
505 static NvlStatus
nvlink_lib_ctrl_check_version(nvlink_check_version * versionParams)506 nvlink_lib_ctrl_check_version
507 (
508     nvlink_check_version *versionParams
509 )
510 {
511     const NvU32 NV_VERSION_LENGTH = nvlink_strlen(NV_VERSION_STRING);
512 
513     if (NV_VERSION_LENGTH > NVLINK_VERSION_STRING_LENGTH)
514     {
515         return NVL_NO_MEM;
516     }
517 
518     versionParams->user.version[NVLINK_VERSION_STRING_LENGTH - 1] = '\0';
519 
520     nvlink_memset(versionParams->kernel.version, 0x0, sizeof(versionParams->kernel.version));
521     nvlink_strcpy(versionParams->kernel.version, NV_VERSION_STRING);
522 
523     versionParams->kernel.version[NVLINK_VERSION_STRING_LENGTH - 1] = '\0';
524 
525     if (nvlink_strcmp(versionParams->user.version, versionParams->kernel.version))
526     {
527         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
528             "%s: version mismatch, kernel version %s user version %s\n",
529             __FUNCTION__,
530             versionParams->kernel.version, versionParams->user.version));
531 
532         return NVL_ERR_NOT_SUPPORTED;
533     }
534 
535     return NVL_SUCCESS;
536 }
537 
538 /**
539  * Assign node ID to all the registered devices
540  *
541  * @param[in]  idParams  IOCTL params
542  *
543  * return NvlStatus
544  */
545 static NvlStatus
nvlink_lib_ctrl_set_node_id(nvlink_set_node_id * idParams)546 nvlink_lib_ctrl_set_node_id
547 (
548     nvlink_set_node_id *idParams
549 )
550 {
551     NvlStatus      status = NVL_SUCCESS;
552     nvlink_device *dev    = NULL;
553 
554     // Acquire the top-level lock
555     status = nvlink_lib_top_lock_acquire();
556     if (status != NVL_SUCCESS)
557     {
558         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
559             "%s: Failed to acquire top-level lock\n",
560             __FUNCTION__));
561 
562         return status;
563     }
564 
565     // Top-level lock is now acquired
566 
567     // Return success, if an attempt is made to re-assign the same node-id.
568     if (nvlinkLibCtx.nodeId == idParams->nodeId)
569     {
570         // Release the top-level lock
571         nvlink_lib_top_lock_release();
572 
573         return  NVL_SUCCESS;
574     }
575 
576     if (nvlinkLibCtx.nodeId != NV_U16_MAX)
577     {
578         // Don't allow to change fabric node id once it is set.
579         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
580             "%s: Can't change fabric node id once it is set. "
581             "Current node id is %u\n",
582             __FUNCTION__, nvlinkLibCtx.nodeId));
583 
584         // Release the top-level lock
585         nvlink_lib_top_lock_release();
586 
587         return  NVL_ERR_INVALID_STATE;
588     }
589 
590     // Change already registered device's fabric node id.
591     FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
592     {
593         dev->nodeId = idParams->nodeId;
594     }
595 
596     // Store fabric node id for any future device registration.
597     nvlinkLibCtx.nodeId = idParams->nodeId;
598 
599     // Release the top-level lock
600     nvlink_lib_top_lock_release();
601 
602     return NVL_SUCCESS;
603 }
604 
605 /**
606  * Kick off the desired operation on registered links of all devices
607  *
608  * Note: This operation will acquire the per-link locks of all the
609  *       registered links of all devices in the core library
610  *
611  * @param[in]  ctrlParams  IOCTL params
612  *
613  * return NvlStatus
614  */
615 static NvlStatus
nvlink_lib_ctrl_all_links(nvlink_ioctrl_params * ctrlParams)616 nvlink_lib_ctrl_all_links
617 (
618     nvlink_ioctrl_params *ctrlParams
619 )
620 {
621     NvlStatus      status   = NVL_SUCCESS;
622     nvlink_device *dev      = NULL;
623     nvlink_link   *link     = NULL;
624     NvU32          numLinks = 0;
625 
626     nvlink_link   **links = (nvlink_link **)nvlink_malloc(
627                             sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
628     if (links == NULL)
629     {
630         return NVL_NO_MEM;
631     }
632 
633     // Acquire the top-level lock
634     status = nvlink_lib_top_lock_acquire();
635     if (status != NVL_SUCCESS)
636     {
637         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
638             "%s: Failed to acquire top-level lock\n",
639             __FUNCTION__));
640 
641         nvlink_free((void *)links);
642         return status;
643     }
644 
645     //
646     // Top-level lock is now acquired. Proceed to traversing the device
647     // and link lists
648     //
649 
650     FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
651     {
652         FOR_EACH_LINK_REGISTERED(link, dev, node)
653         {
654             if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
655             {
656                 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
657                     "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
658                     __FUNCTION__));
659 
660                 nvlink_assert(0);
661 
662                 // Release the top-level lock and free links
663                 nvlink_lib_top_lock_release();
664                 nvlink_free((void *)links);
665                 return NVL_ERR_INVALID_STATE;
666             }
667             links[numLinks] = link;
668             numLinks++;
669         }
670     }
671 
672     // Acquire the per-link locks
673     status = nvlink_lib_link_locks_acquire(links, numLinks);
674     if (status != NVL_SUCCESS)
675     {
676         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
677             "%s: Failed to acquire per-link locks\n",
678             __FUNCTION__));
679 
680         // Release the top-level lock
681         nvlink_lib_top_lock_release();
682         nvlink_free((void *)links);
683         return status;
684     }
685 
686     //
687     // All the required per-link locks are now successfully acquired
688     // Release the top level-lock
689     //
690     nvlink_lib_top_lock_release();
691 
692     nvlink_assert((links != NULL) && (numLinks > 0));
693 
694     // Kick off the desired operation on all the registered links
695     switch (ctrlParams->cmd)
696     {
697         case CTRL_NVLINK_INITPHASE1:
698         {
699             nvlink_initphase1 *iocReq;
700 
701             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_initphase1);
702             if (!iocReq)
703             {
704                 status = NVL_BAD_ARGS;
705                 goto nvlink_lib_ctrl_all_links_end;
706             }
707 
708             // default initialize status to NVL_SUCCESS
709             iocReq->status = NVL_SUCCESS;
710 
711             if (links[0]->dev->enableALI)
712             {
713                 status = NVL_SUCCESS;
714                 goto nvlink_lib_ctrl_all_links_end;
715             }
716 
717             iocReq->status = nvlink_core_initphase1(links, numLinks,
718                                                     NVLINK_STATE_CHANGE_SYNC);
719             break;
720         }
721 
722         case CTRL_NVLINK_RX_INIT_TERM:
723         {
724             nvlink_rx_init_term *iocReq;
725 
726             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_rx_init_term);
727             if (!iocReq)
728             {
729                 status = NVL_BAD_ARGS;
730                 goto nvlink_lib_ctrl_all_links_end;
731             }
732 
733             // default initialize status to NVL_SUCCESS
734             iocReq->status = NVL_SUCCESS;
735 
736             //
737             // If the current nvlink device does not support the command
738             // skip using the command and return success for FM to continue on.
739             //
740             if (links[0]->version >= NVLINK_DEVICE_VERSION_40)
741             {
742                 status = NVL_SUCCESS;
743                 goto nvlink_lib_ctrl_all_links_end;
744             }
745 
746             iocReq->status = nvlink_core_rx_init_term(links, numLinks,
747                                                       NVLINK_STATE_CHANGE_ASYNC);
748             break;
749         }
750 
751         case CTRL_NVLINK_SET_RX_DETECT:
752         {
753             nvlink_set_rx_detect *iocReq;
754 
755             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_set_rx_detect);
756             if (!iocReq)
757             {
758                 status = NVL_BAD_ARGS;
759                 goto nvlink_lib_ctrl_all_links_end;
760             }
761 
762             // default initialize status to NVL_SUCCESS
763             iocReq->status = NVL_SUCCESS;
764 
765             if (links[0]->dev->enableALI)
766             {
767                 status = NVL_SUCCESS;
768                 goto nvlink_lib_ctrl_all_links_end;
769             }
770 
771             iocReq->status = nvlink_core_set_rx_detect(links, numLinks,
772                                                       NVLINK_STATE_CHANGE_ASYNC);
773             break;
774         }
775 
776         case CTRL_NVLINK_GET_RX_DETECT:
777         {
778             nvlink_get_rx_detect *iocReq;
779 
780             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_get_rx_detect);
781             if (!iocReq)
782             {
783                 status = NVL_BAD_ARGS;
784                 goto nvlink_lib_ctrl_all_links_end;
785             }
786 
787             // default initialize status to NVL_SUCCESS
788             iocReq->status = NVL_SUCCESS;
789 
790             if (links[0]->dev->enableALI)
791             {
792                 status = NVL_SUCCESS;
793                 goto nvlink_lib_ctrl_all_links_end;
794             }
795 
796             iocReq->status = nvlink_core_get_rx_detect(links, numLinks,
797                                                       NVLINK_STATE_CHANGE_ASYNC);
798             break;
799         }
800 
801         case CTRL_NVLINK_SET_TX_COMMON_MODE:
802         {
803             nvlink_set_tx_common_mode *iocReq;
804 
805             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_set_tx_common_mode);
806             if (!iocReq)
807             {
808                 status = NVL_BAD_ARGS;
809                 goto nvlink_lib_ctrl_all_links_end;
810             }
811 
812             // default initialize status to NVL_SUCCESS
813             iocReq->status = NVL_SUCCESS;
814 
815             if (links[0]->dev->enableALI)
816             {
817                 status = NVL_SUCCESS;
818                 goto nvlink_lib_ctrl_all_links_end;
819             }
820 
821             if (iocReq->commMode)
822             {
823                 iocReq->status = nvlink_core_enable_common_mode(links, numLinks,
824                                                       NVLINK_STATE_CHANGE_SYNC);
825             }
826             else if(links[0]->version <= NVLINK_DEVICE_VERSION_30)
827             {
828                 iocReq->status = nvlink_core_disable_common_mode(links, numLinks,
829                                                       NVLINK_STATE_CHANGE_SYNC);
830             }
831 
832             //
833             // If the current nvlink device does not support disabling common mode
834             // skip using the command and return success for FM to continue on.
835             //
836             break;
837         }
838 
839         case CTRL_NVLINK_CALIBRATE:
840         {
841             nvlink_calibrate *iocReq;
842             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_calibrate);
843 
844             if (!iocReq)
845             {
846                 status = NVL_BAD_ARGS;
847                 goto nvlink_lib_ctrl_all_links_end;
848             }
849 
850             // default initialize status to NVL_SUCCESS
851             iocReq->status = NVL_SUCCESS;
852 
853             //
854             // If the current nvlink device does not support the command
855             // skip using the command and return success for FM to continue on.
856             //
857             if (links[0]->version >= NVLINK_DEVICE_VERSION_40)
858             {
859                 iocReq->status = NVL_SUCCESS;
860                 goto nvlink_lib_ctrl_all_links_end;
861             }
862 
863             iocReq->status = nvlink_core_calibrate_links(links, numLinks,
864                                                       NVLINK_STATE_CHANGE_SYNC);
865             break;
866         }
867 
868         case CTRL_NVLINK_ENABLE_DATA:
869         {
870             nvlink_enable_data *iocReq;
871 
872             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_enable_data);
873             if (!iocReq)
874             {
875                 goto nvlink_lib_ctrl_all_links_end;
876             }
877 
878             // default initialize status to NVL_SUCCESS
879             iocReq->status = NVL_SUCCESS;
880 
881             //
882             // If the current nvlink device does not support the command
883             // skip using the command and return success for FM to continue on.
884             //
885             if (links[0]->version >= NVLINK_DEVICE_VERSION_40)
886             {
887                 status = NVL_SUCCESS;
888                 goto nvlink_lib_ctrl_all_links_end;
889             }
890 
891             iocReq->status = nvlink_core_enable_data(links, numLinks,
892                                                       NVLINK_STATE_CHANGE_SYNC);
893             break;
894         }
895 
896         case  CTRL_NVLINK_LINK_INIT_ASYNC:
897         {
898             nvlink_link_init_async *iocReq;
899 
900             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_link_init_async);
901             if (!iocReq)
902             {
903                 status = NVL_BAD_ARGS;
904                 goto nvlink_lib_ctrl_all_links_end;
905             }
906 
907             // default initialize status to NVL_SUCCESS
908             iocReq->status = NVL_SUCCESS;
909 
910             iocReq->status = nvlink_core_link_init_async(links, numLinks);
911             break;
912         }
913 
914         case CTRL_NVLINK_INITNEGOTIATE:
915         {
916             nvlink_initnegotiate *iocReq;
917 
918             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_initnegotiate);
919             if (!iocReq)
920             {
921                 status = NVL_BAD_ARGS;
922                 goto nvlink_lib_ctrl_all_links_end;
923             }
924 
925             // default initialize status to NVL_SUCCESS
926             iocReq->status = NVL_SUCCESS;
927 
928             if (links[0]->dev->enableALI)
929             {
930                 status = NVL_SUCCESS;
931                 goto nvlink_lib_ctrl_all_links_end;
932             }
933 
934             iocReq->status = nvlink_core_initnegotiate(links, numLinks,
935                                                       NVLINK_STATE_CHANGE_ASYNC);
936             break;
937         }
938 
939         case CTRL_NVLINK_INITPHASE5:
940         {
941             nvlink_initphase5 *iocReq;
942 
943             iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_initphase5);
944             if (!iocReq)
945             {
946                 status = NVL_BAD_ARGS;
947                 goto nvlink_lib_ctrl_all_links_end;
948             }
949 
950             // default initialize status to NVL_SUCCESS
951             iocReq->status = NVL_SUCCESS;
952 
953             //
954             // If the current nvlink device does not support the command
955             // skip using the command and return success for FM to continue on.
956             //
957             if (links[0]->version < NVLINK_DEVICE_VERSION_40 ||
958                 links[0]->dev->enableALI)
959             {
960                 status = NVL_SUCCESS;
961                 goto nvlink_lib_ctrl_all_links_end;
962             }
963             iocReq->status = nvlink_core_initphase5(links, numLinks,
964                                                       NVLINK_STATE_CHANGE_ASYNC);
965             break;
966         }
967 
968         default:
969         {
970             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
971                 "%s: unknown ioctl command specified.\n",
972                 __FUNCTION__));
973 
974             status = NVL_BAD_ARGS;
975             goto nvlink_lib_ctrl_all_links_end;
976         }
977 
978     }
979 
980 nvlink_lib_ctrl_all_links_end:
981 
982     // Release the per-link locks
983     nvlink_lib_link_locks_release(links, numLinks);
984 
985     if (links != NULL)
986     {
987         nvlink_free((void *)links);
988     }
989 
990     return status;
991 }
992 
993 /**
994  * Get the link init status on all queried links
995  *
996  * @param[in]  statusParams  IOCTL params
997  *
998  * return NvlStatus
999  */
1000 static NvlStatus
nvlink_lib_ctrl_device_link_init_status(nvlink_device_link_init_status * statusParams)1001 nvlink_lib_ctrl_device_link_init_status
1002 (
1003     nvlink_device_link_init_status *statusParams
1004 )
1005 {
1006     NvlStatus      status   = NVL_SUCCESS;
1007     nvlink_device *dev      = NULL;
1008     nvlink_link   *link     = NULL;
1009     NvU32          numLinks = 0;
1010     NvU32          i        = 0;
1011 
1012     nvlink_link   **links = (nvlink_link **)nvlink_malloc(
1013                             sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
1014     if (links == NULL)
1015     {
1016         return NVL_NO_MEM;
1017     }
1018 
1019     // Acquire the top-level lock
1020     status = nvlink_lib_top_lock_acquire();
1021     if (status != NVL_SUCCESS)
1022     {
1023         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1024             "%s: Failed to acquire top-level lock\n",
1025             __FUNCTION__));
1026 
1027         nvlink_free((void *)links);
1028         return status;
1029     }
1030 
1031     //
1032     // Top-level lock is now acquired. Proceed to traversing the device
1033     // and link lists
1034     //
1035 
1036     // look-up user requested nvlink device object
1037     nvlink_core_get_device_by_devinfo(&statusParams->devInfo, &dev);
1038     if (dev == NULL)
1039     {
1040         //
1041         // Couldn't find the device ptr in the core library. Release the
1042         // top-level lock and return
1043         //
1044         nvlink_lib_top_lock_release();
1045 
1046         nvlink_free((void *)links);
1047         return NVL_BAD_ARGS;
1048     }
1049 
1050     FOR_EACH_LINK_REGISTERED(link, dev, node)
1051     {
1052         if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
1053         {
1054             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1055                 "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
1056                 __FUNCTION__));
1057 
1058             nvlink_assert(0);
1059 
1060             nvlink_lib_top_lock_release();
1061             nvlink_free((void *)links);
1062             return NVL_ERR_INVALID_STATE;
1063         }
1064         links[numLinks] = link;
1065         numLinks++;
1066     }
1067 
1068     // Acquire the per-link locks
1069     status = nvlink_lib_link_locks_acquire(links, numLinks);
1070     if (status != NVL_SUCCESS)
1071     {
1072         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1073             "%s: Failed to acquire per-link locks\n",
1074             __FUNCTION__));
1075 
1076         // Release the top-level lock
1077         nvlink_lib_top_lock_release();
1078 
1079         nvlink_free((void *)links);
1080         return status;
1081     }
1082 
1083     //
1084     // All the required per-link locks are now successfully acquired
1085     // Release the top level-lock
1086     //
1087     nvlink_lib_top_lock_release();
1088 
1089     // Poll for links to reach SAFE/SWCFG and capture the status
1090     for (i = 0; i < numLinks; i++)
1091     {
1092         // status index should be within NVLINK_MAX_DEVICE_CONN
1093         if (i >= NVLINK_MAX_DEVICE_CONN)
1094         {
1095             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1096                 "%s: number of links for the device >= NVLINK_MAX_DEVICE_CONN",
1097                 __FUNCTION__));
1098 
1099             nvlink_assert(0);
1100 
1101             nvlink_lib_link_locks_release(links, numLinks);
1102             nvlink_free((void *)links);
1103             return NVL_ERR_INVALID_STATE;
1104         }
1105 
1106         status = nvlink_core_wait_for_link_init(links[i]);
1107 
1108         // indicate link init state to user
1109         statusParams->linkStatus[i].linkIndex = links[i]->linkNumber;
1110 
1111         if (status == NVL_SUCCESS)
1112         {
1113             statusParams->linkStatus[i].initStatus = NV_TRUE;
1114         }
1115         else
1116         {
1117             statusParams->linkStatus[i].initStatus = NV_FALSE;
1118         }
1119     }
1120 
1121     // Release the per-link locks
1122     nvlink_lib_link_locks_release(links, numLinks);
1123 
1124     if (links != NULL)
1125     {
1126         nvlink_free((void *)links);
1127     }
1128     return NVL_SUCCESS;
1129 }
1130 
1131 /**
1132  * Send discovery tokens on all the links for a given device
1133  *
1134  * @param[in]  writeParams  IOCTL params
1135  *
1136  * return NvlStatus
1137  */
static NvlStatus
nvlink_lib_ctrl_device_write_discovery_tokens
(
    nvlink_device_write_discovery_tokens *writeParams
)
{
    NvlStatus      status    = NVL_SUCCESS;
    nvlink_device *dev       = NULL;
    nvlink_link   *link      = NULL;
    NvU32          numLinks  = 0;
    NvU32          i         = 0;
    NvU32          numTokens = 0;

    // Scratch array of candidate links; freed on every exit path
    nvlink_link   **links = (nvlink_link **)nvlink_malloc(
                            sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
    {
        return NVL_NO_MEM;
    }

    // Initialize number of tokens written to 0
    writeParams->numTokens = 0;

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        nvlink_free((void *)links);
        return status;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device
    // and link lists
    //

    // look-up user requested nvlink device object
    nvlink_core_get_device_by_devinfo(&writeParams->devInfo, &dev);
    if (dev == NULL)
    {
        //
        // Couldn't find the device ptr in the core library. Release the
        // top-level lock and return
        //
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return NVL_BAD_ARGS;
    }

    // Gather only links that do not yet have a discovered intranode connection
    FOR_EACH_LINK_REGISTERED(link, dev, node)
    {
        nvlink_intranode_conn *conn = NULL;

        nvlink_core_get_intranode_conn(link, &conn);
        if (conn != NULL)
        {
            // skip token write if we already have a connection for the link
            continue;
        }

        if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
                __FUNCTION__));

            nvlink_assert(0);

            // Release the top-level lock and free links
            nvlink_lib_top_lock_release();
            nvlink_free((void *)links);
            return NVL_ERR_INVALID_STATE;
        }

        links[numLinks] = link;
        numLinks++;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    // Write a discovery token on each candidate link under its per-link lock
    for (i = 0; i < numLinks; i++)
    {
        NvU64 writeToken = 0;

        writeToken = nvlink_core_get_link_discovery_token(links[i]);
        status     = nvlink_core_write_link_discovery_token(links[i], writeToken);

        // NOTE(review): a failed token write is silently skipped (the link is
        // simply omitted from tokenInfo) — presumably intentional best-effort;
        // confirm against caller expectations.
        if (status == NVL_SUCCESS)
        {
            //
            // wrote a token. copy the token and link information to user
            // which can be used for comparing tokens across nodes.
            //

            // total number of tokens should be within NVLINK_MAX_DEVICE_CONN
            if (numTokens >= NVLINK_MAX_DEVICE_CONN)
            {
                NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                    "%s: Number of tokens >= NVLINK_MAX_DEVICE_CONN\n",
                    __FUNCTION__));

                nvlink_assert(0);

                nvlink_lib_link_locks_release(links, numLinks);
                nvlink_free((void *)links);
                return NVL_ERR_INVALID_STATE;
            }

            writeParams->tokenInfo[numTokens].linkIndex  = links[i]->linkNumber;
            writeParams->tokenInfo[numTokens].tokenValue = writeToken;
            numTokens++;
        }
    }

    // update total number of tokens written
    writeParams->numTokens = numTokens;

    // Release the per-link locks
    nvlink_lib_link_locks_release(links, numLinks);

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }
    return NVL_SUCCESS;
}
1288 
1289 /**
1290  * Read discovery tokens on all the links for a given device
1291  *
1292  * @param[in]  readParams  IOCTL params
1293  *
1294  * return NvlStatus
1295  */
static NvlStatus
nvlink_lib_ctrl_device_read_discovery_tokens
(
    nvlink_device_read_discovery_tokens *readParams
)
{
    NvlStatus      status    = NVL_SUCCESS;
    nvlink_device *dev       = NULL;
    nvlink_link   *link      = NULL;
    NvU32          numLinks  = 0;
    NvU32          i         = 0;
    NvU32          numTokens = 0;

    // Scratch array of candidate links; freed on every exit path
    nvlink_link   **links = (nvlink_link **)nvlink_malloc(
                            sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
    {
        return NVL_NO_MEM;
    }

    // Initialize number of tokens read to 0
    readParams->numTokens = 0;

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        nvlink_free((void *)links);
        return status;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device
    // and link lists
    //

    // look-up user requested nvlink device object
    nvlink_core_get_device_by_devinfo(&readParams->devInfo, &dev);
    if (dev == NULL)
    {
        //
        // Couldn't find the device ptr in the core library. Release the
        // top-level lock and return
        //
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return NVL_BAD_ARGS;
    }

    // Gather only links that do not yet have a discovered intranode connection
    FOR_EACH_LINK_REGISTERED(link, dev, node)
    {
        nvlink_intranode_conn *conn = NULL;

        nvlink_core_get_intranode_conn(link, &conn);
        if (conn != NULL)
        {
            // skip token read if we already have a connection for the link
            continue;
        }

        if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
                __FUNCTION__));

            nvlink_assert(0);

            // Release the top-level lock and free links
            nvlink_lib_top_lock_release();
            nvlink_free((void *)links);
            return NVL_ERR_INVALID_STATE;
        }

        links[numLinks] = link;
        numLinks++;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    // Read back whatever discovery token each candidate link received
    for (i = 0; i < numLinks; i++)
    {
        NvU64 readToken = 0;

        // query discovery token from the link
        readToken = nvlink_core_read_link_discovery_token(links[i]);

        // take non-zero tokens. token will be zero if read_discovery failed as well.
        if (readToken)
        {
            //
            // received a valid token. copy the token and link information to user
            // which can be used for comparing tokens across nodes.
            //

            // total number of tokens should be within NVLINK_MAX_DEVICE_CONN
            if (numTokens >= NVLINK_MAX_DEVICE_CONN)
            {
                NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                    "%s: Number of tokens >= NVLINK_MAX_DEVICE_CONN\n",
                    __FUNCTION__));

                nvlink_assert(0);

                nvlink_lib_link_locks_release(links, numLinks);
                nvlink_free((void *)links);
                return NVL_ERR_INVALID_STATE;
            }

            readParams->tokenInfo[numTokens].linkIndex  = links[i]->linkNumber;
            readParams->tokenInfo[numTokens].tokenValue = readToken;
            numTokens++;
        }
    }

    // update total number of tokens read
    readParams->numTokens = numTokens;

    // Release the per-link locks
    nvlink_lib_link_locks_release(links, numLinks);

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }
    return NVL_SUCCESS;
}
1447 
/**
 * Perform peer link discovery for a single link
 *
 * @param[in]  link  NVLink link pointer whose peer should be discovered
 *
 * return NvlStatus
 */
1455 static NvlStatus
_nvlink_lib_ctrl_device_discover_peer_link(nvlink_link * link)1456 _nvlink_lib_ctrl_device_discover_peer_link
1457 (
1458     nvlink_link *link
1459 )
1460 {
1461     NvlStatus      status   = NVL_SUCCESS;
1462 
1463     //
1464     // If the link succeeds rxDet(link is in HS, SAFE, or SLEEP mode) then go through and find its
1465     // peer link. What is important is not actually finding the link, but making sure the corelib
1466     // goes through the discovery process and has endpoints cache the remote information in the corelib
1467     // such that FM or endpoints can query the corelib for the topology of the system.
1468     //
1469     NvU64 linkMode = NVLINK_LINKSTATE_OFF;
1470     status = link->link_handlers->get_dl_link_mode(link, &linkMode);
1471     if (status != NVL_SUCCESS)
1472     {
1473         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1474             "%s: Unable to get link mode for %s:%s\n",
1475             __FUNCTION__, link->dev->deviceName, link->linkName));
1476         return status;
1477     }
1478 
1479     if ((linkMode == NVLINK_LINKSTATE_SAFE) ||
1480         (linkMode == NVLINK_LINKSTATE_HS)   ||
1481         (linkMode == NVLINK_LINKSTATE_SLEEP))
1482     {
1483         nvlink_link   *remoteLink = NULL;
1484         nvlink_core_discover_and_get_remote_end(link, &remoteLink, 0);
1485         if (remoteLink == NULL)
1486         {
1487             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_INFO,
1488                 "%s: link 0x%x: couldn't find link pair! Possible that other device queries need to finish before there is a found connection in the corelib\n",
1489                 __FUNCTION__, link->linkNumber));
1490         }
1491     }
1492 
1493     return NVL_SUCCESS;
1494 }
1495 
/**
 * Read the SIDs for the local and remote device
 *
 * @param[in]  readParams  IOCTL params
 *
 * return NvlStatus
 */
static NvlStatus
nvlink_lib_ctrl_device_read_sids
(
    nvlink_device_read_sids *readParams
)
{
    NvlStatus      status     = NVL_SUCCESS;
    nvlink_device *dev        = NULL;
    nvlink_link   *link       = NULL;
    NvU32          numLinks   = 0;
    NvU32          i          = 0;
    NvU32          numEntries = 0;

    // Scratch array of candidate links; freed on every exit path
    nvlink_link   **links = (nvlink_link **)nvlink_malloc(
                            sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
    {
        return NVL_NO_MEM;
    }

    // Initialize number of SIDs read to 0
    readParams->numEntries = 0;

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        nvlink_free((void *)links);
        return status;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device
    // and link lists
    //

    // look-up user requested nvlink device object
    nvlink_core_get_device_by_devinfo(&readParams->devInfo, &dev);
    if (dev == NULL)
    {
        //
        // Couldn't find the device ptr in the core library. Release the
        // top-level lock and return
        //
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return NVL_BAD_ARGS;
    }

    //
    // Gather only links that do not yet have a discovered intranode
    // connection — SIDs are reported only for those links.
    //
    FOR_EACH_LINK_REGISTERED(link, dev, node)
    {
        nvlink_intranode_conn *conn = NULL;

        nvlink_core_get_intranode_conn(link, &conn);
        if (conn != NULL)
        {
            // skip SID reporting if we already have a connection for the link
            continue;
        }

        if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
                __FUNCTION__));

            nvlink_assert(0);

            // Release the top-level lock and free links
            nvlink_lib_top_lock_release();
            nvlink_free((void *)links);
            return NVL_ERR_INVALID_STATE;
        }

        links[numLinks] = link;
        numLinks++;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    // NOTE(review): numEntries is not bounds-checked against the sidInfo
    // array capacity here (unlike the NVLINK_MAX_DEVICE_CONN checks in the
    // token paths) — presumably numLinks can never exceed it; verify.
    for (i = 0; i < numLinks; i++)
    {
        // ALI specific handling to update corelib structures and verify link status
        if (dev->enableALI)
        {
            status = _nvlink_lib_ctrl_device_discover_peer_link(links[i]);
            if (status != NVL_SUCCESS)
            {
                // Release the per-link locks and free links
                nvlink_lib_link_locks_release(links, numLinks);
                nvlink_free((void *)links);
                return status;
            }
        }

        // Fill-up the local/remote link numbers and SIDs
        readParams->sidInfo[numEntries].localLinkSid  = links[i]->localSid;
        readParams->sidInfo[numEntries].remoteLinkSid = links[i]->remoteSid;
        readParams->sidInfo[numEntries].localLinkNum  = links[i]->linkNumber;
        readParams->sidInfo[numEntries].remoteLinkNum = links[i]->remoteLinkId;
        numEntries++;
    }

    // update total number of entries read
    readParams->numEntries = numEntries;

    // Release the per-link locks
    nvlink_lib_link_locks_release(links, numLinks);

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }
    return NVL_SUCCESS;
}
1642 
1643 /**
1644  * Discover all the intranode connections from the core library
1645  *
1646  * @param[in]  connParams  IOCTL params
1647  *
1648  * return NvlStatus
1649  */
1650 static NvlStatus
nvlink_lib_ctrl_discover_intranode_conns(nvlink_discover_intranode_conns * connParams)1651 nvlink_lib_ctrl_discover_intranode_conns
1652 (
1653     nvlink_discover_intranode_conns *connParams
1654 )
1655 {
1656     NvlStatus      status   = NVL_SUCCESS;
1657     nvlink_device *dev      = NULL;
1658     nvlink_link   *link     = NULL;
1659     NvU32          numLinks = 0;
1660 
1661     nvlink_link   **links = (nvlink_link **)nvlink_malloc(
1662                             sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
1663     if (links == NULL)
1664     {
1665         return NVL_NO_MEM;
1666     }
1667 
1668     // Acquire the top-level lock
1669     status = nvlink_lib_top_lock_acquire();
1670     if (status != NVL_SUCCESS)
1671     {
1672         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1673             "%s: Failed to acquire top-level lock\n",
1674             __FUNCTION__));
1675 
1676         nvlink_free((void *)links);
1677         return status;
1678     }
1679 
1680     //
1681     // Top-level lock is now acquired. Proceed to traversing the device
1682     // and link lists
1683     //
1684 
1685     FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
1686     {
1687         FOR_EACH_LINK_REGISTERED(link, dev, node)
1688         {
1689             if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
1690             {
1691                 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1692                     "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
1693                     __FUNCTION__));
1694 
1695                 nvlink_assert(0);
1696 
1697                 // Release the top-level lock and free links
1698                 nvlink_lib_top_lock_release();
1699                 nvlink_free((void *)links);
1700                 return NVL_ERR_INVALID_STATE;
1701             }
1702 
1703             links[numLinks] = link;
1704             numLinks++;
1705         }
1706     }
1707 
1708     // Acquire the per-link locks
1709     status = nvlink_lib_link_locks_acquire(links, numLinks);
1710     if (status != NVL_SUCCESS)
1711     {
1712         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1713             "%s: Failed to acquire per-link locks\n",
1714             __FUNCTION__));
1715 
1716         // Release the top-level lock
1717         nvlink_lib_top_lock_release();
1718 
1719         nvlink_free((void *)links);
1720         return status;
1721     }
1722 
1723     //
1724     // All the required per-link locks are now successfully acquired
1725     // Note: We will still need to hold the top-level lock, because we might have
1726     //       to add connections to the intranode connections list if any case new
1727     //       intranode connection is discovered
1728     //
1729 
1730     FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
1731     {
1732         FOR_EACH_LINK_REGISTERED(link, dev, node)
1733         {
1734             NvU64                  writeToken = 0;
1735             nvlink_intranode_conn *conn       = NULL;
1736 
1737             nvlink_core_get_intranode_conn(link, &conn);
1738             if (conn != NULL)
1739             {
1740                 // skip token write if we already have a connection for the link
1741                 continue;
1742             }
1743 
1744             if (!link->bRxDetected)
1745             {
1746                 // If receiver detect has failed, then there is no connection
1747                 continue;
1748             }
1749 
1750         // ALI specific handling to update corelib structures and verify link status
1751         if (dev->enableALI)
1752         {
1753             status = _nvlink_lib_ctrl_device_discover_peer_link(link);
1754             if (status != NVL_SUCCESS)
1755             {
1756                 // Release the per-link locks
1757                 nvlink_lib_link_locks_release(links, numLinks);
1758 
1759                 // Release the top-level lock
1760                 nvlink_lib_top_lock_release();
1761                 nvlink_free((void *)links);
1762                 return status;
1763             }
1764         }
1765 
1766             writeToken = nvlink_core_get_link_discovery_token(link);
1767 
1768             if ((link->version < NVLINK_DEVICE_VERSION_30) ||
1769                 ((link->localSid == 0) || (link->remoteSid == 0)))
1770             {
1771                 nvlink_core_write_link_discovery_token(link, writeToken);
1772 
1773                 // wrote a token. read back tokens from all links and create connection
1774                 nvlink_core_correlate_conn_by_token(link, writeToken, NV_FALSE);
1775             }
1776             else
1777             {
1778                 // From 3.0 we rely on Sid values. So send skiptoken as true.
1779                 nvlink_core_correlate_conn_by_token(link, writeToken, NV_TRUE);
1780             }
1781         }
1782     }
1783 
1784     // Release the per-link locks
1785     nvlink_lib_link_locks_release(links, numLinks);
1786 
1787     // Release the top-level lock
1788     nvlink_lib_top_lock_release();
1789 
1790     if (links != NULL)
1791     {
1792         nvlink_free((void *)links);
1793     }
1794     return NVL_SUCCESS;
1795 }
1796 
1797 /**
1798  * Get the intranode connections from the core library
1799  *
1800  * @param[in]  getParams  IOCTL params
1801  *
1802  * return NvlStatus
1803  */
1804 static NvlStatus
nvlink_lib_ctrl_device_get_intranode_conns
(
    nvlink_device_get_intranode_conns *getParams
)
{
    nvlink_device         *targetDev = NULL;
    nvlink_intranode_conn *curConn   = NULL;
    NvU32                  connCount = 0;
    NvlStatus              status    = NVL_SUCCESS;

    // Report zero connections until they have been gathered successfully
    getParams->numConnections = 0;

    // The device and connection lists are protected by the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        return status;
    }

    // Resolve the device handle supplied by the caller
    nvlink_core_get_device_by_devinfo(&getParams->devInfo, &targetDev);
    if (targetDev == NULL)
    {
        // Unknown device: drop the top-level lock and bail out
        nvlink_lib_top_lock_release();

        return NVL_BAD_ARGS;
    }

    // Walk every known intranode connection, keeping those touching targetDev
    FOR_EACH_CONNECTION(curConn, nvlinkLibCtx.nv_intraconn_head, node)
    {
        // Skip connections where neither endpoint belongs to the device
        if ((curConn->end0->dev != targetDev) && (curConn->end1->dev != targetDev))
        {
            continue;
        }

        // The caller's buffer holds at most NVLINK_MAX_DEVICE_CONN entries
        if (connCount >= NVLINK_MAX_DEVICE_CONN)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: numConns >= NVLINK_MAX_DEVICE_CONN\n",
                __FUNCTION__));

            nvlink_assert(0);

            nvlink_lib_top_lock_release();
            return NVL_ERR_INVALID_STATE;
        }

        // Record both endpoints of this connection for the caller
        nvlink_core_copy_endpoint_info(curConn->end0, &getParams->conn[connCount].srcEndPoint);
        nvlink_core_copy_endpoint_info(curConn->end1, &getParams->conn[connCount].dstEndPoint);

        connCount++;
    }

    getParams->numConnections = connCount;

    // Done traversing; release the top-level lock
    nvlink_lib_top_lock_release();

    return NVL_SUCCESS;
}
1885 
1886 /**
1887  * Add a discovered internode connection
1888  *
1889  * @param[in]  addParams  IOCTL params
1890  *
1891  * return NvlStatus
1892  */
1893 static NvlStatus
nvlink_lib_ctrl_add_internode_conn(nvlink_add_internode_conn * addParams)1894 nvlink_lib_ctrl_add_internode_conn
1895 (
1896     nvlink_add_internode_conn *addParams
1897 )
1898 {
1899     nvlink_link           *localLink = NULL;
1900     nvlink_intranode_conn *intraConn = NULL;
1901     NvlStatus              status    = NVL_SUCCESS;
1902 
1903     // Acquire the top-level lock
1904     status = nvlink_lib_top_lock_acquire();
1905     if (status != NVL_SUCCESS)
1906     {
1907         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1908             "%s: Failed to acquire top-level lock\n",
1909             __FUNCTION__));
1910 
1911         return status;
1912     }
1913 
1914     //
1915     // Top-level lock is now acquired. Proceed to traversing the device
1916     // and link lists
1917     //
1918 
1919     // make sure that this connection is multi-node
1920     if (addParams->localEndPoint.nodeId == addParams->remoteEndPoint.nodeId)
1921     {
1922         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1923             "%s: Internode connection add with same node id for local and remote endpoint\n",
1924             __FUNCTION__));
1925 
1926         // Release the top-level lock
1927         nvlink_lib_top_lock_release();
1928 
1929         return NVL_BAD_ARGS;
1930     }
1931 
1932     // validate the remote endpoint device type information
1933     if (!nvlink_core_is_supported_device_type(addParams->remoteEndPoint.devType))
1934     {
1935         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1936             "%s: Internode connection add with invalid remote device type\n",
1937             __FUNCTION__));
1938 
1939         // Release the top-level lock
1940         nvlink_lib_top_lock_release();
1941 
1942         return NVL_BAD_ARGS;
1943     }
1944 
1945     //
1946     // look-up the nvlink link objects. Look-up will fail if there is a
1947     // fabric node id mismatch. So an explicit check against self
1948     // node id is not required.
1949     //
1950     nvlink_core_get_link_by_endpoint(&addParams->localEndPoint, &localLink);
1951     if (localLink == NULL)
1952     {
1953         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1954             "%s: Internode connection add with no matching local endpoint\n",
1955             __FUNCTION__));
1956 
1957         //
1958         // Couldn't find the endpoint registered in the core library. Release the
1959         // top-level lock and return
1960         //
1961         nvlink_lib_top_lock_release();
1962 
1963         return NVL_BAD_ARGS;
1964     }
1965 
1966     // can't add internode connection if we have an intranode connection
1967     nvlink_core_get_intranode_conn(localLink, &intraConn);
1968     if (intraConn != NULL)
1969     {
1970         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1971             "%s: Found an intranode connection while adding internode connection\n",
1972             __FUNCTION__));
1973 
1974         // Release the top-level lock
1975         nvlink_lib_top_lock_release();
1976 
1977         return NVL_BAD_ARGS;
1978     }
1979 
1980     // all the sanity check passed, add this internode connection in our context
1981     status = nvlink_core_add_internode_conn(localLink, &addParams->remoteEndPoint);
1982 
1983     // Release the top-level lock
1984     nvlink_lib_top_lock_release();
1985 
1986     return status;
1987 }
1988 
1989 /**
1990  * Remove an internode connection from the list
1991  *
1992  * @param[in]  removeParams  IOCTL params
1993  *
1994  * return NvlStatus
1995  */
1996 static NvlStatus
nvlink_lib_ctrl_remove_internode_conn(nvlink_remove_internode_conn * removeParams)1997 nvlink_lib_ctrl_remove_internode_conn
1998 (
1999     nvlink_remove_internode_conn *removeParams
2000 )
2001 {
2002     nvlink_link *localLink = NULL;
2003     NvlStatus    status    = NVL_SUCCESS;
2004 
2005     // Acquire the top-level lock
2006     status = nvlink_lib_top_lock_acquire();
2007     if (status != NVL_SUCCESS)
2008     {
2009         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2010             "%s: Failed to acquire top-level lock\n",
2011             __FUNCTION__));
2012 
2013         return status;
2014     }
2015 
2016     //
2017     // Top-level lock is now acquired. Proceed to traversing the device
2018     // and link lists
2019     //
2020 
2021     //
2022     // look-up the nvlink link objects. Look-up will fail if there is a
2023     // fabric node id mismatch. So an explicit check against self
2024     // node id is not required.
2025     //
2026     nvlink_core_get_link_by_endpoint(&removeParams->localEndPoint, &localLink);
2027     if (localLink == NULL)
2028     {
2029         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2030             "%s: Internode connection remove with no matching local endpoint\n",
2031             __FUNCTION__));
2032 
2033         //
2034         // Couldn't find the endpoint registered in the core library. Release the
2035         // top-level lock and return
2036         //
2037         nvlink_lib_top_lock_release();
2038 
2039         return NVL_BAD_ARGS;
2040     }
2041 
2042     // Acquire the per-link lock
2043     status = nvlink_lib_link_locks_acquire(&localLink, 1);
2044     if (status != NVL_SUCCESS)
2045     {
2046         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2047             "%s: Failed to acquire per-link lock\n",
2048             __FUNCTION__));
2049 
2050         // Release the top-level lock
2051         nvlink_lib_top_lock_release();
2052 
2053         return status;
2054     }
2055 
2056     // all the sanity check passed, remove this internode connection from our context
2057     nvlink_core_remove_internode_conn(localLink);
2058 
2059     // Release the per-link lock
2060     nvlink_lib_link_locks_release(&localLink, 1);
2061 
2062     // Release the top-level lock
2063     nvlink_lib_top_lock_release();
2064 
2065     return NVL_SUCCESS;
2066 }
2067 
2068 /**
2069  * Train the intranode connection to the desired target state
2070  *
2071  * @param[in]  trainParams  IOCTL params
2072  *
2073  * return NvlStatus
2074  */
2075 static NvlStatus
nvlink_lib_ctrl_train_intranode_conn(nvlink_train_intranode_conn * trainParams)2076 nvlink_lib_ctrl_train_intranode_conn
2077 (
2078     nvlink_train_intranode_conn *trainParams
2079 )
2080 {
2081     nvlink_link           *srcLink      = NULL;
2082     nvlink_link           *dstLink      = NULL;
2083     nvlink_link           *initLinks[2] = {0};
2084     nvlink_intranode_conn *conn         = NULL;
2085     NvlStatus              status       = NVL_SUCCESS;
2086     NvU32                  count;
2087     NvU32                  i;
2088 
2089     // make sure that this call is for single node systems
2090     if (trainParams->srcEndPoint.nodeId != trainParams->dstEndPoint.nodeId)
2091     {
2092         return NVL_BAD_ARGS;
2093     }
2094 
2095     // Acquire the top-level lock
2096     status = nvlink_lib_top_lock_acquire();
2097     if (status != NVL_SUCCESS)
2098     {
2099         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2100             "%s: Failed to acquire top-level lock\n",
2101             __FUNCTION__));
2102 
2103         return status;
2104     }
2105 
2106     //
2107     // Top-level lock is now acquired. Proceed to traversing the device
2108     // and link lists
2109     //
2110 
2111     //
2112     // look-up the nvlink link objects. Look-up will fail if there is a
2113     // fabric node id mismatch. So an explicit check against self
2114     // node id is not required.
2115     //
2116     nvlink_core_get_link_by_endpoint(&trainParams->srcEndPoint, &srcLink);
2117     nvlink_core_get_link_by_endpoint(&trainParams->dstEndPoint, &dstLink);
2118 
2119     // we can't train if both ends are not found
2120     if ((srcLink == NULL) || (dstLink == NULL))
2121     {
2122         //
2123         // Couldn't find the endpoints registered in the core library. Release
2124         // the top-level lock and return
2125         //
2126         nvlink_lib_top_lock_release();
2127 
2128         return NVL_BAD_ARGS;
2129     }
2130 
2131     // look-up the nvlink connection object by source link
2132     nvlink_core_get_intranode_conn(srcLink, &conn);
2133     if (conn == NULL)
2134     {
2135         //
2136         // Couldn't find an associated connection for the 2 endpoints. Release
2137         // the top-level lock and return
2138         //
2139         nvlink_lib_top_lock_release();
2140 
2141         return NVL_BAD_ARGS;
2142     }
2143 
2144     //
2145     // we found the connection by the source link. Make sure that dest link is
2146     // indeed, the user specified one as well
2147     //
2148     if ((conn->end0 != dstLink) && (conn->end1 != dstLink))
2149     {
2150         //
2151         // The dest endpoint is not the remote end for the src endpoint. Release
2152         // the top-level lock and return
2153         //
2154         nvlink_lib_top_lock_release();
2155 
2156         return NVL_BAD_ARGS;
2157     }
2158 
2159     initLinks[0] = conn->end0;
2160     initLinks[1] = conn->end1;
2161 
2162     // If loopback then only pass in 1 link
2163     if (conn->end0 != conn->end1)
2164     {
2165         count = 2;
2166     }
2167     else
2168     {
2169         count = 1;
2170     }
2171 
2172     // Acquire the per-link locks
2173     status = nvlink_lib_link_locks_acquire(initLinks, 2);
2174     if (status != NVL_SUCCESS)
2175     {
2176         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2177             "%s: Failed to acquire per-link locks\n",
2178             __FUNCTION__));
2179 
2180         // Release the top-level lock
2181         nvlink_lib_top_lock_release();
2182 
2183         return status;
2184     }
2185 
2186     //
2187     // All the required per-link locks are now successfully acquired
2188     // Release the top level-lock
2189     //
2190     nvlink_lib_top_lock_release();
2191 
2192     // the connection looks sane, initiate the training
2193     switch (trainParams->trainTo)
2194     {
2195         case nvlink_train_conn_off_to_swcfg:
2196         {
2197             if (srcLink->version >= NVLINK_DEVICE_VERSION_40)
2198             {
2199                 // non-ALI training for NVLink4.0+
2200                 if (!srcLink->dev->enableALI)
2201                 {
2202                     nvlink_core_init_links_from_off_to_swcfg_non_ALI(
2203                                             initLinks, count, NVLINK_STATE_CHANGE_SYNC);
2204                 }
2205             }
2206             else
2207             {
2208                 // ALT training for NVLink3.0+
2209                 nvlink_core_init_links_from_off_to_swcfg(
2210                                             initLinks, count, NVLINK_STATE_CHANGE_SYNC);
2211             }
2212             break;
2213         }
2214         case nvlink_train_conn_swcfg_to_active:
2215         {
2216             if (srcLink->version >= NVLINK_DEVICE_VERSION_40)
2217             {
2218                 // non-ALI training for NVLink4.0+
2219                 if (!srcLink->dev->enableALI)
2220                 {
2221                     status = nvlink_core_train_intranode_conns_from_swcfg_to_active_non_ALI(
2222                                                      &conn, 1, NVLINK_STATE_CHANGE_SYNC);
2223                 }
2224             }
2225             else if (srcLink->version >= NVLINK_DEVICE_VERSION_30)
2226             {
2227                 // ALT training for NVLink3.0+
2228                 status = nvlink_core_train_intranode_conns_from_swcfg_to_active_ALT(
2229                                                  &conn, 1, NVLINK_STATE_CHANGE_SYNC);
2230             }
2231             else
2232             {
2233                 // Legacy training for pre-NVLink3.0
2234                 status = nvlink_core_train_intranode_conns_from_swcfg_to_active_legacy(
2235                                                  &conn, 1, NVLINK_STATE_CHANGE_SYNC);
2236             }
2237             break;
2238         }
2239         case nvlink_train_conn_active_to_swcfg:
2240         {
2241             status = nvlink_core_powerdown_intranode_conns_from_active_to_swcfg(
2242                                                  &conn, 1, NVLINK_STATE_CHANGE_SYNC);
2243             break;
2244         }
2245         case nvlink_train_conn_to_off:
2246         case nvlink_train_conn_swcfg_to_off:
2247         {
2248             status = nvlink_core_powerdown_intranode_conns_from_active_to_off(
2249                                                  &conn, 1, NVLINK_STATE_CHANGE_SYNC);
2250             if (status == NVL_SUCCESS)
2251             {
2252                 nvlink_core_reset_intranode_conns(&conn, 1, NVLINK_STATE_CHANGE_SYNC);
2253             }
2254             break;
2255         }
2256         case nvlink_train_conn_off_to_active_ali_non_blocking:
2257         case nvlink_train_conn_off_to_active_ali_blocking:
2258         {
2259            if (srcLink->version >= NVLINK_DEVICE_VERSION_40 &&
2260                srcLink->dev->enableALI)
2261             {
2262                 status = nvlink_core_train_intranode_conns_from_off_to_active_ALI(initLinks, count);
2263 
2264                 if (trainParams->trainTo == nvlink_train_conn_off_to_active_ali_blocking)
2265                 {
2266                     NvU32 timeout = NVLINK_TRANSITION_HS_TIMEOUT;
2267                     do
2268                     {
2269                         nvlink_sleep(1);
2270                         status = nvlink_core_train_check_link_ready_ALI(initLinks, count);
2271                         if (status == NVL_SUCCESS)
2272                         {
2273                             break;
2274                         }
2275 
2276                         timeout--;
2277                     } while(timeout > 0);
2278 
2279                     if (status == NVL_SUCCESS)
2280                     {
2281                         for ( i = 0; i < count; ++i)
2282                         {
2283                             //
2284                             // NVLINK_LINKSTATE_TRAFFIC_SETUP will make sure a request to active completes before
2285                             // setting buffer ready so use the internal check to see if the request for ALI completed
2286                             //
2287                             (void)initLinks[i]->link_handlers->set_dl_link_mode(initLinks[i], NVLINK_LINKSTATE_TRAFFIC_SETUP, 0);
2288                         }
2289                 }
2290             }
2291             }
2292             break;
2293         }
2294         default:
2295         {
2296             status = NVL_BAD_ARGS;
2297             break;
2298         }
2299     }
2300 
2301     //
2302     // always get the latest link state values so that
2303     // user has additional information other than just the return value.
2304     //
2305     nvlink_core_get_endpoint_state(conn->end0, &trainParams->srcEndState);
2306     nvlink_core_get_endpoint_state(conn->end1, &trainParams->dstEndState);
2307 
2308     // Release the per-link locks
2309     nvlink_lib_link_locks_release(initLinks, 2);
2310 
2311     return status;
2312 }
2313 
2314 /**
2315  * Train the intranode connections in parallel to the desired target state
2316  *
2317  * @param[in]  trainParams  IOCTL params
2318  *
2319  * return NvlStatus
2320  */
static NvlStatus
nvlink_lib_ctrl_train_intranode_conns_parallel
(
    nvlink_train_intranode_conns_parallel *trainParams
)
{
    nvlink_link            *srcLink    = NULL;
    nvlink_link            *dstLink    = NULL;
    // NOTE(review): trainLinks is allocated and populated but never consumed
    // below; possibly vestigial — confirm before removing.
    nvlink_link           **trainLinks = NULL;
    // Flattened list of link endpoints used for locking and initialization
    nvlink_link           **initLinks  = NULL;
    // Resolved connection objects, one per requested endpoint pair
    nvlink_intranode_conn **conns      = NULL;
    NvU32                   numConns   = 0;
    NvlStatus               status     = NVL_SUCCESS;
    NvU32                   i;
    NvU32                   count = 0;

    // sanity check endPointPairsCount
    if (trainParams->endPointPairsCount > NVLINK_MAX_PARALLEL_CONNS_TRAIN_COUNT)
    {
        return NVL_BAD_ARGS;
    }

    //
    // sanity check the input parms
    // make sure that this call is for single node systems
    //
    numConns = trainParams->endPointPairsCount;
    for (i = 0; i < numConns; i++)
    {
        if (trainParams->endPointPairs[i].src.nodeId !=
            trainParams->endPointPairs[i].dst.nodeId)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: Node index 0x%x with mis-match ids (src:0x%x dst:0x%x).\n",
                __FUNCTION__ , i,
                trainParams->endPointPairs[i].src.nodeId,
                trainParams->endPointPairs[i].dst.nodeId));

            return NVL_BAD_ARGS;
        }
        //
        // A pair whose src and dst are the same link (loopback) contributes
        // one entry to initLinks; a normal pair contributes two.
        // NOTE(review): PCI domain is not part of this comparison — assumes
        // bus/device/function plus linkIndex uniquely identify the endpoint
        // here; confirm for multi-domain systems.
        //
        if ((trainParams->endPointPairs[i].src.pciInfo.bus == trainParams->endPointPairs[i].dst.pciInfo.bus) &&
            (trainParams->endPointPairs[i].src.pciInfo.device == trainParams->endPointPairs[i].dst.pciInfo.device) &&
            (trainParams->endPointPairs[i].src.pciInfo.function == trainParams->endPointPairs[i].dst.pciInfo.function) &&
            (trainParams->endPointPairs[i].src.linkIndex == trainParams->endPointPairs[i].dst.linkIndex))
        {
            count++;
        }
        else
        {
            count = count + 2;
        }
    }

    // Allocate space for the connection list
    conns = (nvlink_intranode_conn **)nvlink_malloc(
                            sizeof(nvlink_intranode_conn *) * numConns);
    if (conns == NULL)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to allocate space for connections list\n",
            __FUNCTION__));

        status = NVL_ERR_GENERIC;
        goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
    }

    // Allocate space for the links list for link initialization
    initLinks = (nvlink_link **)nvlink_malloc(sizeof(nvlink_link *) * count);
    if (initLinks == NULL)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to allocate space for links list for link initialization\n",
            __FUNCTION__));

        status = NVL_ERR_GENERIC;
        goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
    }

    // Allocate space for the links list for link training
    trainLinks = (nvlink_link **)nvlink_malloc(sizeof(nvlink_link *) * numConns);
    if (trainLinks == NULL)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to allocate space for links list for link training\n",
            __FUNCTION__));

        status = NVL_ERR_GENERIC;
        goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
    }

    nvlink_memset(conns, 0, sizeof(nvlink_intranode_conn *) * numConns);
    nvlink_memset(initLinks,  0, sizeof(nvlink_link *) * count);
    nvlink_memset(trainLinks, 0, sizeof(nvlink_link *) * numConns);

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device and
    // link lists and connections list
    //
    // count is rebuilt below from the links actually resolved
    count = 0;
    // Get all the connections associated with the list of links
    for (i = 0; i < numConns; i++)
    {
        //
        // look-up the nvlink link objects. Look-up will fail if there is a
        // fabric node id mismatch. So an explicit check against self
        // node id is not required.
        //
        srcLink = NULL;
        dstLink = NULL;

        nvlink_core_get_link_by_endpoint(&trainParams->endPointPairs[i].src, &srcLink);
        nvlink_core_get_link_by_endpoint(&trainParams->endPointPairs[i].dst, &dstLink);

        // we can't train if both ends of a pair not found
        if ((srcLink == NULL) || (dstLink == NULL))
        {
            //
            // Couldn't find the endpoints registered in the core library. Release
            // the top-level lock and return
            //
            nvlink_lib_top_lock_release();

            status = NVL_BAD_ARGS;
            goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
        }

        // look-up the nvlink connection object by source link
        nvlink_core_get_intranode_conn(srcLink, &conns[i]);
        if (conns[i] == NULL)
        {
            //
            // Couldn't find an associated connection for the 2 endpoints. Release
            // the top-level lock and return
            //
            nvlink_lib_top_lock_release();

            status = NVL_BAD_ARGS;
            goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
        }

        //
        // we found the connection by source link. Make sure that dest link is
        // indeed, the user specified one as well
        //
        if ((conns[i]->end0 != dstLink) && (conns[i]->end1 != dstLink))
        {
            //
            // The dest endpoint is not the remote end for the src endpoint. Release
            // the top-level lock and return
            //
            nvlink_lib_top_lock_release();

            status = NVL_BAD_ARGS;
            goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
        }
        // Loopback pair: the single link appears once in initLinks
        if (srcLink == dstLink)
        {
            initLinks[count] = srcLink;
            count++;
        }
        else
        {
            initLinks[count]       = srcLink;
            initLinks[count + 1]   = dstLink;
            count = count + 2;
        }
        trainLinks[i]      = srcLink;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(initLinks, count);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    // Check all the links captured have version >= 3.0
    for (i = 0; i < numConns; i++)
    {
        // Parallel training allowed NvLink 3.0 & above
        if ((conns[i]->end0->version < NVLINK_DEVICE_VERSION_30) ||
            (conns[i]->end1->version < NVLINK_DEVICE_VERSION_30))
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: Parallel training not allowed with nvlink version 0x%x indexed 0x%x\n",
                __FUNCTION__ ,
                conns[i]->end0->version, i));

            //
            // Parallel training is allowed for only NVLink 3.0 and above. Release
            // the per link locks and return
            //
            nvlink_lib_link_locks_release(initLinks, count);

            status = NVL_BAD_ARGS;
            goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
        }
    }

    //
    // the connection looks sane, initiate the training
    // NOTE(review): the version/enableALI dispatch below reads srcLink, which
    // after the loop refers to the LAST endpoint pair — assumes all pairs
    // belong to the same device generation; confirm with callers.
    //
    switch (trainParams->trainTo)
    {
        case nvlink_train_conn_off_to_swcfg:
        {
            if (srcLink->version >= NVLINK_DEVICE_VERSION_40)
            {
                // non-ALI training for NVLink4.0+
                if (!srcLink->dev->enableALI)
                {
                    nvlink_core_init_links_from_off_to_swcfg_non_ALI(
                                            initLinks, count, NVLINK_STATE_CHANGE_SYNC);
                }
            }
            else
            {
                // ALT training for NVLink3.0+
                nvlink_core_init_links_from_off_to_swcfg(
                                            initLinks, count, NVLINK_STATE_CHANGE_SYNC);
            }
            break;
        }
        case nvlink_train_conn_swcfg_to_active:
        {
            if (srcLink->version >= NVLINK_DEVICE_VERSION_40)
            {
                // non-ALI training for NVLink4.0+
                if (!srcLink->dev->enableALI)
                {
                    status = nvlink_core_train_intranode_conns_from_swcfg_to_active_non_ALI(
                                                        conns, numConns, NVLINK_STATE_CHANGE_SYNC);
                }
            }
            else
            {
                // ALT training for NVLink3.0+
                status = nvlink_core_train_intranode_conns_from_swcfg_to_active_ALT(
                                                        conns, numConns, NVLINK_STATE_CHANGE_SYNC);
            }
            break;
        }
        case nvlink_train_conn_active_to_swcfg:
        {
            status = nvlink_core_powerdown_intranode_conns_from_active_to_swcfg(
                                                    conns, numConns, NVLINK_STATE_CHANGE_SYNC);
            break;
        }
        case nvlink_train_conn_to_off:
        case nvlink_train_conn_swcfg_to_off:
        {
            status = nvlink_core_powerdown_intranode_conns_from_active_to_off(
                                                    conns, numConns, NVLINK_STATE_CHANGE_SYNC);
            if (status == NVL_SUCCESS)
            {
                // Power-down succeeded; reset the connections as well
                nvlink_core_reset_intranode_conns(conns, numConns, NVLINK_STATE_CHANGE_SYNC);
            }
            break;
        }
        case nvlink_train_conn_off_to_active_ali_non_blocking:
        case nvlink_train_conn_off_to_active_ali_blocking:
        {
            // ALI training is only defined for NVLink4.0+ with ALI enabled
            if (srcLink->version >= NVLINK_DEVICE_VERSION_40 &&
                srcLink->dev->enableALI)
            {
                status = nvlink_core_train_intranode_conns_from_off_to_active_ALI(
                                                 initLinks, count);

                if (trainParams->trainTo == nvlink_train_conn_off_to_active_ali_blocking)
                {
                    // Blocking variant: poll until links report ready or timeout lapses
                    NvU32 timeout = NVLINK_TRANSITION_HS_TIMEOUT;
                    do
                    {
                        nvlink_sleep(1);
                        status = nvlink_core_train_check_link_ready_ALI(initLinks, count);
                        if (status == NVL_SUCCESS)
                        {
                            break;
                        }

                        timeout--;
                    } while(timeout > 0);

                    if (status == NVL_SUCCESS)
                    {
                        for ( i = 0; i < count; ++i)
                        {
                            //
                            // NVLINK_LINKSTATE_TRAFFIC_SETUP will make sure a request to active completes before
                            // setting buffer ready so use the internal check to see if the request for ALI completed
                            //
                            (void)initLinks[i]->link_handlers->set_dl_link_mode(initLinks[i], NVLINK_LINKSTATE_TRAFFIC_SETUP, 0);
                        }
                }
            }
            }
            break;
        }
        default:
        {
            status = NVL_BAD_ARGS;
            break;
        }
    }

    //
    // always get the latest link state values when the args are verified
    // so that user has additional information other than just the return value.
    //
    for (i = 0; i < numConns; i++)
    {
        nvlink_core_get_endpoint_state(conns[i]->end0, &trainParams->endpointPairsStates[i].srcEnd);
        nvlink_core_get_endpoint_state(conns[i]->end1, &trainParams->endpointPairsStates[i].dstEnd);
    }

    // Release the per-link locks
    nvlink_lib_link_locks_release(initLinks, count);

// Common exit: frees the three scratch arrays on every path (NULL-safe)
nvlink_lib_ctrl_train_intranode_conns_parallel_end:

    if (conns != NULL)
    {
        nvlink_free((void *)conns);
    }

    if (initLinks != NULL)
    {
        nvlink_free((void *)initLinks);
    }

    if (trainLinks != NULL)
    {
        nvlink_free((void *)trainLinks);
    }

    return status;
}
2680 
2681 /**
2682  * Train the internode connection link to the target state
2683  *
2684  * @param[in]  linkParams  IOCTL params
2685  *
2686  * return NvlStatus
2687  */
2688 static NvlStatus
nvlink_lib_ctrl_train_internode_conn_link(nvlink_train_internode_conn_link * linkParams)2689 nvlink_lib_ctrl_train_internode_conn_link
2690 (
2691     nvlink_train_internode_conn_link *linkParams
2692 )
2693 {
2694     nvlink_link           *localLink = NULL;
2695     NvlStatus              status    = NVL_SUCCESS;
2696     nvlink_internode_conn *interConn = NULL;
2697 
2698     // Acquire the top-level lock
2699     status = nvlink_lib_top_lock_acquire();
2700     if (status != NVL_SUCCESS)
2701     {
2702         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2703             "%s: Failed to acquire top-level lock\n",
2704             __FUNCTION__));
2705 
2706         return status;
2707     }
2708 
2709     //
2710     // Top-level lock is now acquired. Proceed to traversing the device and
2711     // link lists and connections list
2712     //
2713 
2714     //
2715     // look-up the nvlink link objects. Look-up will fail if there is a
2716     // fabric node id mismatch. So an explicit check against self
2717     // node id is not required.
2718     //
2719     nvlink_core_get_link_by_endpoint(&linkParams->localEndPoint, &localLink);
2720 
2721     // user specified link is not available
2722     if (localLink == NULL)
2723     {
2724         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2725             "%s: Internode connection link train request with no matching local endpoint\n",
2726             __FUNCTION__));
2727 
2728         //
2729         // Couldn't find the endpoint registered in the core library. Release
2730         // the top-level lock and return
2731         //
2732         nvlink_lib_top_lock_release();
2733 
2734         return NVL_BAD_ARGS;
2735     }
2736 
2737     nvlink_core_get_internode_conn(localLink, &interConn);
2738     if (interConn == NULL)
2739     {
2740         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2741             "%s: No Internode connection exists for local endpoint %s: %s.\n",
2742             __FUNCTION__, localLink->dev->deviceName, localLink->linkName));
2743 
2744         //
2745         // Couldn't find an associated connection for the endpoint. Release
2746         // the top-level lock and return
2747         //
2748         nvlink_lib_top_lock_release();
2749 
2750         return NVL_BAD_ARGS;
2751     }
2752 
2753     // Acquire the per-link lock
2754     status = nvlink_lib_link_locks_acquire(&localLink, 1);
2755     if (status != NVL_SUCCESS)
2756     {
2757         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2758             "%s: Failed to acquire per-link locks\n",
2759             __FUNCTION__));
2760 
2761         // Release the top-level lock
2762         nvlink_lib_top_lock_release();
2763 
2764         return status;
2765     }
2766 
2767     //
2768     // All the required per-link locks are now successfully acquired
2769     // Release the top level-lock
2770     //
2771     nvlink_lib_top_lock_release();
2772 
2773 
2774     switch (linkParams->trainTo)
2775     {
2776         case nvlink_train_link_off_to_swcfg:
2777         {
2778             // OFF to SAFE is part of initialization sequence as of now.
2779             status = NVL_BAD_ARGS;
2780             break;
2781         }
2782         case nvlink_train_link_swcfg_to_active:
2783         {
2784             status = nvlink_core_train_internode_conns_from_swcfg_to_active(
2785                         &interConn, 1, &linkParams->isMasterEnd, NVLINK_STATE_CHANGE_SYNC);
2786             break;
2787         }
2788         case nvlink_train_link_to_off:
2789         {
2790             // OFF state transitions are not supported/tested
2791             status = NVL_BAD_ARGS;
2792             break;
2793         }
2794         case nvlink_train_link_active_to_swcfg:
2795         {
2796             // not implemented/supported now
2797             status =  NVL_BAD_ARGS;
2798             break;
2799         }
2800         case nvlink_train_link_swcfg_to_off:
2801         {
2802             // OFF state transitions are not supported/tested
2803             status = NVL_BAD_ARGS;
2804             break;
2805         }
2806         default:
2807         {
2808             status = NVL_BAD_ARGS;
2809             break;
2810         }
2811     }
2812 
2813     //
2814     // always get the latest link state values so that
2815     // user has additional information other than just the return value.
2816     //
2817     nvlink_core_get_endpoint_state(localLink, &linkParams->localEndState);
2818 
2819     // Release the per-link lock
2820     nvlink_lib_link_locks_release(&localLink, 1);
2821 
2822     return status;
2823 }
2824 
2825 /*
2826  * Train the internode connection sublink to the target state
2827  *
2828  * @param[in]  subLinkParams  IOCTL params
2829  *
2830  * return NvlStatus
2831  */
2832 static NvlStatus
nvlink_lib_ctrl_train_internode_conn_sublink(nvlink_train_internode_conn_sublink * subLinkParams)2833 nvlink_lib_ctrl_train_internode_conn_sublink
2834 (
2835     nvlink_train_internode_conn_sublink *subLinkParams
2836 )
2837 {
2838     nvlink_link           *localLink = NULL;
2839     NvlStatus              status    = NVL_SUCCESS;
2840     nvlink_internode_conn *interConn = NULL;
2841 
2842     // Acquire the top-level lock
2843     status = nvlink_lib_top_lock_acquire();
2844     if (status != NVL_SUCCESS)
2845     {
2846         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2847             "%s: Failed to acquire top-level lock\n",
2848             __FUNCTION__));
2849 
2850         return status;
2851     }
2852 
2853     //
2854     // Top-level lock is now acquired. Proceed to traversing the device and
2855     // link lists and connections list
2856     //
2857 
2858     //
2859     // look-up the nvlink link objects. Look-up will fail if there is a
2860     // fabric node id mismatch. So an explicit check against self
2861     // node id is not required.
2862     //
2863     nvlink_core_get_link_by_endpoint(&subLinkParams->localEndPoint, &localLink);
2864 
2865     // user specified link is not available
2866     if (localLink == NULL)
2867     {
2868         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2869             "%s: Internode connection sublink train request with no matching local endpoint\n",
2870             __FUNCTION__));
2871 
2872         //
2873         // Couldn't find the endpoint registered in the core library. Release
2874         // the top-level lock and return
2875         //
2876         nvlink_lib_top_lock_release();
2877 
2878         return NVL_BAD_ARGS;
2879     }
2880 
2881     nvlink_core_get_internode_conn(localLink, &interConn);
2882     if (interConn == NULL)
2883     {
2884         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2885             "%s: No Internode connection exists for local endpoint %s: %s.\n",
2886             __FUNCTION__, localLink->dev->deviceName, localLink->linkName));
2887 
2888         //
2889         // Couldn't find an associated connection for the endpoint. Release
2890         // the top-level lock and return
2891         //
2892         nvlink_lib_top_lock_release();
2893 
2894         return NVL_BAD_ARGS;
2895     }
2896 
2897     // Acquire the per-link lock
2898     status = nvlink_lib_link_locks_acquire(&localLink, 1);
2899     if (status != NVL_SUCCESS)
2900     {
2901         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2902             "%s: Failed to acquire per-link locks\n",
2903             __FUNCTION__));
2904 
2905         // Release the top-level lock
2906         nvlink_lib_top_lock_release();
2907 
2908         return status;
2909     }
2910 
2911     //
2912     // All the required per-link locks are now successfully acquired
2913     // Release the top level-lock
2914     //
2915     nvlink_lib_top_lock_release();
2916 
2917     switch (subLinkParams->trainTo)
2918     {
2919         case nvlink_train_sublink_off_to_safe:
2920         {
2921             // OFF to SAFE is part of initialization sequence as of now.
2922             status = NVL_BAD_ARGS;
2923             break;
2924         }
2925         case nvlink_train_sublink_safe_to_hs:
2926         {
2927             // NVLink 3.0 onwards this is handled through INITOPTIMIZE
2928             if (localLink->version >= NVLINK_DEVICE_VERSION_30)
2929             {
2930                 return NVL_ERR_NOT_SUPPORTED;
2931             }
2932             status = nvlink_core_train_internode_conn_sublink_from_safe_to_hs(
2933                          interConn, NVLINK_STATE_CHANGE_SYNC);
2934             break;
2935         }
2936         case nvlink_train_sublink_to_off:
2937         {
2938             // OFF state transitions are not supported/tested
2939             status = NVL_BAD_ARGS;
2940             break;
2941         }
2942         case nvlink_train_sublink_hs_to_safe:
2943         {
2944             // not implemented/supported now
2945             status = NVL_BAD_ARGS;
2946             break;
2947         }
2948         case nvlink_train_sublink_safe_to_off:
2949         {
2950             // OFF state transitions are not supported/tested
2951             status = NVL_BAD_ARGS;
2952             break;
2953         }
2954         default:
2955         {
2956             status = NVL_BAD_ARGS;
2957             break;
2958         }
2959     }
2960 
2961     //
2962     // always get the latest link state values so that
2963     // user has additional information other than just the return value.
2964     //
2965     nvlink_core_get_endpoint_state(localLink, &subLinkParams->localEndState);
2966 
2967     // Release the per-link lock
2968     nvlink_lib_link_locks_release(&localLink, 1);
2969 
2970     return status;
2971 }
2972 
2973 /**
2974  * Send INITOPTIMIZE on the given internode links
2975  *
2976  * @param[in]  initoptimizeParams  IOCTL params
2977  *
2978  * return NvlStatus
2979  */
2980 static NvlStatus
nvlink_lib_ctrl_train_internode_links_initoptimize(nvlink_train_internode_links_initoptimize * initoptimizeParams)2981 nvlink_lib_ctrl_train_internode_links_initoptimize
2982 (
2983     nvlink_train_internode_links_initoptimize *initoptimizeParams
2984 )
2985 {
2986     nvlink_link  *endpoint  = NULL;
2987     NvlStatus     status    = NVL_SUCCESS;
2988     NvU32         numLinks  = 0;
2989     NvU32         i         = 0;
2990 
2991     nvlink_link   **links = (nvlink_link **)nvlink_malloc(
2992                             sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
2993     if (links == NULL)
2994     {
2995         return NVL_NO_MEM;
2996     }
2997 
2998     if (initoptimizeParams->endPointCount > NVLINK_MAX_NVLINK_ENDPOINTS)
2999     {
3000         nvlink_free((void *)links);
3001         return NVL_BAD_ARGS;
3002     }
3003 
3004     // Acquire the top-level lock
3005     status = nvlink_lib_top_lock_acquire();
3006     if (status != NVL_SUCCESS)
3007     {
3008         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3009             "%s: Failed to acquire top-level lock\n",
3010             __FUNCTION__));
3011 
3012         nvlink_free((void *)links);
3013         return status;
3014     }
3015 
3016     //
3017     // Top-level lock is now acquired. Proceed to traversing the device and
3018     // link lists and connections list
3019     //
3020 
3021     for (i = 0; i < initoptimizeParams->endPointCount; i++)
3022     {
3023         endpoint = NULL;
3024         nvlink_core_get_link_by_endpoint(&initoptimizeParams->endPoints[i], &endpoint);
3025 
3026         // we can't send INITOPTIMIZE if the endpoint is not found
3027         if (endpoint == NULL)
3028         {
3029             //
3030             // Couldn't find the endpoint registered in the core library. Release
3031             // the top-level lock and return
3032             //
3033             nvlink_lib_top_lock_release();
3034 
3035             nvlink_free((void *)links);
3036             return NVL_BAD_ARGS;
3037         }
3038         else if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
3039         {
3040             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3041                 "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
3042                 __FUNCTION__));
3043 
3044             nvlink_assert(0);
3045 
3046             // Release the top-level lock and free links
3047             nvlink_lib_top_lock_release();
3048             nvlink_free((void *)links);
3049             return NVL_ERR_INVALID_STATE;
3050         }
3051 
3052         links[numLinks] = endpoint;
3053         numLinks++;
3054     }
3055 
3056     // Acquire the per-link locks
3057     status = nvlink_lib_link_locks_acquire(links, numLinks);
3058     if (status != NVL_SUCCESS)
3059     {
3060         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3061             "%s: Failed to acquire per-link locks\n",
3062             __FUNCTION__));
3063 
3064         // Release the top-level lock
3065         nvlink_lib_top_lock_release();
3066 
3067         nvlink_free((void *)links);
3068         return status;
3069     }
3070 
3071     //
3072     // All the required per-link locks are now successfully acquired
3073     // Release the top level-lock
3074     //
3075     nvlink_lib_top_lock_release();
3076 
3077     for (i = 0; i < numLinks; i++)
3078     {
3079         // INITOPTIMIZE is not supported before NVLink 3.0
3080         if (links[i]->version < NVLINK_DEVICE_VERSION_30)
3081             continue;
3082 
3083         // Continue if the link is already active, nothing to do
3084         if ((nvlink_core_check_link_state(links[i], NVLINK_LINKSTATE_HS)) &&
3085             (nvlink_core_check_tx_sublink_state(links[i], NVLINK_SUBLINK_STATE_TX_HS)) &&
3086             (nvlink_core_check_rx_sublink_state(links[i], NVLINK_SUBLINK_STATE_RX_HS)))
3087         {
3088             continue;
3089         }
3090 
3091         //
3092         // For INITOPTIMIZE, link should be in SWCFG, else flag error and continue
3093         // to next link
3094         //
3095         if (!((nvlink_core_check_link_state(links[i], NVLINK_LINKSTATE_SAFE)) &&
3096               (nvlink_core_check_tx_sublink_state(links[i], NVLINK_SUBLINK_STATE_TX_SAFE)) &&
3097               (nvlink_core_check_rx_sublink_state(links[i], NVLINK_SUBLINK_STATE_RX_SAFE))))
3098         {
3099             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3100                 "%s: INITOPTIMIZE only works for links in SAFE %s:%s.\n",
3101                 __FUNCTION__, links[i]->dev->deviceName, links[i]->linkName));
3102             continue;
3103         }
3104 
3105         status = links[i]->link_handlers->set_dl_link_mode(links[i],
3106                                                            NVLINK_LINKSTATE_INITOPTIMIZE,
3107                                                            NVLINK_STATE_CHANGE_ASYNC);
3108 
3109         // Although it failed we need to continue on other links.
3110         if (status != NVL_SUCCESS)
3111         {
3112             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3113                 "%s: INITOPTIMIZE failed on Device:Link %s:%s\n",
3114                 __FUNCTION__, links[i]->dev->deviceName, links[i]->linkName));
3115         }
3116     }
3117 
3118     // Release the per-link locks
3119     nvlink_lib_link_locks_release(links, numLinks);
3120 
3121     if (links != NULL)
3122     {
3123         nvlink_free((void *)links);
3124     }
3125     return NVL_SUCCESS;
3126 }
3127 
3128 /**
3129  * Send POSTINITOPTIMIZE on the given internode links
3130  *
3131  * @param[in]  initoptimizeParams  IOCTL params
3132  *
3133  * return NvlStatus
3134  */
static NvlStatus
nvlink_lib_ctrl_train_internode_links_post_initoptimize
(
    nvlink_train_internode_links_post_initoptimize *postinitoptimizeParams
)
{
    nvlink_link  *endpoint  = NULL;
    NvlStatus     status    = NVL_SUCCESS;
    NvU32         numLinks  = 0;
    NvU32         i         = 0;

    // Scratch array holding the link objects resolved from the endpoint list
    nvlink_link   **links = (nvlink_link **)nvlink_malloc(
                            sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
    {
        return NVL_NO_MEM;
    }

    // Reject endpoint counts beyond what the ioctl struct can carry
    if (postinitoptimizeParams->endPointCount > NVLINK_MAX_NVLINK_ENDPOINTS)
    {
        nvlink_free((void *)links);
        return NVL_BAD_ARGS;
    }

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        nvlink_free((void *)links);
        return status;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device and
    // link lists and connections list
    //

    for (i = 0; i < postinitoptimizeParams->endPointCount; i++)
    {
        endpoint = NULL;
        nvlink_core_get_link_by_endpoint(&postinitoptimizeParams->endPoints[i], &endpoint);

        // we can't send POST_INITOPTIMIZE if the endpoint is not found
        if (endpoint == NULL)
        {
            //
            // Couldn't find the endpoint registered in the core library. Release
            // the top-level lock and return
            //
            nvlink_lib_top_lock_release();

            nvlink_free((void *)links);
            return NVL_BAD_ARGS;
        }
        else if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
        {
            // Scratch-array bound guard; should be unreachable
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
                __FUNCTION__));

            nvlink_assert(0);

            // Release the top-level lock and free links
            nvlink_lib_top_lock_release();
            nvlink_free((void *)links);
            return NVL_ERR_INVALID_STATE;
        }

        links[numLinks] = endpoint;
        numLinks++;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    for (i = 0; i < numLinks; i++)
    {
        // POST_INITOPTIMIZE is not supported before NVLink 3.0
        if (links[i]->version < NVLINK_DEVICE_VERSION_30)
            continue;

        // Continue if the link is already active, nothing to do
        if ((nvlink_core_check_link_state(links[i], NVLINK_LINKSTATE_HS)) &&
            (nvlink_core_check_tx_sublink_state(links[i], NVLINK_SUBLINK_STATE_TX_HS)) &&
            (nvlink_core_check_rx_sublink_state(links[i], NVLINK_SUBLINK_STATE_RX_HS)))
        {
            continue;
        }

        status = links[i]->link_handlers->set_dl_link_mode(links[i],
                                                NVLINK_LINKSTATE_POST_INITOPTIMIZE,
                                                NVLINK_STATE_CHANGE_ASYNC);

        // Although it failed we need to continue on other links.
        if (status != NVL_SUCCESS)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: POST_INITOPTIMIZE failed on Device:Link %s:%s\n",
                __FUNCTION__, links[i]->dev->deviceName, links[i]->linkName));
        }
    }

    // Release the per-link locks
    nvlink_lib_link_locks_release(links, numLinks);

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }
    // Per-link failures are logged above, not reported to the caller
    return NVL_SUCCESS;
}
3268 
3269 /**
3270  * Train the internode connection links to the target state
3271  *
3272  * @param[in]  linkParams  IOCTL params
3273  *
3274  * return NvlStatus
3275  */
static NvlStatus
nvlink_lib_ctrl_train_internode_conns_parallel
(
    nvlink_train_internode_conns_parallel *linkParams
)
{
    nvlink_link            *localLink  = NULL;
    NvlStatus               status     = NVL_SUCCESS;
    NvU32                   numLinks   = 0;
    NvU32                   i          = 0;
    nvlink_link           **links      = NULL;
    nvlink_internode_conn **interConns = NULL;

    //
    // NOTE(review): both scratch arrays are sized NVLINK_MAX_SYSTEM_LINK_NUM
    // but are indexed by up to localEndPointCount, which is only bounded by
    // NVLINK_MAX_PARALLEL_CONNS_TRAIN_COUNT below. This assumes
    // NVLINK_MAX_PARALLEL_CONNS_TRAIN_COUNT <= NVLINK_MAX_SYSTEM_LINK_NUM —
    // verify against the header definitions.
    //
    links = (nvlink_link **)nvlink_malloc(
                    sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
    {
        status = NVL_NO_MEM;
        goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
    }

    interConns = (nvlink_internode_conn **)nvlink_malloc(
                    sizeof(nvlink_internode_conn *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (interConns == NULL)
    {
        status = NVL_NO_MEM;
        goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
    }

    // Reject endpoint counts beyond the ioctl's parallel-train limit
    if (linkParams->localEndPointCount > NVLINK_MAX_PARALLEL_CONNS_TRAIN_COUNT)
    {
        status = NVL_BAD_ARGS;
        goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
    }

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device and
    // link lists and connections list
    //
    for (i = 0; i < linkParams->localEndPointCount; i++)
    {
        //
        // look-up the nvlink link objects. Look-up will fail if there is a
        // fabric node id mismatch. So an explicit check against self
        // node id is not required.
        //
        nvlink_core_get_link_by_endpoint(&linkParams->localEndPoints[i], &localLink);

        // user specified link is not available
        if (localLink == NULL)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: Internode connection link train request with no matching local endpoint\n",
                __FUNCTION__));

            //
            // Couldn't find the endpoint registered in the core library. Release
            // the top-level lock and return
            //
            nvlink_lib_top_lock_release();

            status = NVL_BAD_ARGS;
            goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
        }

        // Record the endpoint's internode connection at the same index
        nvlink_core_get_internode_conn(localLink, &(interConns[i]));
        if (interConns[i] == NULL)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: No Internode connection exists for local endpoint %s: %s.\n",
                __FUNCTION__, localLink->dev->deviceName, localLink->linkName));

            //
            // Couldn't find an associated connection for the endpoint. Release
            // the top-level lock and return
            //
            nvlink_lib_top_lock_release();

            status = NVL_BAD_ARGS;
            goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
        }

        // links[] and interConns[] stay index-aligned: numLinks == i here
        links[numLinks] = localLink;
        numLinks++;
    }

    // Acquire the per-link lock
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    // Only SWCFG -> ACTIVE is honored; every other transition is rejected
    switch (linkParams->trainTo)
    {
        case nvlink_train_link_off_to_swcfg:
        {
            // OFF to SAFE is part of initialization sequence as of now.
            status = NVL_BAD_ARGS;
            break;
        }
        case nvlink_train_link_swcfg_to_active:
        {
            status = nvlink_core_train_internode_conns_from_swcfg_to_active(
                        interConns, numLinks, linkParams->isMasterEnd, NVLINK_STATE_CHANGE_SYNC);
            break;
        }
        case nvlink_train_link_to_off:
        {
            // OFF state transitions are not supported/tested
            status = NVL_BAD_ARGS;
            break;
        }
        case nvlink_train_link_active_to_swcfg:
        {
            // not implemented/supported now
            status =  NVL_BAD_ARGS;
            break;
        }
        case nvlink_train_link_swcfg_to_off:
        {
            // OFF state transitions are not supported/tested
            status = NVL_BAD_ARGS;
            break;
        }
        default:
        {
            status = NVL_BAD_ARGS;
            break;
        }
    }

    for (i = 0; i < numLinks; i++)
    {

        //
        // always get the latest link state values so that
        // user has additional information other than just the return value.
        //
        nvlink_core_get_endpoint_state(links[i], &linkParams->localEndStates[i]);
    }

    // Release the per-link lock
    nvlink_lib_link_locks_release(links, numLinks);

// Common exit: reached both on success and on every error path above
nvlink_lib_ctrl_train_internode_conns_parallel_end:

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }
    if (interConns != NULL)
    {
        nvlink_free((void *)interConns);
    }

    return status;
}
3458 
3459 /**
3460  * Get the device information for all registered devices
3461  *
3462  * @param[in]  infoParams  IOCTL params
3463  *
3464  * return NvlStatus
3465  */
3466 static NvlStatus
nvlink_lib_ctrl_get_devices_info(nvlink_get_devices_info * infoParams)3467 nvlink_lib_ctrl_get_devices_info
3468 (
3469     nvlink_get_devices_info *infoParams
3470 )
3471 {
3472     nvlink_device *dev        = NULL;
3473     NvlStatus      status     = NVL_SUCCESS;
3474     NvU32          numDevices = 0;
3475 
3476     // Initialize number of devices to 0
3477     infoParams->numDevice = 0;
3478 
3479     // Acquire the top-level lock
3480     status = nvlink_lib_top_lock_acquire();
3481     if (status != NVL_SUCCESS)
3482     {
3483         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3484             "%s: Failed to acquire top-level lock\n",
3485             __FUNCTION__));
3486 
3487         return status;
3488     }
3489 
3490     //
3491     // Top-level lock is now acquired. Proceed to traversing the device and
3492     // link lists and connections list
3493     //
3494 
3495     FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
3496     {
3497         // total number of devices should be within NVLINK_DEVICE_INSTANCE_MAX
3498         if (numDevices >= NVLINK_DEVICE_INSTANCE_MAX)
3499         {
3500             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3501                 "%s: numDevices >= NVLINK_DEVICE_INSTANCE_MAX",
3502                 __FUNCTION__));
3503 
3504             nvlink_assert(0);
3505             nvlink_lib_top_lock_release();
3506             return NVL_ERR_INVALID_STATE;
3507         }
3508 
3509         // copy device information
3510         nvlink_core_copy_device_info(dev, &infoParams->devInfo[numDevices]);
3511         numDevices++;
3512     }
3513 
3514     infoParams->numDevice = numDevices;
3515 
3516     // Release the top-level lock
3517     nvlink_lib_top_lock_release();
3518 
3519     return status;
3520 }
3521 
3522 static NvlStatus
nvlink_lib_ctrl_acquire_capability(nvlink_ioctrl_params * ctrlParams,nvlink_acquire_capability * capParams)3523 nvlink_lib_ctrl_acquire_capability
3524 (
3525     nvlink_ioctrl_params      *ctrlParams,
3526     nvlink_acquire_capability *capParams
3527 )
3528 {
3529     NvlStatus status;
3530 
3531     if (capParams == NULL)
3532     {
3533         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3534             "%s: Bad ioctl capability ctrl params specified.\n",
3535             __FUNCTION__));
3536         return NVL_BAD_ARGS;
3537     }
3538 
3539     status = nvlink_acquire_fabric_mgmt_cap(ctrlParams->osPrivate,
3540                                             capParams->capDescriptor);
3541     if (status != NVL_SUCCESS)
3542     {
3543         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3544             "%s: Failed to acquire fabric mgmt capability.\n",
3545             __FUNCTION__));
3546         return status;
3547     }
3548 
3549     return NVL_SUCCESS;
3550 }
3551 
nvlink_lib_ctrl_get_link_state(nvlink_get_link_state * linkParams)3552 static NvlStatus nvlink_lib_ctrl_get_link_state
3553 (
3554     nvlink_get_link_state *linkParams
3555 )
3556 {
3557     nvlink_link  *endpoint  = NULL;
3558     NvlStatus     status    = NVL_SUCCESS;
3559     NvU32         numLinks  = 0;
3560     NvU32         i         = 0;
3561 
3562     ct_assert(NVLINK_MAX_SYSTEM_LINK_NUM == NVLINK_MAX_NVLINK_ENDPOINTS);
3563 
3564     nvlink_link   **links = (nvlink_link **)nvlink_malloc(
3565                             sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
3566     if (links == NULL)
3567     {
3568         return NVL_NO_MEM;
3569     }
3570 
3571     if (linkParams->endPointCount > NVLINK_MAX_NVLINK_ENDPOINTS)
3572     {
3573         nvlink_free((void *)links);
3574         return NVL_BAD_ARGS;
3575     }
3576 
3577     // Acquire the top-level lock
3578     status = nvlink_lib_top_lock_acquire();
3579     if (status != NVL_SUCCESS)
3580     {
3581         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3582             "%s: Failed to acquire top-level lock\n",
3583             __FUNCTION__));
3584 
3585         nvlink_free((void *)links);
3586         return status;
3587     }
3588 
3589     //
3590     // Top-level lock is now acquired. Proceed to traversing the device and
3591     // link lists and connections list
3592     //
3593 
3594     for (i = 0; i < linkParams->endPointCount; i++)
3595     {
3596         endpoint = NULL;
3597         nvlink_core_get_link_by_endpoint(&linkParams->endPoints[i], &endpoint);
3598 
3599         // we can't send this command if the endpoint is not found
3600         if (endpoint == NULL)
3601         {
3602             //
3603             // Couldn't find the endpoint registered in the core library. Release
3604             // the top-level lock and return
3605             //
3606             nvlink_lib_top_lock_release();
3607 
3608             nvlink_free((void *)links);
3609             return NVL_BAD_ARGS;
3610         }
3611         else if (numLinks >= NVLINK_MAX_NVLINK_ENDPOINTS)
3612         {
3613             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3614                 "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
3615                 __FUNCTION__));
3616 
3617             nvlink_assert(0);
3618 
3619             // Release the top-level lock and free links
3620             nvlink_lib_top_lock_release();
3621             nvlink_free((void *)links);
3622             return NVL_ERR_INVALID_STATE;
3623         }
3624 
3625         links[numLinks] = endpoint;
3626         numLinks++;
3627     }
3628 
3629     // Acquire the per-link locks
3630     status = nvlink_lib_link_locks_acquire(links, numLinks);
3631     if (status != NVL_SUCCESS)
3632     {
3633         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3634             "%s: Failed to acquire per-link locks\n",
3635             __FUNCTION__));
3636 
3637         // Release the top-level lock
3638         nvlink_lib_top_lock_release();
3639 
3640         nvlink_free((void *)links);
3641         return status;
3642     }
3643 
3644     //
3645     // All the required per-link locks are now successfully acquired
3646     // Release the top level-lock
3647     //
3648     nvlink_lib_top_lock_release();
3649 
3650     for (i = 0; i < numLinks; i++)
3651     {
3652         // Wait for the link state to change.
3653         status = nvlink_core_poll_link_state(links[i],
3654                                              NVLINK_LINKSTATE_HS,
3655                                              NVLINK_TRANSITION_POST_HS_TIMEOUT);
3656         if (status != NVL_SUCCESS)
3657         {
3658             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3659                 "%s: Unable to set link state to ACTIVE for link"
3660                 " %s:%s \n",
3661                 __FUNCTION__,
3662                 links[i]->dev->deviceName, links[i]->linkName));
3663         }
3664         else
3665         {
3666             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_SETUP,
3667                 "%s: Successfully able to set link state to ACTIVE for link"
3668                 " %s:%s \n",
3669                 __FUNCTION__,
3670                 links[i]->dev->deviceName, links[i]->linkName));
3671         }
3672 
3673         nvlink_core_get_endpoint_state(links[i], &linkParams->endState[i]);
3674     }
3675 
3676     // Release the per-link locks
3677     nvlink_lib_link_locks_release(links, numLinks);
3678 
3679     if (links != NULL)
3680     {
3681         nvlink_free((void *)links);
3682     }
3683 
3684     return NVL_SUCCESS;
3685 }
3686 
3687 static NvlStatus
nvlink_lib_ctrl_get_device_link_states(nvlink_get_device_link_states * params)3688 nvlink_lib_ctrl_get_device_link_states
3689 (
3690     nvlink_get_device_link_states *params
3691 )
3692 {
3693     nvlink_link  *endpoint  = NULL;
3694     nvlink_device *dev      = NULL;
3695     NvlStatus     status    = NVL_SUCCESS;
3696     NvU32         numLinks  = 0;
3697     NvU32         i         = 0;
3698     NvU8          linkNumber;
3699 
3700     nvlink_link   **links = (nvlink_link **)nvlink_malloc(
3701                             sizeof(nvlink_link *) * NVLINK_MAX_DEVICE_CONN);
3702 
3703     // Get current monotonic time in seconds.nanoseconds
3704     params->time = nvlink_get_platform_time();
3705 
3706     if (links == NULL)
3707     {
3708         return NVL_NO_MEM;
3709     }
3710 
3711     nvlink_memset(params->endStates, 0x0, sizeof(params->endStates));
3712 
3713     // Acquire the top-level lock
3714     status = nvlink_lib_top_lock_acquire();
3715     if (status != NVL_SUCCESS)
3716     {
3717         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3718             "%s: Failed to acquire top-level lock\n",
3719             __FUNCTION__));
3720 
3721         nvlink_free((void *)links);
3722         return status;
3723     }
3724 
3725     // look-up user requested nvlink device object
3726     nvlink_core_get_device_by_devinfo(&params->devInfo, &dev);
3727     if (dev == NULL)
3728     {
3729         //
3730         // Couldn't find the device ptr in the core library. Release the
3731         // top-level lock and return
3732         //
3733         nvlink_lib_top_lock_release();
3734 
3735         nvlink_free((void *)links);
3736         return NVL_BAD_ARGS;
3737     }
3738 
3739     //
3740     // Top-level lock is now acquired. Proceed to traversing the list
3741     // of devices and list of links to lock all links
3742     //
3743     FOR_EACH_LINK_REGISTERED(endpoint, dev, node)
3744     {
3745         if (numLinks >= NVLINK_MAX_DEVICE_CONN)
3746         {
3747             NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3748                 "%s: numLinks >= NVLINK_MAX_DEVICE_CONN",
3749                 __FUNCTION__));
3750 
3751             nvlink_assert(0);
3752 
3753             // Release the top-level lock and free links
3754             nvlink_lib_top_lock_release();
3755             nvlink_free((void *)links);
3756             return NVL_ERR_INVALID_STATE;
3757         }
3758         links[numLinks] = endpoint;
3759         numLinks++;
3760     }
3761 
3762     // Acquire the per-link locks
3763     status = nvlink_lib_link_locks_acquire(links, numLinks);
3764     if (status != NVL_SUCCESS)
3765     {
3766         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3767             "%s: Failed to acquire per-link locks\n",
3768             __FUNCTION__));
3769 
3770         // Release the top-level lock
3771         nvlink_lib_top_lock_release();
3772         nvlink_free((void *)links);
3773         return status;
3774     }
3775 
3776     //
3777     // All the required per-link locks are now successfully acquired
3778     // Release the top level-lock
3779     //
3780     nvlink_lib_top_lock_release();
3781 
3782     nvlink_assert((links != NULL) && (numLinks > 0));
3783 
3784     for (i = 0; i < numLinks; ++i)
3785     {
3786         linkNumber = links[i]->linkNumber;
3787 
3788         nvlink_assert(linkNumber < NVLINK_MAX_DEVICE_CONN);
3789 
3790         // Get the endpoint states of the link
3791         nvlink_core_get_endpoint_state(links[i], &(params->endStates[linkNumber]));
3792 
3793         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_INFO,
3794             "%s: link 0x%x -- linkMode 0x%x,\n",
3795             __FUNCTION__, linkNumber, params->endStates[linkNumber].linkMode));
3796     }
3797 
3798     // This is done to preserve client behavior that uses endStatesCount to iterate across endStates array
3799     params->endStatesCount = NVLINK_MAX_DEVICE_CONN;
3800 
3801     // Release the per-link locks
3802     nvlink_lib_link_locks_release(links, numLinks);
3803 
3804     if (links != NULL)
3805     {
3806         nvlink_free((void *)links);
3807     }
3808 
3809     return status;
3810 }
3811