1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 2017-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 * SPDX-License-Identifier: MIT
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "nvlink.h"
25 #include "nvVer.h"
26 #include "nvlink_os.h"
27 #include "nvlink_lib_ctrl.h"
28 #include "../nvlink_ctx.h"
29 #include "../nvlink_helper.h"
30 #include "nvlink_lock.h"
31 #include "nvctassert.h"
32
//
// Returns a typed pointer to the user-supplied IOCTL buffer, or NULL when the
// buffer is too small to hold the requested request/reply structure.
// The whole expansion is parenthesized so the ternary cannot re-associate
// with neighboring operators at the expansion site (CERT C PRE01-C/EXP19-C).
//
#define NVLINK_IOC_GET_BUF(ctrlParams, type) ((ctrlParams)->size >= sizeof(type) ? (type *) (ctrlParams)->buf : NULL)
34
35 /**
36 * List of static functions
37 */
38 static NvlStatus nvlink_lib_ioctl_ctrl_helper(nvlink_ioctrl_params *);
39 static NvlStatus nvlink_lib_ctrl_prologue(nvlink_ioctrl_params *);
40 static NvlStatus nvlink_lib_ctrl_check_version(nvlink_check_version *);
41 static NvlStatus nvlink_lib_ctrl_set_node_id(nvlink_set_node_id *);
42 static NvlStatus nvlink_lib_ctrl_all_links(nvlink_ioctrl_params *);
43 static NvlStatus nvlink_lib_ctrl_device_link_init_status(nvlink_device_link_init_status *);
44 static NvlStatus nvlink_lib_ctrl_device_write_discovery_tokens(nvlink_device_write_discovery_tokens *);
45 static NvlStatus nvlink_lib_ctrl_device_read_discovery_tokens(nvlink_device_read_discovery_tokens *);
46 static NvlStatus nvlink_lib_ctrl_device_read_sids(nvlink_device_read_sids *);
47 static NvlStatus nvlink_lib_ctrl_discover_intranode_conns(nvlink_discover_intranode_conns *);
48 static NvlStatus nvlink_lib_ctrl_device_get_intranode_conns(nvlink_device_get_intranode_conns *);
49 static NvlStatus nvlink_lib_ctrl_add_internode_conn(nvlink_add_internode_conn *);
50 static NvlStatus nvlink_lib_ctrl_remove_internode_conn(nvlink_remove_internode_conn *);
51 static NvlStatus nvlink_lib_ctrl_train_intranode_conn(nvlink_train_intranode_conn *);
52 static NvlStatus nvlink_lib_ctrl_train_intranode_conns_parallel(nvlink_train_intranode_conns_parallel *);
53 static NvlStatus nvlink_lib_ctrl_train_internode_conn_link(nvlink_train_internode_conn_link *);
54 static NvlStatus nvlink_lib_ctrl_train_internode_conn_sublink(nvlink_train_internode_conn_sublink *);
55 static NvlStatus nvlink_lib_ctrl_train_internode_links_initoptimize(nvlink_train_internode_links_initoptimize *);
56 static NvlStatus nvlink_lib_ctrl_train_internode_links_post_initoptimize(nvlink_train_internode_links_post_initoptimize *);
57 static NvlStatus nvlink_lib_ctrl_train_internode_conns_parallel(nvlink_train_internode_conns_parallel *);
58 static NvlStatus nvlink_lib_ctrl_get_devices_info(nvlink_get_devices_info *);
59 static NvlStatus nvlink_lib_ctrl_acquire_capability(nvlink_ioctrl_params *, nvlink_acquire_capability *);
60 static NvlStatus nvlink_lib_ctrl_get_link_state(nvlink_get_link_state *);
61 static NvlStatus nvlink_lib_ctrl_get_device_link_states(nvlink_get_device_link_states *);
62
63 /**
64 * Entry point for IOCTLs into the NVLink core library
65 *
66 * @param[in] ctrlParams IOCTL params
67 *
68 * return NvlStatus
69 */
70 NvlStatus
nvlink_lib_ioctl_ctrl(nvlink_ioctrl_params * ctrlParams)71 nvlink_lib_ioctl_ctrl
72 (
73 nvlink_ioctrl_params *ctrlParams
74 )
75 {
76 NvlStatus status = NVL_SUCCESS;
77
78 status = nvlink_lib_ioctl_ctrl_helper(ctrlParams);
79
80 return status;
81 }
82
83 /**
84 * Helper function for routing the IOCTL to the respective handlers
85 *
86 * Note: The handlers acquire the required core library locks before
87 * calling the core library code
88 *
89 * @param[in] ctrlParams IOCTL params
90 *
91 * return NvlStatus
92 */
93 static NvlStatus
nvlink_lib_ioctl_ctrl_helper(nvlink_ioctrl_params * ctrlParams)94 nvlink_lib_ioctl_ctrl_helper
95 (
96 nvlink_ioctrl_params *ctrlParams
97 )
98 {
99 NvlStatus status;
100
101 status = nvlink_lib_ctrl_prologue(ctrlParams);
102 if (status != NVL_SUCCESS)
103 {
104 return status;
105 }
106
107 switch (ctrlParams->cmd)
108 {
109 case CTRL_NVLINK_CHECK_VERSION:
110 {
111 nvlink_check_version *iocReq;
112
113 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_check_version);
114 if (!iocReq)
115 {
116 return NVL_BAD_ARGS;
117 }
118
119 iocReq->status = nvlink_lib_ctrl_check_version(iocReq);
120 break;
121 }
122
123 case CTRL_NVLINK_SET_NODE_ID:
124 {
125 nvlink_set_node_id *iocReq;
126
127 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_set_node_id);
128 if (!iocReq)
129 {
130 return NVL_BAD_ARGS;
131 }
132
133 iocReq->status = nvlink_lib_ctrl_set_node_id(iocReq);
134 break;
135 }
136
137 //
138 // The following commands operate on all the links registered in the
139 // core library. Hence, clubbing them into a group so, we don't have
140 // to duplicate the lock acquire/release for each of them
141 //
142 case CTRL_NVLINK_INITPHASE1:
143 case CTRL_NVLINK_RX_INIT_TERM:
144 case CTRL_NVLINK_SET_RX_DETECT:
145 case CTRL_NVLINK_GET_RX_DETECT:
146 case CTRL_NVLINK_SET_TX_COMMON_MODE:
147 case CTRL_NVLINK_CALIBRATE:
148 case CTRL_NVLINK_ENABLE_DATA:
149 case CTRL_NVLINK_LINK_INIT_ASYNC:
150 case CTRL_NVLINK_INITNEGOTIATE:
151 case CTRL_NVLINK_INITPHASE5:
152 {
153 nvlink_lib_ctrl_all_links(ctrlParams);
154 break;
155 }
156
157 case CTRL_NVLINK_DEVICE_LINK_INIT_STATUS:
158 {
159 nvlink_device_link_init_status *iocReq;
160
161 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_device_link_init_status);
162 if (!iocReq)
163 {
164 return NVL_BAD_ARGS;
165 }
166
167 iocReq->status = nvlink_lib_ctrl_device_link_init_status(iocReq);
168 break;
169 }
170
171 case CTRL_NVLINK_DEVICE_WRITE_DISCOVERY_TOKENS:
172 {
173 nvlink_device_write_discovery_tokens *iocReq;
174
175 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_device_write_discovery_tokens);
176 if (!iocReq)
177 {
178 return NVL_BAD_ARGS;
179 }
180
181 iocReq->status = nvlink_lib_ctrl_device_write_discovery_tokens(iocReq);
182 break;
183 }
184
185 case CTRL_NVLINK_DEVICE_READ_DISCOVERY_TOKENS:
186 {
187 nvlink_device_read_discovery_tokens *iocReq;
188
189 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_device_read_discovery_tokens);
190 if (!iocReq)
191 {
192 return NVL_BAD_ARGS;
193 }
194
195 iocReq->status = nvlink_lib_ctrl_device_read_discovery_tokens(iocReq);
196 break;
197 }
198
199 case CTRL_NVLINK_DEVICE_READ_SIDS:
200 {
201 nvlink_device_read_sids *iocReq;
202
203 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_device_read_sids);
204 if (!iocReq)
205 {
206 return NVL_BAD_ARGS;
207 }
208
209 iocReq->status = nvlink_lib_ctrl_device_read_sids(iocReq);
210 break;
211 }
212
213 case CTRL_NVLINK_DISCOVER_INTRANODE_CONNS:
214 {
215 nvlink_discover_intranode_conns *iocReq;
216
217 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_discover_intranode_conns);
218 if (!iocReq)
219 {
220 return NVL_BAD_ARGS;
221 }
222
223 iocReq->status = nvlink_lib_ctrl_discover_intranode_conns(iocReq);
224 break;
225 }
226
227 case CTRL_NVLINK_DEVICE_GET_INTRANODE_CONNS:
228 {
229 nvlink_device_get_intranode_conns *iocReq;
230
231 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_device_get_intranode_conns);
232 if (!iocReq)
233 {
234 return NVL_BAD_ARGS;
235 }
236
237 iocReq->status = nvlink_lib_ctrl_device_get_intranode_conns(iocReq);
238 break;
239 }
240
241 case CTRL_NVLINK_ADD_INTERNODE_CONN:
242 {
243 nvlink_add_internode_conn *iocReq;
244
245 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_add_internode_conn);
246 if (!iocReq)
247 {
248 return NVL_BAD_ARGS;
249 }
250
251 iocReq->status = nvlink_lib_ctrl_add_internode_conn(iocReq);
252 break;
253 }
254
255 case CTRL_NVLINK_REMOVE_INTERNODE_CONN:
256 {
257 nvlink_remove_internode_conn *iocReq;
258
259 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_remove_internode_conn);
260 if (!iocReq)
261 {
262 return NVL_BAD_ARGS;
263 }
264
265 iocReq->status = nvlink_lib_ctrl_remove_internode_conn(iocReq);
266 break;
267 }
268
269 case CTRL_NVLINK_TRAIN_INTRANODE_CONN:
270 {
271 nvlink_train_intranode_conn *iocReq;
272
273 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_intranode_conn);
274 if (!iocReq)
275 {
276 return NVL_BAD_ARGS;
277 }
278
279 iocReq->status = nvlink_lib_ctrl_train_intranode_conn(iocReq);
280 break;
281 }
282
283 case CTRL_NVLINK_TRAIN_INTRANODE_CONNS_PARALLEL:
284 {
285 nvlink_train_intranode_conns_parallel *iocReq;
286
287 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_intranode_conns_parallel);
288 if (!iocReq)
289 {
290 return NVL_BAD_ARGS;
291 }
292
293 iocReq->status = nvlink_lib_ctrl_train_intranode_conns_parallel(iocReq);
294 break;
295 }
296
297 case CTRL_NVLINK_TRAIN_INTERNODE_CONN_LINK:
298 {
299 nvlink_train_internode_conn_link *iocReq;
300
301 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_internode_conn_link);
302 if (!iocReq)
303 {
304 return NVL_BAD_ARGS;
305 }
306
307 iocReq->status = nvlink_lib_ctrl_train_internode_conn_link(iocReq);
308 break;
309 }
310
311 case CTRL_NVLINK_TRAIN_INTERNODE_CONN_SUBLINK:
312 {
313 nvlink_train_internode_conn_sublink *iocReq;
314
315 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_internode_conn_sublink);
316 if (!iocReq)
317 {
318 return NVL_BAD_ARGS;
319 }
320
321 iocReq->status = nvlink_lib_ctrl_train_internode_conn_sublink(iocReq);
322 break;
323 }
324
325 case CTRL_NVLINK_TRAIN_INTERNODE_LINKS_INITOPTIMIZE:
326 {
327 nvlink_train_internode_links_initoptimize *iocReq;
328
329 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_internode_links_initoptimize);
330 if (!iocReq)
331 {
332 return NVL_BAD_ARGS;
333 }
334 iocReq->status = nvlink_lib_ctrl_train_internode_links_initoptimize(iocReq);
335 break;
336 }
337
338 case CTRL_NVLINK_TRAIN_INTERNODE_LINKS_POST_INITOPTIMIZE:
339 {
340 nvlink_train_internode_links_post_initoptimize *iocReq;
341
342 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_internode_links_post_initoptimize);
343 if (!iocReq)
344 {
345 return NVL_BAD_ARGS;
346 }
347 iocReq->status = nvlink_lib_ctrl_train_internode_links_post_initoptimize(iocReq);
348 break;
349 }
350
351 case CTRL_NVLINK_TRAIN_INTERNODE_CONNS_PARALLEL:
352 {
353 nvlink_train_internode_conns_parallel *iocReq;
354
355 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_train_internode_conns_parallel);
356 if (!iocReq)
357 {
358 return NVL_BAD_ARGS;
359 }
360
361 iocReq->status = nvlink_lib_ctrl_train_internode_conns_parallel(iocReq);
362 break;
363 }
364
365 case CTRL_NVLINK_GET_DEVICES_INFO:
366 {
367 nvlink_get_devices_info *iocReq;
368
369 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_get_devices_info);
370 if (!iocReq)
371 {
372 return NVL_BAD_ARGS;
373 }
374
375 iocReq->status = nvlink_lib_ctrl_get_devices_info(iocReq);
376 break;
377 }
378
379 case CTRL_NVLINK_ACQUIRE_CAPABILITY:
380 {
381 nvlink_acquire_capability *iocReq;
382
383 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_acquire_capability);
384 if (!iocReq)
385 {
386 return NVL_BAD_ARGS;
387 }
388
389 iocReq->status = nvlink_lib_ctrl_acquire_capability(ctrlParams, iocReq);
390 break;
391 }
392
393 case CTRL_NVLINK_GET_LINK_STATE:
394 {
395 nvlink_get_link_state *iocReq;
396
397 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_get_link_state);
398 if (!iocReq)
399 {
400 return NVL_BAD_ARGS;
401 }
402
403 iocReq->status = nvlink_lib_ctrl_get_link_state(iocReq);
404 break;
405 }
406 case CTRL_NVLINK_GET_DEVICE_LINK_STATES:
407 {
408 nvlink_get_device_link_states *iocReq;
409
410 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_get_device_link_states);
411 if (!iocReq)
412 {
413 return NVL_BAD_ARGS;
414 }
415
416 iocReq->status = nvlink_lib_ctrl_get_device_link_states(iocReq);
417 break;
418 }
419
420 case CTRL_NVLINK_RESERVED_0:
421 case CTRL_NVLINK_RESERVED_1:
422 case CTRL_NVLINK_RESERVED_2:
423 case CTRL_NVLINK_RESERVED_3:
424 case CTRL_NVLINK_RESERVED_4:
425 case CTRL_NVLINK_RESERVED_5:
426 case CTRL_NVLINK_RESERVED_6:
427 case CTRL_NVLINK_RESERVED_7:
428 case CTRL_NVLINK_RESERVED_8:
429 case CTRL_NVLINK_RESERVED_9:
430 case CTRL_NVLINK_RESERVED_10:
431 case CTRL_NVLINK_RESERVED_11:
432 {
433 return NVL_SUCCESS;
434 break;
435 }
436
437 default:
438 {
439 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
440 "%s: unknown ioctl command 0x%08X specified.\n",
441 __FUNCTION__, ctrlParams->cmd));
442 return NVL_BAD_ARGS;
443 }
444 }
445
446 //
447 // the IOCTL call is success. However, status of the individual IOCTL is
448 // indicated in their corresponding embedded status field.
449 //
450 return NVL_SUCCESS;
451 }
452
453 /**
454 * Preliminary check before passing the IOCTL to the respective handler
455 *
456 * @param[in] ctrlParams IOCTL params
457 *
458 * return NvlStatus
459 */
460 static NvlStatus
nvlink_lib_ctrl_prologue(nvlink_ioctrl_params * ctrlParams)461 nvlink_lib_ctrl_prologue
462 (
463 nvlink_ioctrl_params *ctrlParams
464 )
465 {
466 NvlStatus status = NVL_SUCCESS;
467
468 if (ctrlParams == NULL)
469 {
470 return NVL_BAD_ARGS;
471 }
472
473 switch (ctrlParams->cmd)
474 {
475 //
476 // These control calls are aren't privileged. So, skip the capability
477 // check.
478 //
479 case CTRL_NVLINK_CHECK_VERSION:
480 case CTRL_NVLINK_ACQUIRE_CAPABILITY:
481 {
482 break;
483 }
484 default:
485 {
486 if (!nvlink_is_admin() &&
487 !nvlink_is_fabric_manager(ctrlParams->osPrivate))
488 {
489 status = NVL_ERR_INSUFFICIENT_PERMISSIONS;
490 }
491 break;
492 }
493 }
494
495 return status;
496 }
497
498 /**
499 * Check if the user and kernel versions mismatch
500 *
501 * @param[in] versionParams IOCTL params
502 *
503 * return NvlStatus
504 */
505 static NvlStatus
nvlink_lib_ctrl_check_version(nvlink_check_version * versionParams)506 nvlink_lib_ctrl_check_version
507 (
508 nvlink_check_version *versionParams
509 )
510 {
511 const NvU32 NV_VERSION_LENGTH = nvlink_strlen(NV_VERSION_STRING);
512
513 if (NV_VERSION_LENGTH > NVLINK_VERSION_STRING_LENGTH)
514 {
515 return NVL_NO_MEM;
516 }
517
518 versionParams->user.version[NVLINK_VERSION_STRING_LENGTH - 1] = '\0';
519
520 nvlink_memset(versionParams->kernel.version, 0x0, sizeof(versionParams->kernel.version));
521 nvlink_strcpy(versionParams->kernel.version, NV_VERSION_STRING);
522
523 versionParams->kernel.version[NVLINK_VERSION_STRING_LENGTH - 1] = '\0';
524
525 if (nvlink_strcmp(versionParams->user.version, versionParams->kernel.version))
526 {
527 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
528 "%s: version mismatch, kernel version %s user version %s\n",
529 __FUNCTION__,
530 versionParams->kernel.version, versionParams->user.version));
531
532 return NVL_ERR_NOT_SUPPORTED;
533 }
534
535 return NVL_SUCCESS;
536 }
537
538 /**
539 * Assign node ID to all the registered devices
540 *
541 * @param[in] idParams IOCTL params
542 *
543 * return NvlStatus
544 */
545 static NvlStatus
nvlink_lib_ctrl_set_node_id(nvlink_set_node_id * idParams)546 nvlink_lib_ctrl_set_node_id
547 (
548 nvlink_set_node_id *idParams
549 )
550 {
551 NvlStatus status = NVL_SUCCESS;
552 nvlink_device *dev = NULL;
553
554 // Acquire the top-level lock
555 status = nvlink_lib_top_lock_acquire();
556 if (status != NVL_SUCCESS)
557 {
558 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
559 "%s: Failed to acquire top-level lock\n",
560 __FUNCTION__));
561
562 return status;
563 }
564
565 // Top-level lock is now acquired
566
567 // Return success, if an attempt is made to re-assign the same node-id.
568 if (nvlinkLibCtx.nodeId == idParams->nodeId)
569 {
570 // Release the top-level lock
571 nvlink_lib_top_lock_release();
572
573 return NVL_SUCCESS;
574 }
575
576 if (nvlinkLibCtx.nodeId != NV_U16_MAX)
577 {
578 // Don't allow to change fabric node id once it is set.
579 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
580 "%s: Can't change fabric node id once it is set. "
581 "Current node id is %u\n",
582 __FUNCTION__, nvlinkLibCtx.nodeId));
583
584 // Release the top-level lock
585 nvlink_lib_top_lock_release();
586
587 return NVL_ERR_INVALID_STATE;
588 }
589
590 // Change already registered device's fabric node id.
591 FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
592 {
593 dev->nodeId = idParams->nodeId;
594 }
595
596 // Store fabric node id for any future device registration.
597 nvlinkLibCtx.nodeId = idParams->nodeId;
598
599 // Release the top-level lock
600 nvlink_lib_top_lock_release();
601
602 return NVL_SUCCESS;
603 }
604
605 /**
606 * Kick off the desired operation on registered links of all devices
607 *
608 * Note: This operation will acquire the per-link locks of all the
609 * registered links of all devices in the core library
610 *
611 * @param[in] ctrlParams IOCTL params
612 *
613 * return NvlStatus
614 */
615 static NvlStatus
nvlink_lib_ctrl_all_links(nvlink_ioctrl_params * ctrlParams)616 nvlink_lib_ctrl_all_links
617 (
618 nvlink_ioctrl_params *ctrlParams
619 )
620 {
621 NvlStatus status = NVL_SUCCESS;
622 nvlink_device *dev = NULL;
623 nvlink_link *link = NULL;
624 NvU32 numLinks = 0;
625
626 nvlink_link **links = (nvlink_link **)nvlink_malloc(
627 sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
628 if (links == NULL)
629 {
630 return NVL_NO_MEM;
631 }
632
633 // Acquire the top-level lock
634 status = nvlink_lib_top_lock_acquire();
635 if (status != NVL_SUCCESS)
636 {
637 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
638 "%s: Failed to acquire top-level lock\n",
639 __FUNCTION__));
640
641 nvlink_free((void *)links);
642 return status;
643 }
644
645 //
646 // Top-level lock is now acquired. Proceed to traversing the device
647 // and link lists
648 //
649
650 FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
651 {
652 FOR_EACH_LINK_REGISTERED(link, dev, node)
653 {
654 if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
655 {
656 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
657 "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
658 __FUNCTION__));
659
660 nvlink_assert(0);
661
662 // Release the top-level lock and free links
663 nvlink_lib_top_lock_release();
664 nvlink_free((void *)links);
665 return NVL_ERR_INVALID_STATE;
666 }
667 links[numLinks] = link;
668 numLinks++;
669 }
670 }
671
672 // Acquire the per-link locks
673 status = nvlink_lib_link_locks_acquire(links, numLinks);
674 if (status != NVL_SUCCESS)
675 {
676 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
677 "%s: Failed to acquire per-link locks\n",
678 __FUNCTION__));
679
680 // Release the top-level lock
681 nvlink_lib_top_lock_release();
682 nvlink_free((void *)links);
683 return status;
684 }
685
686 //
687 // All the required per-link locks are now successfully acquired
688 // Release the top level-lock
689 //
690 nvlink_lib_top_lock_release();
691
692 nvlink_assert((links != NULL) && (numLinks > 0));
693
694 // Kick off the desired operation on all the registered links
695 switch (ctrlParams->cmd)
696 {
697 case CTRL_NVLINK_INITPHASE1:
698 {
699 nvlink_initphase1 *iocReq;
700
701 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_initphase1);
702 if (!iocReq)
703 {
704 status = NVL_BAD_ARGS;
705 goto nvlink_lib_ctrl_all_links_end;
706 }
707
708 // default initialize status to NVL_SUCCESS
709 iocReq->status = NVL_SUCCESS;
710
711 if (links[0]->dev->enableALI)
712 {
713 status = NVL_SUCCESS;
714 goto nvlink_lib_ctrl_all_links_end;
715 }
716
717 iocReq->status = nvlink_core_initphase1(links, numLinks,
718 NVLINK_STATE_CHANGE_SYNC);
719 break;
720 }
721
722 case CTRL_NVLINK_RX_INIT_TERM:
723 {
724 nvlink_rx_init_term *iocReq;
725
726 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_rx_init_term);
727 if (!iocReq)
728 {
729 status = NVL_BAD_ARGS;
730 goto nvlink_lib_ctrl_all_links_end;
731 }
732
733 // default initialize status to NVL_SUCCESS
734 iocReq->status = NVL_SUCCESS;
735
736 //
737 // If the current nvlink device does not support the command
738 // skip using the command and return success for FM to continue on.
739 //
740 if (links[0]->version >= NVLINK_DEVICE_VERSION_40)
741 {
742 status = NVL_SUCCESS;
743 goto nvlink_lib_ctrl_all_links_end;
744 }
745
746 iocReq->status = nvlink_core_rx_init_term(links, numLinks,
747 NVLINK_STATE_CHANGE_ASYNC);
748 break;
749 }
750
751 case CTRL_NVLINK_SET_RX_DETECT:
752 {
753 nvlink_set_rx_detect *iocReq;
754
755 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_set_rx_detect);
756 if (!iocReq)
757 {
758 status = NVL_BAD_ARGS;
759 goto nvlink_lib_ctrl_all_links_end;
760 }
761
762 // default initialize status to NVL_SUCCESS
763 iocReq->status = NVL_SUCCESS;
764
765 if (links[0]->dev->enableALI)
766 {
767 status = NVL_SUCCESS;
768 goto nvlink_lib_ctrl_all_links_end;
769 }
770
771 iocReq->status = nvlink_core_set_rx_detect(links, numLinks,
772 NVLINK_STATE_CHANGE_ASYNC);
773 break;
774 }
775
776 case CTRL_NVLINK_GET_RX_DETECT:
777 {
778 nvlink_get_rx_detect *iocReq;
779
780 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_get_rx_detect);
781 if (!iocReq)
782 {
783 status = NVL_BAD_ARGS;
784 goto nvlink_lib_ctrl_all_links_end;
785 }
786
787 // default initialize status to NVL_SUCCESS
788 iocReq->status = NVL_SUCCESS;
789
790 if (links[0]->dev->enableALI)
791 {
792 status = NVL_SUCCESS;
793 goto nvlink_lib_ctrl_all_links_end;
794 }
795
796 iocReq->status = nvlink_core_get_rx_detect(links, numLinks,
797 NVLINK_STATE_CHANGE_ASYNC);
798 break;
799 }
800
801 case CTRL_NVLINK_SET_TX_COMMON_MODE:
802 {
803 nvlink_set_tx_common_mode *iocReq;
804
805 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_set_tx_common_mode);
806 if (!iocReq)
807 {
808 status = NVL_BAD_ARGS;
809 goto nvlink_lib_ctrl_all_links_end;
810 }
811
812 // default initialize status to NVL_SUCCESS
813 iocReq->status = NVL_SUCCESS;
814
815 if (links[0]->dev->enableALI)
816 {
817 status = NVL_SUCCESS;
818 goto nvlink_lib_ctrl_all_links_end;
819 }
820
821 if (iocReq->commMode)
822 {
823 iocReq->status = nvlink_core_enable_common_mode(links, numLinks,
824 NVLINK_STATE_CHANGE_SYNC);
825 }
826 else if(links[0]->version <= NVLINK_DEVICE_VERSION_30)
827 {
828 iocReq->status = nvlink_core_disable_common_mode(links, numLinks,
829 NVLINK_STATE_CHANGE_SYNC);
830 }
831
832 //
833 // If the current nvlink device does not support disabling common mode
834 // skip using the command and return success for FM to continue on.
835 //
836 break;
837 }
838
839 case CTRL_NVLINK_CALIBRATE:
840 {
841 nvlink_calibrate *iocReq;
842 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_calibrate);
843
844 if (!iocReq)
845 {
846 status = NVL_BAD_ARGS;
847 goto nvlink_lib_ctrl_all_links_end;
848 }
849
850 // default initialize status to NVL_SUCCESS
851 iocReq->status = NVL_SUCCESS;
852
853 //
854 // If the current nvlink device does not support the command
855 // skip using the command and return success for FM to continue on.
856 //
857 if (links[0]->version >= NVLINK_DEVICE_VERSION_40)
858 {
859 iocReq->status = NVL_SUCCESS;
860 goto nvlink_lib_ctrl_all_links_end;
861 }
862
863 iocReq->status = nvlink_core_calibrate_links(links, numLinks,
864 NVLINK_STATE_CHANGE_SYNC);
865 break;
866 }
867
868 case CTRL_NVLINK_ENABLE_DATA:
869 {
870 nvlink_enable_data *iocReq;
871
872 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_enable_data);
873 if (!iocReq)
874 {
875 goto nvlink_lib_ctrl_all_links_end;
876 }
877
878 // default initialize status to NVL_SUCCESS
879 iocReq->status = NVL_SUCCESS;
880
881 //
882 // If the current nvlink device does not support the command
883 // skip using the command and return success for FM to continue on.
884 //
885 if (links[0]->version >= NVLINK_DEVICE_VERSION_40)
886 {
887 status = NVL_SUCCESS;
888 goto nvlink_lib_ctrl_all_links_end;
889 }
890
891 iocReq->status = nvlink_core_enable_data(links, numLinks,
892 NVLINK_STATE_CHANGE_SYNC);
893 break;
894 }
895
896 case CTRL_NVLINK_LINK_INIT_ASYNC:
897 {
898 nvlink_link_init_async *iocReq;
899
900 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_link_init_async);
901 if (!iocReq)
902 {
903 status = NVL_BAD_ARGS;
904 goto nvlink_lib_ctrl_all_links_end;
905 }
906
907 // default initialize status to NVL_SUCCESS
908 iocReq->status = NVL_SUCCESS;
909
910 iocReq->status = nvlink_core_link_init_async(links, numLinks);
911 break;
912 }
913
914 case CTRL_NVLINK_INITNEGOTIATE:
915 {
916 nvlink_initnegotiate *iocReq;
917
918 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_initnegotiate);
919 if (!iocReq)
920 {
921 status = NVL_BAD_ARGS;
922 goto nvlink_lib_ctrl_all_links_end;
923 }
924
925 // default initialize status to NVL_SUCCESS
926 iocReq->status = NVL_SUCCESS;
927
928 if (links[0]->dev->enableALI)
929 {
930 status = NVL_SUCCESS;
931 goto nvlink_lib_ctrl_all_links_end;
932 }
933
934 iocReq->status = nvlink_core_initnegotiate(links, numLinks,
935 NVLINK_STATE_CHANGE_ASYNC);
936 break;
937 }
938
939 case CTRL_NVLINK_INITPHASE5:
940 {
941 nvlink_initphase5 *iocReq;
942
943 iocReq = NVLINK_IOC_GET_BUF(ctrlParams, nvlink_initphase5);
944 if (!iocReq)
945 {
946 status = NVL_BAD_ARGS;
947 goto nvlink_lib_ctrl_all_links_end;
948 }
949
950 // default initialize status to NVL_SUCCESS
951 iocReq->status = NVL_SUCCESS;
952
953 //
954 // If the current nvlink device does not support the command
955 // skip using the command and return success for FM to continue on.
956 //
957 if (links[0]->version < NVLINK_DEVICE_VERSION_40 ||
958 links[0]->dev->enableALI)
959 {
960 status = NVL_SUCCESS;
961 goto nvlink_lib_ctrl_all_links_end;
962 }
963 iocReq->status = nvlink_core_initphase5(links, numLinks,
964 NVLINK_STATE_CHANGE_ASYNC);
965 break;
966 }
967
968 default:
969 {
970 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
971 "%s: unknown ioctl command specified.\n",
972 __FUNCTION__));
973
974 status = NVL_BAD_ARGS;
975 goto nvlink_lib_ctrl_all_links_end;
976 }
977
978 }
979
980 nvlink_lib_ctrl_all_links_end:
981
982 // Release the per-link locks
983 nvlink_lib_link_locks_release(links, numLinks);
984
985 if (links != NULL)
986 {
987 nvlink_free((void *)links);
988 }
989
990 return status;
991 }
992
993 /**
994 * Get the link init status on all queried links
995 *
996 * @param[in] statusParams IOCTL params
997 *
998 * return NvlStatus
999 */
static NvlStatus
nvlink_lib_ctrl_device_link_init_status
(
    nvlink_device_link_init_status *statusParams
)
{
    NvlStatus status = NVL_SUCCESS;
    nvlink_device *dev = NULL;
    nvlink_link *link = NULL;
    NvU32 numLinks = 0;
    NvU32 i = 0;

    // Scratch array sized for the system-wide link maximum
    nvlink_link **links = (nvlink_link **)nvlink_malloc(
        sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
    {
        return NVL_NO_MEM;
    }

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        nvlink_free((void *)links);
        return status;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device
    // and link lists
    //

    // look-up user requested nvlink device object
    nvlink_core_get_device_by_devinfo(&statusParams->devInfo, &dev);
    if (dev == NULL)
    {
        //
        // Couldn't find the device ptr in the core library. Release the
        // top-level lock and return
        //
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return NVL_BAD_ARGS;
    }

    // Snapshot all registered links of this device into the scratch array
    FOR_EACH_LINK_REGISTERED(link, dev, node)
    {
        if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
                __FUNCTION__));

            nvlink_assert(0);

            // Release the top-level lock and free the snapshot before bailing
            nvlink_lib_top_lock_release();
            nvlink_free((void *)links);
            return NVL_ERR_INVALID_STATE;
        }
        links[numLinks] = link;
        numLinks++;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    // Poll for links to reach SAFE/SWCFG and capture the status
    for (i = 0; i < numLinks; i++)
    {
        // status index should be within NVLINK_MAX_DEVICE_CONN
        if (i >= NVLINK_MAX_DEVICE_CONN)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: number of links for the device >= NVLINK_MAX_DEVICE_CONN",
                __FUNCTION__));

            nvlink_assert(0);

            nvlink_lib_link_locks_release(links, numLinks);
            nvlink_free((void *)links);
            return NVL_ERR_INVALID_STATE;
        }

        status = nvlink_core_wait_for_link_init(links[i]);

        // indicate link init state to user
        statusParams->linkStatus[i].linkIndex = links[i]->linkNumber;

        // Per-link result is reported as a boolean; an init failure on one
        // link does not fail the overall IOCTL (see the final return)
        if (status == NVL_SUCCESS)
        {
            statusParams->linkStatus[i].initStatus = NV_TRUE;
        }
        else
        {
            statusParams->linkStatus[i].initStatus = NV_FALSE;
        }
    }

    // Release the per-link locks
    nvlink_lib_link_locks_release(links, numLinks);

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }
    // Overall success: individual link results are in statusParams->linkStatus
    return NVL_SUCCESS;
}
1130
1131 /**
1132 * Send discovery tokens on all the links for a given device
1133 *
1134 * @param[in] writeParams IOCTL params
1135 *
1136 * return NvlStatus
1137 */
static NvlStatus
nvlink_lib_ctrl_device_write_discovery_tokens
(
    nvlink_device_write_discovery_tokens *writeParams
)
{
    NvlStatus status = NVL_SUCCESS;
    nvlink_device *dev = NULL;
    nvlink_link *link = NULL;
    NvU32 numLinks = 0;
    NvU32 i = 0;
    NvU32 numTokens = 0;

    //
    // Scratch array of candidate links; sized for the system-wide maximum
    // so any single device's link count fits.
    //
    nvlink_link **links = (nvlink_link **)nvlink_malloc(
        sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
    {
        return NVL_NO_MEM;
    }

    // Initialize number of tokens written to 0
    writeParams->numTokens = 0;

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        nvlink_free((void *)links);
        return status;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device
    // and link lists
    //

    // look-up user requested nvlink device object
    nvlink_core_get_device_by_devinfo(&writeParams->devInfo, &dev);
    if (dev == NULL)
    {
        //
        // Couldn't find the device ptr in the core library. Release the
        // top-level lock and return
        //
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return NVL_BAD_ARGS;
    }

    // Collect only the links that do not yet have a discovered connection
    FOR_EACH_LINK_REGISTERED(link, dev, node)
    {
        nvlink_intranode_conn *conn = NULL;

        nvlink_core_get_intranode_conn(link, &conn);
        if (conn != NULL)
        {
            // skip token write if we already have a connection for the link
            continue;
        }

        if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
                __FUNCTION__));

            nvlink_assert(0);

            // Release the top-level lock and free links
            nvlink_lib_top_lock_release();
            nvlink_free((void *)links);
            return NVL_ERR_INVALID_STATE;
        }

        links[numLinks] = link;
        numLinks++;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    for (i = 0; i < numLinks; i++)
    {
        NvU64 writeToken = 0;

        // Generate this link's token, then push it out on the wire
        writeToken = nvlink_core_get_link_discovery_token(links[i]);
        status = nvlink_core_write_link_discovery_token(links[i], writeToken);

        if (status == NVL_SUCCESS)
        {
            //
            // wrote a token. copy the token and link information to user
            // which can be used for comparing tokens across nodes.
            //

            // total number of tokens should be within NVLINK_MAX_DEVICE_CONN
            if (numTokens >= NVLINK_MAX_DEVICE_CONN)
            {
                NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                    "%s: Number of tokens >= NVLINK_MAX_DEVICE_CONN\n",
                    __FUNCTION__));

                nvlink_assert(0);

                nvlink_lib_link_locks_release(links, numLinks);
                nvlink_free((void *)links);
                return NVL_ERR_INVALID_STATE;
            }

            writeParams->tokenInfo[numTokens].linkIndex = links[i]->linkNumber;
            writeParams->tokenInfo[numTokens].tokenValue = writeToken;
            numTokens++;
        }
    }

    // update total number of tokens written
    writeParams->numTokens = numTokens;

    // Release the per-link locks
    nvlink_lib_link_locks_release(links, numLinks);

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }
    return NVL_SUCCESS;
}
1288
1289 /**
1290 * Read discovery tokens on all the links for a given device
1291 *
1292 * @param[in] readParams IOCTL params
1293 *
1294 * return NvlStatus
1295 */
static NvlStatus
nvlink_lib_ctrl_device_read_discovery_tokens
(
    nvlink_device_read_discovery_tokens *readParams
)
{
    NvlStatus status = NVL_SUCCESS;
    nvlink_device *dev = NULL;
    nvlink_link *link = NULL;
    NvU32 numLinks = 0;
    NvU32 i = 0;
    NvU32 numTokens = 0;

    //
    // Scratch array of candidate links; sized for the system-wide maximum
    // so any single device's link count fits.
    //
    nvlink_link **links = (nvlink_link **)nvlink_malloc(
        sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
    {
        return NVL_NO_MEM;
    }

    // Initialize number of tokens read to 0
    readParams->numTokens = 0;

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        nvlink_free((void *)links);
        return status;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device
    // and link lists
    //

    // look-up user requested nvlink device object
    nvlink_core_get_device_by_devinfo(&readParams->devInfo, &dev);
    if (dev == NULL)
    {
        //
        // Couldn't find the device ptr in the core library. Release the
        // top-level lock and return
        //
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return NVL_BAD_ARGS;
    }

    // Collect only the links that do not yet have a discovered connection
    FOR_EACH_LINK_REGISTERED(link, dev, node)
    {
        nvlink_intranode_conn *conn = NULL;

        nvlink_core_get_intranode_conn(link, &conn);
        if (conn != NULL)
        {
            // skip the token read if we already have a connection for the link
            continue;
        }

        if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
                __FUNCTION__));

            nvlink_assert(0);

            // Release the top-level lock and free links
            nvlink_lib_top_lock_release();
            nvlink_free((void *)links);
            return NVL_ERR_INVALID_STATE;
        }

        links[numLinks] = link;
        numLinks++;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    for (i = 0; i < numLinks; i++)
    {
        NvU64 readToken = 0;

        // query discovery token from the link
        readToken = nvlink_core_read_link_discovery_token(links[i]);

        // take non-zero tokens. token will be zero if read_discovery failed as well.
        if (readToken)
        {
            //
            // received a valid token. copy the token and link information to user
            // which can be used for comparing tokens across nodes.
            //

            // total number of tokens should be within NVLINK_MAX_DEVICE_CONN
            if (numTokens >= NVLINK_MAX_DEVICE_CONN)
            {
                NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                    "%s: Number of tokens >= NVLINK_MAX_DEVICE_CONN\n",
                    __FUNCTION__));

                nvlink_assert(0);

                nvlink_lib_link_locks_release(links, numLinks);
                nvlink_free((void *)links);
                return NVL_ERR_INVALID_STATE;
            }

            readParams->tokenInfo[numTokens].linkIndex = links[i]->linkNumber;
            readParams->tokenInfo[numTokens].tokenValue = readToken;
            numTokens++;
        }
    }

    // update total number of tokens read
    readParams->numTokens = numTokens;

    // Release the per-link locks
    nvlink_lib_link_locks_release(links, numLinks);

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }
    return NVL_SUCCESS;
}
1447
1448 /**
1449 * Perform peer link discovery
1450 *
 * @param[in]  link  NVLink link on which to attempt peer discovery
1452 *
1453 * return NvlStatus
1454 */
1455 static NvlStatus
_nvlink_lib_ctrl_device_discover_peer_link(nvlink_link * link)1456 _nvlink_lib_ctrl_device_discover_peer_link
1457 (
1458 nvlink_link *link
1459 )
1460 {
1461 NvlStatus status = NVL_SUCCESS;
1462
1463 //
1464 // If the link succeeds rxDet(link is in HS, SAFE, or SLEEP mode) then go through and find its
1465 // peer link. What is important is not actually finding the link, but making sure the corelib
1466 // goes through the discovery process and has endpoints cache the remote information in the corelib
1467 // such that FM or endpoints can query the corelib for the topology of the system.
1468 //
1469 NvU64 linkMode = NVLINK_LINKSTATE_OFF;
1470 status = link->link_handlers->get_dl_link_mode(link, &linkMode);
1471 if (status != NVL_SUCCESS)
1472 {
1473 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1474 "%s: Unable to get link mode for %s:%s\n",
1475 __FUNCTION__, link->dev->deviceName, link->linkName));
1476 return status;
1477 }
1478
1479 if ((linkMode == NVLINK_LINKSTATE_SAFE) ||
1480 (linkMode == NVLINK_LINKSTATE_HS) ||
1481 (linkMode == NVLINK_LINKSTATE_SLEEP))
1482 {
1483 nvlink_link *remoteLink = NULL;
1484 nvlink_core_discover_and_get_remote_end(link, &remoteLink, 0);
1485 if (remoteLink == NULL)
1486 {
1487 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_INFO,
1488 "%s: link 0x%x: couldn't find link pair! Possible that other device queries need to finish before there is a found connection in the corelib\n",
1489 __FUNCTION__, link->linkNumber));
1490 }
1491 }
1492
1493 return NVL_SUCCESS;
1494 }
1495
1496 /**
 * Read the SIDs for the local and remote devices
1498 *
1499 * @param[in] readParams IOCTL params
1500 *
1501 * return NvlStatus
1502 */
static NvlStatus
nvlink_lib_ctrl_device_read_sids
(
    nvlink_device_read_sids *readParams
)
{
    NvlStatus status = NVL_SUCCESS;
    nvlink_device *dev = NULL;
    nvlink_link *link = NULL;
    NvU32 numLinks = 0;
    NvU32 i = 0;
    NvU32 numEntries = 0;

    //
    // Scratch array of candidate links; sized for the system-wide maximum
    // so any single device's link count fits.
    //
    nvlink_link **links = (nvlink_link **)nvlink_malloc(
        sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
    {
        return NVL_NO_MEM;
    }

    // Initialize number of SIDs read to 0
    readParams->numEntries = 0;

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        nvlink_free((void *)links);
        return status;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device
    // and link lists
    //

    // look-up user requested nvlink device object
    nvlink_core_get_device_by_devinfo(&readParams->devInfo, &dev);
    if (dev == NULL)
    {
        //
        // Couldn't find the device ptr in the core library. Release the
        // top-level lock and return
        //
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return NVL_BAD_ARGS;
    }

    // Collect only the links that do not yet have a discovered connection
    FOR_EACH_LINK_REGISTERED(link, dev, node)
    {
        nvlink_intranode_conn *conn = NULL;

        nvlink_core_get_intranode_conn(link, &conn);
        if (conn != NULL)
        {
            // skip the SID read if we already have a connection for the link
            continue;
        }

        if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
                __FUNCTION__));

            nvlink_assert(0);

            // Release the top-level lock and free links
            nvlink_lib_top_lock_release();
            nvlink_free((void *)links);
            return NVL_ERR_INVALID_STATE;
        }

        links[numLinks] = link;
        numLinks++;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    for (i = 0; i < numLinks; i++)
    {
        // ALI specific handling to update corelib structures and verify link status
        if (dev->enableALI)
        {
            status = _nvlink_lib_ctrl_device_discover_peer_link(links[i]);
            if (status != NVL_SUCCESS)
            {
                // Release the per-link locks and free links
                nvlink_lib_link_locks_release(links, numLinks);
                nvlink_free((void *)links);
                return status;
            }
        }

        // Fill-up the local/remote link numbers and SIDs
        readParams->sidInfo[numEntries].localLinkSid  = links[i]->localSid;
        readParams->sidInfo[numEntries].remoteLinkSid = links[i]->remoteSid;
        readParams->sidInfo[numEntries].localLinkNum  = links[i]->linkNumber;
        readParams->sidInfo[numEntries].remoteLinkNum = links[i]->remoteLinkId;
        numEntries++;
    }

    // update total number of entries read
    readParams->numEntries = numEntries;

    // Release the per-link locks
    nvlink_lib_link_locks_release(links, numLinks);

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }
    return NVL_SUCCESS;
}
1642
1643 /**
1644 * Discover all the intranode connections from the core library
1645 *
1646 * @param[in] connParams IOCTL params
1647 *
1648 * return NvlStatus
1649 */
static NvlStatus
nvlink_lib_ctrl_discover_intranode_conns
(
    nvlink_discover_intranode_conns *connParams
)
{
    NvlStatus status = NVL_SUCCESS;
    nvlink_device *dev = NULL;
    nvlink_link *link = NULL;
    NvU32 numLinks = 0;

    // Scratch array holding every registered link in the system
    nvlink_link **links = (nvlink_link **)nvlink_malloc(
        sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
    {
        return NVL_NO_MEM;
    }

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        nvlink_free((void *)links);
        return status;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device
    // and link lists
    //

    // Gather all links of all registered devices for lock acquisition
    FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
    {
        FOR_EACH_LINK_REGISTERED(link, dev, node)
        {
            if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
            {
                NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                    "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
                    __FUNCTION__));

                nvlink_assert(0);

                // Release the top-level lock and free links
                nvlink_lib_top_lock_release();
                nvlink_free((void *)links);
                return NVL_ERR_INVALID_STATE;
            }

            links[numLinks] = link;
            numLinks++;
        }
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Note: We will still need to hold the top-level lock, because we might have
    // to add connections to the intranode connections list if any case new
    // intranode connection is discovered
    //

    FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
    {
        FOR_EACH_LINK_REGISTERED(link, dev, node)
        {
            NvU64 writeToken = 0;
            nvlink_intranode_conn *conn = NULL;

            nvlink_core_get_intranode_conn(link, &conn);
            if (conn != NULL)
            {
                // skip token write if we already have a connection for the link
                continue;
            }

            if (!link->bRxDetected)
            {
                // If receiver detect has failed, then there is no connection
                continue;
            }

            // ALI specific handling to update corelib structures and verify link status
            if (dev->enableALI)
            {
                status = _nvlink_lib_ctrl_device_discover_peer_link(link);
                if (status != NVL_SUCCESS)
                {
                    // Release the per-link locks
                    nvlink_lib_link_locks_release(links, numLinks);

                    // Release the top-level lock
                    nvlink_lib_top_lock_release();
                    nvlink_free((void *)links);
                    return status;
                }
            }

            writeToken = nvlink_core_get_link_discovery_token(link);

            //
            // Pre-3.0 links (or links whose SIDs are unpopulated) must
            // correlate connections via token write/read; 3.0+ links with
            // valid SIDs can correlate directly from the SID values.
            //
            if ((link->version < NVLINK_DEVICE_VERSION_30) ||
                ((link->localSid == 0) || (link->remoteSid == 0)))
            {
                nvlink_core_write_link_discovery_token(link, writeToken);

                // wrote a token. read back tokens from all links and create connection
                nvlink_core_correlate_conn_by_token(link, writeToken, NV_FALSE);
            }
            else
            {
                // From 3.0 we rely on Sid values. So send skiptoken as true.
                nvlink_core_correlate_conn_by_token(link, writeToken, NV_TRUE);
            }
        }
    }

    // Release the per-link locks
    nvlink_lib_link_locks_release(links, numLinks);

    // Release the top-level lock
    nvlink_lib_top_lock_release();

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }
    return NVL_SUCCESS;
}
1796
1797 /**
1798 * Get the intranode connections from the core library
1799 *
1800 * @param[in] getParams IOCTL params
1801 *
1802 * return NvlStatus
1803 */
1804 static NvlStatus
nvlink_lib_ctrl_device_get_intranode_conns(nvlink_device_get_intranode_conns * getParams)1805 nvlink_lib_ctrl_device_get_intranode_conns
1806 (
1807 nvlink_device_get_intranode_conns *getParams
1808 )
1809 {
1810 NvlStatus status = NVL_SUCCESS;
1811 nvlink_device *dev = NULL;
1812 NvU32 numConns = 0;
1813 nvlink_intranode_conn *conn = NULL;
1814
1815 // Initialize number of connections to 0
1816 getParams->numConnections = 0;
1817
1818 // Acquire the top-level lock
1819 status = nvlink_lib_top_lock_acquire();
1820 if (status != NVL_SUCCESS)
1821 {
1822 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1823 "%s: Failed to acquire top-level lock\n",
1824 __FUNCTION__));
1825
1826 return status;
1827 }
1828
1829 //
1830 // Top-level lock is now acquired. Proceed to traversing the device
1831 // and link lists
1832 //
1833
1834 // look-up user requested nvlink device object
1835 nvlink_core_get_device_by_devinfo(&getParams->devInfo, &dev);
1836 if (dev == NULL)
1837 {
1838 //
1839 // Couldn't find the device ptr in the core library. Release the
1840 // top-level lock and return
1841 //
1842 nvlink_lib_top_lock_release();
1843
1844 return NVL_BAD_ARGS;
1845 }
1846
1847 FOR_EACH_CONNECTION(conn, nvlinkLibCtx.nv_intraconn_head, node)
1848 {
1849 //
1850 // copy connection information if source or destination device of
1851 // this connection belong to the nvlink device specified by user
1852 //
1853 if ((conn->end0->dev == dev) || (conn->end1->dev == dev))
1854 {
1855 // total number of connections should be within NVLINK_MAX_DEVICE_CONN
1856 if (numConns >= NVLINK_MAX_DEVICE_CONN)
1857 {
1858 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1859 "%s: numConns >= NVLINK_MAX_DEVICE_CONN\n",
1860 __FUNCTION__));
1861
1862 nvlink_assert(0);
1863
1864 nvlink_lib_top_lock_release();
1865 return NVL_ERR_INVALID_STATE;
1866 }
1867
1868 // copy source endpoint information
1869 nvlink_core_copy_endpoint_info(conn->end0, &getParams->conn[numConns].srcEndPoint);
1870
1871 // copy destination endpoint information
1872 nvlink_core_copy_endpoint_info(conn->end1, &getParams->conn[numConns].dstEndPoint);
1873
1874 numConns++;
1875 }
1876 }
1877
1878 getParams->numConnections = numConns;
1879
1880 // Release the top-level lock
1881 nvlink_lib_top_lock_release();
1882
1883 return NVL_SUCCESS;
1884 }
1885
1886 /**
1887 * Add a discovered internode connection
1888 *
1889 * @param[in] addParams IOCTL params
1890 *
1891 * return NvlStatus
1892 */
1893 static NvlStatus
nvlink_lib_ctrl_add_internode_conn(nvlink_add_internode_conn * addParams)1894 nvlink_lib_ctrl_add_internode_conn
1895 (
1896 nvlink_add_internode_conn *addParams
1897 )
1898 {
1899 nvlink_link *localLink = NULL;
1900 nvlink_intranode_conn *intraConn = NULL;
1901 NvlStatus status = NVL_SUCCESS;
1902
1903 // Acquire the top-level lock
1904 status = nvlink_lib_top_lock_acquire();
1905 if (status != NVL_SUCCESS)
1906 {
1907 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1908 "%s: Failed to acquire top-level lock\n",
1909 __FUNCTION__));
1910
1911 return status;
1912 }
1913
1914 //
1915 // Top-level lock is now acquired. Proceed to traversing the device
1916 // and link lists
1917 //
1918
1919 // make sure that this connection is multi-node
1920 if (addParams->localEndPoint.nodeId == addParams->remoteEndPoint.nodeId)
1921 {
1922 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1923 "%s: Internode connection add with same node id for local and remote endpoint\n",
1924 __FUNCTION__));
1925
1926 // Release the top-level lock
1927 nvlink_lib_top_lock_release();
1928
1929 return NVL_BAD_ARGS;
1930 }
1931
1932 // validate the remote endpoint device type information
1933 if (!nvlink_core_is_supported_device_type(addParams->remoteEndPoint.devType))
1934 {
1935 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1936 "%s: Internode connection add with invalid remote device type\n",
1937 __FUNCTION__));
1938
1939 // Release the top-level lock
1940 nvlink_lib_top_lock_release();
1941
1942 return NVL_BAD_ARGS;
1943 }
1944
1945 //
1946 // look-up the nvlink link objects. Look-up will fail if there is a
1947 // fabric node id mismatch. So an explicit check against self
1948 // node id is not required.
1949 //
1950 nvlink_core_get_link_by_endpoint(&addParams->localEndPoint, &localLink);
1951 if (localLink == NULL)
1952 {
1953 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1954 "%s: Internode connection add with no matching local endpoint\n",
1955 __FUNCTION__));
1956
1957 //
1958 // Couldn't find the endpoint registered in the core library. Release the
1959 // top-level lock and return
1960 //
1961 nvlink_lib_top_lock_release();
1962
1963 return NVL_BAD_ARGS;
1964 }
1965
1966 // can't add internode connection if we have an intranode connection
1967 nvlink_core_get_intranode_conn(localLink, &intraConn);
1968 if (intraConn != NULL)
1969 {
1970 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
1971 "%s: Found an intranode connection while adding internode connection\n",
1972 __FUNCTION__));
1973
1974 // Release the top-level lock
1975 nvlink_lib_top_lock_release();
1976
1977 return NVL_BAD_ARGS;
1978 }
1979
1980 // all the sanity check passed, add this internode connection in our context
1981 status = nvlink_core_add_internode_conn(localLink, &addParams->remoteEndPoint);
1982
1983 // Release the top-level lock
1984 nvlink_lib_top_lock_release();
1985
1986 return status;
1987 }
1988
1989 /**
1990 * Remove an internode connection from the list
1991 *
1992 * @param[in] removeParams IOCTL params
1993 *
1994 * return NvlStatus
1995 */
1996 static NvlStatus
nvlink_lib_ctrl_remove_internode_conn(nvlink_remove_internode_conn * removeParams)1997 nvlink_lib_ctrl_remove_internode_conn
1998 (
1999 nvlink_remove_internode_conn *removeParams
2000 )
2001 {
2002 nvlink_link *localLink = NULL;
2003 NvlStatus status = NVL_SUCCESS;
2004
2005 // Acquire the top-level lock
2006 status = nvlink_lib_top_lock_acquire();
2007 if (status != NVL_SUCCESS)
2008 {
2009 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2010 "%s: Failed to acquire top-level lock\n",
2011 __FUNCTION__));
2012
2013 return status;
2014 }
2015
2016 //
2017 // Top-level lock is now acquired. Proceed to traversing the device
2018 // and link lists
2019 //
2020
2021 //
2022 // look-up the nvlink link objects. Look-up will fail if there is a
2023 // fabric node id mismatch. So an explicit check against self
2024 // node id is not required.
2025 //
2026 nvlink_core_get_link_by_endpoint(&removeParams->localEndPoint, &localLink);
2027 if (localLink == NULL)
2028 {
2029 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2030 "%s: Internode connection remove with no matching local endpoint\n",
2031 __FUNCTION__));
2032
2033 //
2034 // Couldn't find the endpoint registered in the core library. Release the
2035 // top-level lock and return
2036 //
2037 nvlink_lib_top_lock_release();
2038
2039 return NVL_BAD_ARGS;
2040 }
2041
2042 // Acquire the per-link lock
2043 status = nvlink_lib_link_locks_acquire(&localLink, 1);
2044 if (status != NVL_SUCCESS)
2045 {
2046 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2047 "%s: Failed to acquire per-link lock\n",
2048 __FUNCTION__));
2049
2050 // Release the top-level lock
2051 nvlink_lib_top_lock_release();
2052
2053 return status;
2054 }
2055
2056 // all the sanity check passed, remove this internode connection from our context
2057 nvlink_core_remove_internode_conn(localLink);
2058
2059 // Release the per-link lock
2060 nvlink_lib_link_locks_release(&localLink, 1);
2061
2062 // Release the top-level lock
2063 nvlink_lib_top_lock_release();
2064
2065 return NVL_SUCCESS;
2066 }
2067
2068 /**
2069 * Train the intranode connection to the desired target state
2070 *
2071 * @param[in] trainParams IOCTL params
2072 *
2073 * return NvlStatus
2074 */
static NvlStatus
nvlink_lib_ctrl_train_intranode_conn
(
    nvlink_train_intranode_conn *trainParams
)
{
    nvlink_link *srcLink = NULL;
    nvlink_link *dstLink = NULL;
    nvlink_link *initLinks[2] = {0};
    nvlink_intranode_conn *conn = NULL;
    NvlStatus status = NVL_SUCCESS;
    NvU32 count;
    NvU32 i;

    // make sure that this call is for single node systems
    if (trainParams->srcEndPoint.nodeId != trainParams->dstEndPoint.nodeId)
    {
        return NVL_BAD_ARGS;
    }

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        return status;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device
    // and link lists
    //

    //
    // look-up the nvlink link objects. Look-up will fail if there is a
    // fabric node id mismatch. So an explicit check against self
    // node id is not required.
    //
    nvlink_core_get_link_by_endpoint(&trainParams->srcEndPoint, &srcLink);
    nvlink_core_get_link_by_endpoint(&trainParams->dstEndPoint, &dstLink);

    // we can't train if both ends are not found
    if ((srcLink == NULL) || (dstLink == NULL))
    {
        //
        // Couldn't find the endpoints registered in the core library. Release
        // the top-level lock and return
        //
        nvlink_lib_top_lock_release();

        return NVL_BAD_ARGS;
    }

    // look-up the nvlink connection object by source link
    nvlink_core_get_intranode_conn(srcLink, &conn);
    if (conn == NULL)
    {
        //
        // Couldn't find an associated connection for the 2 endpoints. Release
        // the top-level lock and return
        //
        nvlink_lib_top_lock_release();

        return NVL_BAD_ARGS;
    }

    //
    // we found the connection by the source link. Make sure that dest link is
    // indeed, the user specified one as well
    //
    if ((conn->end0 != dstLink) && (conn->end1 != dstLink))
    {
        //
        // The dest endpoint is not the remote end for the src endpoint. Release
        // the top-level lock and return
        //
        nvlink_lib_top_lock_release();

        return NVL_BAD_ARGS;
    }

    initLinks[0] = conn->end0;
    initLinks[1] = conn->end1;

    // If loopback then only pass in 1 link
    if (conn->end0 != conn->end1)
    {
        count = 2;
    }
    else
    {
        count = 1;
    }

    //
    // Acquire the per-link locks.
    // NOTE(review): the lock acquire/release below always uses 2 entries,
    // even for loopback where initLinks[0] == initLinks[1] and count == 1.
    // This presumably relies on the lock helper tolerating duplicate
    // entries — confirm against nvlink_lib_link_locks_acquire.
    //
    status = nvlink_lib_link_locks_acquire(initLinks, 2);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    // the connection looks sane, initiate the training
    switch (trainParams->trainTo)
    {
        case nvlink_train_conn_off_to_swcfg:
        {
            if (srcLink->version >= NVLINK_DEVICE_VERSION_40)
            {
                // non-ALI training for NVLink4.0+
                if (!srcLink->dev->enableALI)
                {
                    nvlink_core_init_links_from_off_to_swcfg_non_ALI(
                        initLinks, count, NVLINK_STATE_CHANGE_SYNC);
                }
            }
            else
            {
                // ALT training for NVLink3.0+
                nvlink_core_init_links_from_off_to_swcfg(
                    initLinks, count, NVLINK_STATE_CHANGE_SYNC);
            }
            break;
        }
        case nvlink_train_conn_swcfg_to_active:
        {
            if (srcLink->version >= NVLINK_DEVICE_VERSION_40)
            {
                // non-ALI training for NVLink4.0+
                if (!srcLink->dev->enableALI)
                {
                    status = nvlink_core_train_intranode_conns_from_swcfg_to_active_non_ALI(
                        &conn, 1, NVLINK_STATE_CHANGE_SYNC);
                }
            }
            else if (srcLink->version >= NVLINK_DEVICE_VERSION_30)
            {
                // ALT training for NVLink3.0+
                status = nvlink_core_train_intranode_conns_from_swcfg_to_active_ALT(
                    &conn, 1, NVLINK_STATE_CHANGE_SYNC);
            }
            else
            {
                // Legacy training for pre-NVLink3.0
                status = nvlink_core_train_intranode_conns_from_swcfg_to_active_legacy(
                    &conn, 1, NVLINK_STATE_CHANGE_SYNC);
            }
            break;
        }
        case nvlink_train_conn_active_to_swcfg:
        {
            status = nvlink_core_powerdown_intranode_conns_from_active_to_swcfg(
                &conn, 1, NVLINK_STATE_CHANGE_SYNC);
            break;
        }
        case nvlink_train_conn_to_off:
        case nvlink_train_conn_swcfg_to_off:
        {
            // Power the connection down, then reset it if the powerdown succeeded
            status = nvlink_core_powerdown_intranode_conns_from_active_to_off(
                &conn, 1, NVLINK_STATE_CHANGE_SYNC);
            if (status == NVL_SUCCESS)
            {
                nvlink_core_reset_intranode_conns(&conn, 1, NVLINK_STATE_CHANGE_SYNC);
            }
            break;
        }
        case nvlink_train_conn_off_to_active_ali_non_blocking:
        case nvlink_train_conn_off_to_active_ali_blocking:
        {
            // ALI training only applies to NVLink4.0+ devices with ALI enabled
            if (srcLink->version >= NVLINK_DEVICE_VERSION_40 &&
                srcLink->dev->enableALI)
            {
                status = nvlink_core_train_intranode_conns_from_off_to_active_ALI(initLinks, count);

                if (trainParams->trainTo == nvlink_train_conn_off_to_active_ali_blocking)
                {
                    // Blocking variant: poll until the links report ready or the timeout expires
                    NvU32 timeout = NVLINK_TRANSITION_HS_TIMEOUT;
                    do
                    {
                        nvlink_sleep(1);
                        status = nvlink_core_train_check_link_ready_ALI(initLinks, count);
                        if (status == NVL_SUCCESS)
                        {
                            break;
                        }

                        timeout--;
                    } while(timeout > 0);

                    if (status == NVL_SUCCESS)
                    {
                        for ( i = 0; i < count; ++i)
                        {
                            //
                            // NVLINK_LINKSTATE_TRAFFIC_SETUP will make sure a request to active completes before
                            // setting buffer ready so use the internal check to see if the request for ALI completed
                            //
                            (void)initLinks[i]->link_handlers->set_dl_link_mode(initLinks[i], NVLINK_LINKSTATE_TRAFFIC_SETUP, 0);
                        }
                    }
                }
            }
            break;
        }
        default:
        {
            status = NVL_BAD_ARGS;
            break;
        }
    }

    //
    // always get the latest link state values so that
    // user has additional information other than just the return value.
    //
    nvlink_core_get_endpoint_state(conn->end0, &trainParams->srcEndState);
    nvlink_core_get_endpoint_state(conn->end1, &trainParams->dstEndState);

    // Release the per-link locks
    nvlink_lib_link_locks_release(initLinks, 2);

    return status;
}
2313
2314 /**
2315 * Train the intranode connections in parallel to the desired target state
2316 *
2317 * @param[in] trainParams IOCTL params
2318 *
2319 * return NvlStatus
2320 */
static NvlStatus
nvlink_lib_ctrl_train_intranode_conns_parallel
(
    nvlink_train_intranode_conns_parallel *trainParams
)
{
    nvlink_link *srcLink = NULL;
    nvlink_link *dstLink = NULL;
    nvlink_link **trainLinks = NULL;    // one entry per connection (src side only)
    nvlink_link **initLinks = NULL;     // one entry per distinct endpoint (1 for loopback, 2 otherwise)
    nvlink_intranode_conn **conns = NULL;
    NvU32 numConns = 0;
    NvlStatus status = NVL_SUCCESS;
    NvU32 i;
    NvU32 count = 0;

    // sanity check endPointPairsCount
    if (trainParams->endPointPairsCount > NVLINK_MAX_PARALLEL_CONNS_TRAIN_COUNT)
    {
        return NVL_BAD_ARGS;
    }

    //
    // sanity check the input parms
    // make sure that this call is for single node systems
    //
    numConns = trainParams->endPointPairsCount;
    for (i = 0; i < numConns; i++)
    {
        if (trainParams->endPointPairs[i].src.nodeId !=
            trainParams->endPointPairs[i].dst.nodeId)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: Node index 0x%x with mis-match ids (src:0x%x dst:0x%x).\n",
                __FUNCTION__ , i,
                trainParams->endPointPairs[i].src.nodeId,
                trainParams->endPointPairs[i].dst.nodeId));

            return NVL_BAD_ARGS;
        }
        //
        // Count the number of distinct link endpoints: a loopback pair
        // (same BDF and same link index on both ends) contributes one
        // endpoint, any other pair contributes two. This sizes initLinks.
        //
        if ((trainParams->endPointPairs[i].src.pciInfo.bus == trainParams->endPointPairs[i].dst.pciInfo.bus) &&
            (trainParams->endPointPairs[i].src.pciInfo.device == trainParams->endPointPairs[i].dst.pciInfo.device) &&
            (trainParams->endPointPairs[i].src.pciInfo.function == trainParams->endPointPairs[i].dst.pciInfo.function) &&
            (trainParams->endPointPairs[i].src.linkIndex == trainParams->endPointPairs[i].dst.linkIndex))
        {
            count++;
        }
        else
        {
            count = count + 2;
        }
    }

    // Allocate space for the connection list
    conns = (nvlink_intranode_conn **)nvlink_malloc(
        sizeof(nvlink_intranode_conn *) * numConns);
    if (conns == NULL)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to allocate space for connections list\n",
            __FUNCTION__));

        status = NVL_ERR_GENERIC;
        goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
    }

    // Allocate space for the links list for link initialization
    initLinks = (nvlink_link **)nvlink_malloc(sizeof(nvlink_link *) * count);
    if (initLinks == NULL)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to allocate space for links list for link initialization\n",
            __FUNCTION__));

        status = NVL_ERR_GENERIC;
        goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
    }

    // Allocate space for the links list for link training
    trainLinks = (nvlink_link **)nvlink_malloc(sizeof(nvlink_link *) * numConns);
    if (trainLinks == NULL)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to allocate space for links list for link training\n",
            __FUNCTION__));

        status = NVL_ERR_GENERIC;
        goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
    }

    nvlink_memset(conns, 0, sizeof(nvlink_intranode_conn *) * numConns);
    nvlink_memset(initLinks, 0, sizeof(nvlink_link *) * count);
    nvlink_memset(trainLinks, 0, sizeof(nvlink_link *) * numConns);

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device and
    // link lists and connections list
    //
    // Re-count while populating initLinks; the second pass must match the
    // sizing pass above (loopback = 1 entry, otherwise 2).
    //
    count = 0;
    // Get all the connections associated with the list of links
    for (i = 0; i < numConns; i++)
    {
        //
        // look-up the nvlink link objects. Look-up will fail if there is a
        // fabric node id mismatch. So an explicit check against self
        // node id is not required.
        //
        srcLink = NULL;
        dstLink = NULL;

        nvlink_core_get_link_by_endpoint(&trainParams->endPointPairs[i].src, &srcLink);
        nvlink_core_get_link_by_endpoint(&trainParams->endPointPairs[i].dst, &dstLink);

        // we can't train if both ends of a pair not found
        if ((srcLink == NULL) || (dstLink == NULL))
        {
            //
            // Couldn't find the endpoints registered in the core library. Release
            // the top-level lock and return
            //
            nvlink_lib_top_lock_release();

            status = NVL_BAD_ARGS;
            goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
        }

        // look-up the nvlink connection object by source link
        nvlink_core_get_intranode_conn(srcLink, &conns[i]);
        if (conns[i] == NULL)
        {
            //
            // Couldn't find an associated connection for the 2 endpoints. Release
            // the top-level lock and return
            //
            nvlink_lib_top_lock_release();

            status = NVL_BAD_ARGS;
            goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
        }

        //
        // we found the connection by source link. Make sure that dest link is
        // indeed, the user specified one as well
        //
        if ((conns[i]->end0 != dstLink) && (conns[i]->end1 != dstLink))
        {
            //
            // The dest endpoint is not the remote end for the src endpoint. Release
            // the top-level lock and return
            //
            nvlink_lib_top_lock_release();

            status = NVL_BAD_ARGS;
            goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
        }
        if (srcLink == dstLink)
        {
            initLinks[count] = srcLink;
            count++;
        }
        else
        {
            initLinks[count] = srcLink;
            initLinks[count + 1] = dstLink;
            count = count + 2;
        }
        trainLinks[i] = srcLink;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(initLinks, count);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    // Check all the links captured have version >= 3.0
    for (i = 0; i < numConns; i++)
    {
        // Parallel training allowed NvLink 3.0 & above
        if ((conns[i]->end0->version < NVLINK_DEVICE_VERSION_30) ||
            (conns[i]->end1->version < NVLINK_DEVICE_VERSION_30))
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: Parallel training not allowed with nvlink version 0x%x indexed 0x%x\n",
                __FUNCTION__ ,
                conns[i]->end0->version, i));

            //
            // Parallel training is allowed for only NVLink 3.0 and above. Release
            // the per link locks and return
            //
            nvlink_lib_link_locks_release(initLinks, count);

            status = NVL_BAD_ARGS;
            goto nvlink_lib_ctrl_train_intranode_conns_parallel_end;
        }
    }

    //
    // the connection looks sane, initiate the training
    //
    // NOTE(review): srcLink below still points at the LAST pair processed in
    // the loop above; the version/enableALI checks assume all pairs share the
    // same device generation and ALI setting — TODO confirm that callers
    // guarantee homogeneous pair lists.
    //
    switch (trainParams->trainTo)
    {
        case nvlink_train_conn_off_to_swcfg:
        {
            if (srcLink->version >= NVLINK_DEVICE_VERSION_40)
            {
                // non-ALI training for NVLink4.0+
                if (!srcLink->dev->enableALI)
                {
                    nvlink_core_init_links_from_off_to_swcfg_non_ALI(
                        initLinks, count, NVLINK_STATE_CHANGE_SYNC);
                }
            }
            else
            {
                // ALT training for NVLink3.0+
                nvlink_core_init_links_from_off_to_swcfg(
                    initLinks, count, NVLINK_STATE_CHANGE_SYNC);
            }
            break;
        }
        case nvlink_train_conn_swcfg_to_active:
        {
            if (srcLink->version >= NVLINK_DEVICE_VERSION_40)
            {
                // non-ALI training for NVLink4.0+
                if (!srcLink->dev->enableALI)
                {
                    status = nvlink_core_train_intranode_conns_from_swcfg_to_active_non_ALI(
                        conns, numConns, NVLINK_STATE_CHANGE_SYNC);
                }
            }
            else
            {
                // ALT training for NVLink3.0+
                status = nvlink_core_train_intranode_conns_from_swcfg_to_active_ALT(
                    conns, numConns, NVLINK_STATE_CHANGE_SYNC);
            }
            break;
        }
        case nvlink_train_conn_active_to_swcfg:
        {
            status = nvlink_core_powerdown_intranode_conns_from_active_to_swcfg(
                conns, numConns, NVLINK_STATE_CHANGE_SYNC);
            break;
        }
        case nvlink_train_conn_to_off:
        case nvlink_train_conn_swcfg_to_off:
        {
            status = nvlink_core_powerdown_intranode_conns_from_active_to_off(
                conns, numConns, NVLINK_STATE_CHANGE_SYNC);
            // Only reset the connections once they are all powered down
            if (status == NVL_SUCCESS)
            {
                nvlink_core_reset_intranode_conns(conns, numConns, NVLINK_STATE_CHANGE_SYNC);
            }
            break;
        }
        case nvlink_train_conn_off_to_active_ali_non_blocking:
        case nvlink_train_conn_off_to_active_ali_blocking:
        {
            // ALI training requires NVLink 4.0+ with ALI enabled on the device
            if (srcLink->version >= NVLINK_DEVICE_VERSION_40 &&
                srcLink->dev->enableALI)
            {
                status = nvlink_core_train_intranode_conns_from_off_to_active_ALI(
                    initLinks, count);

                if (trainParams->trainTo == nvlink_train_conn_off_to_active_ali_blocking)
                {
                    // Blocking variant: poll (1 ms per iteration) until the
                    // links report ready or the timeout budget is exhausted.
                    NvU32 timeout = NVLINK_TRANSITION_HS_TIMEOUT;
                    do
                    {
                        nvlink_sleep(1);
                        status = nvlink_core_train_check_link_ready_ALI(initLinks, count);
                        if (status == NVL_SUCCESS)
                        {
                            break;
                        }

                        timeout--;
                    } while(timeout > 0);

                    if (status == NVL_SUCCESS)
                    {
                        for ( i = 0; i < count; ++i)
                        {
                            //
                            // NVLINK_LINKSTATE_TRAFFIC_SETUP will make sure a request to active completes before
                            // setting buffer ready so use the internal check to see if the request for ALI completed
                            //
                            (void)initLinks[i]->link_handlers->set_dl_link_mode(initLinks[i], NVLINK_LINKSTATE_TRAFFIC_SETUP, 0);
                        }
                    }
                }
            }
            break;
        }
        default:
        {
            status = NVL_BAD_ARGS;
            break;
        }
    }

    //
    // always get the latest link state values when the args are verified
    // so that user has additional information other than just the return value.
    //
    for (i = 0; i < numConns; i++)
    {
        nvlink_core_get_endpoint_state(conns[i]->end0, &trainParams->endpointPairsStates[i].srcEnd);
        nvlink_core_get_endpoint_state(conns[i]->end1, &trainParams->endpointPairsStates[i].dstEnd);
    }

    // Release the per-link locks
    nvlink_lib_link_locks_release(initLinks, count);

    // Common exit: no locks are held on any path that reaches this label
nvlink_lib_ctrl_train_intranode_conns_parallel_end:

    if (conns != NULL)
    {
        nvlink_free((void *)conns);
    }

    if (initLinks != NULL)
    {
        nvlink_free((void *)initLinks);
    }

    if (trainLinks != NULL)
    {
        nvlink_free((void *)trainLinks);
    }

    return status;
}
2680
2681 /**
2682 * Train the internode connection link to the target state
2683 *
2684 * @param[in] linkParams IOCTL params
2685 *
2686 * return NvlStatus
2687 */
2688 static NvlStatus
nvlink_lib_ctrl_train_internode_conn_link(nvlink_train_internode_conn_link * linkParams)2689 nvlink_lib_ctrl_train_internode_conn_link
2690 (
2691 nvlink_train_internode_conn_link *linkParams
2692 )
2693 {
2694 nvlink_link *localLink = NULL;
2695 NvlStatus status = NVL_SUCCESS;
2696 nvlink_internode_conn *interConn = NULL;
2697
2698 // Acquire the top-level lock
2699 status = nvlink_lib_top_lock_acquire();
2700 if (status != NVL_SUCCESS)
2701 {
2702 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2703 "%s: Failed to acquire top-level lock\n",
2704 __FUNCTION__));
2705
2706 return status;
2707 }
2708
2709 //
2710 // Top-level lock is now acquired. Proceed to traversing the device and
2711 // link lists and connections list
2712 //
2713
2714 //
2715 // look-up the nvlink link objects. Look-up will fail if there is a
2716 // fabric node id mismatch. So an explicit check against self
2717 // node id is not required.
2718 //
2719 nvlink_core_get_link_by_endpoint(&linkParams->localEndPoint, &localLink);
2720
2721 // user specified link is not available
2722 if (localLink == NULL)
2723 {
2724 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2725 "%s: Internode connection link train request with no matching local endpoint\n",
2726 __FUNCTION__));
2727
2728 //
2729 // Couldn't find the endpoint registered in the core library. Release
2730 // the top-level lock and return
2731 //
2732 nvlink_lib_top_lock_release();
2733
2734 return NVL_BAD_ARGS;
2735 }
2736
2737 nvlink_core_get_internode_conn(localLink, &interConn);
2738 if (interConn == NULL)
2739 {
2740 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2741 "%s: No Internode connection exists for local endpoint %s: %s.\n",
2742 __FUNCTION__, localLink->dev->deviceName, localLink->linkName));
2743
2744 //
2745 // Couldn't find an associated connection for the endpoint. Release
2746 // the top-level lock and return
2747 //
2748 nvlink_lib_top_lock_release();
2749
2750 return NVL_BAD_ARGS;
2751 }
2752
2753 // Acquire the per-link lock
2754 status = nvlink_lib_link_locks_acquire(&localLink, 1);
2755 if (status != NVL_SUCCESS)
2756 {
2757 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2758 "%s: Failed to acquire per-link locks\n",
2759 __FUNCTION__));
2760
2761 // Release the top-level lock
2762 nvlink_lib_top_lock_release();
2763
2764 return status;
2765 }
2766
2767 //
2768 // All the required per-link locks are now successfully acquired
2769 // Release the top level-lock
2770 //
2771 nvlink_lib_top_lock_release();
2772
2773
2774 switch (linkParams->trainTo)
2775 {
2776 case nvlink_train_link_off_to_swcfg:
2777 {
2778 // OFF to SAFE is part of initialization sequence as of now.
2779 status = NVL_BAD_ARGS;
2780 break;
2781 }
2782 case nvlink_train_link_swcfg_to_active:
2783 {
2784 status = nvlink_core_train_internode_conns_from_swcfg_to_active(
2785 &interConn, 1, &linkParams->isMasterEnd, NVLINK_STATE_CHANGE_SYNC);
2786 break;
2787 }
2788 case nvlink_train_link_to_off:
2789 {
2790 // OFF state transitions are not supported/tested
2791 status = NVL_BAD_ARGS;
2792 break;
2793 }
2794 case nvlink_train_link_active_to_swcfg:
2795 {
2796 // not implemented/supported now
2797 status = NVL_BAD_ARGS;
2798 break;
2799 }
2800 case nvlink_train_link_swcfg_to_off:
2801 {
2802 // OFF state transitions are not supported/tested
2803 status = NVL_BAD_ARGS;
2804 break;
2805 }
2806 default:
2807 {
2808 status = NVL_BAD_ARGS;
2809 break;
2810 }
2811 }
2812
2813 //
2814 // always get the latest link state values so that
2815 // user has additional information other than just the return value.
2816 //
2817 nvlink_core_get_endpoint_state(localLink, &linkParams->localEndState);
2818
2819 // Release the per-link lock
2820 nvlink_lib_link_locks_release(&localLink, 1);
2821
2822 return status;
2823 }
2824
2825 /*
2826 * Train the internode connection sublink to the target state
2827 *
2828 * @param[in] subLinkParams IOCTL params
2829 *
2830 * return NvlStatus
2831 */
2832 static NvlStatus
nvlink_lib_ctrl_train_internode_conn_sublink(nvlink_train_internode_conn_sublink * subLinkParams)2833 nvlink_lib_ctrl_train_internode_conn_sublink
2834 (
2835 nvlink_train_internode_conn_sublink *subLinkParams
2836 )
2837 {
2838 nvlink_link *localLink = NULL;
2839 NvlStatus status = NVL_SUCCESS;
2840 nvlink_internode_conn *interConn = NULL;
2841
2842 // Acquire the top-level lock
2843 status = nvlink_lib_top_lock_acquire();
2844 if (status != NVL_SUCCESS)
2845 {
2846 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2847 "%s: Failed to acquire top-level lock\n",
2848 __FUNCTION__));
2849
2850 return status;
2851 }
2852
2853 //
2854 // Top-level lock is now acquired. Proceed to traversing the device and
2855 // link lists and connections list
2856 //
2857
2858 //
2859 // look-up the nvlink link objects. Look-up will fail if there is a
2860 // fabric node id mismatch. So an explicit check against self
2861 // node id is not required.
2862 //
2863 nvlink_core_get_link_by_endpoint(&subLinkParams->localEndPoint, &localLink);
2864
2865 // user specified link is not available
2866 if (localLink == NULL)
2867 {
2868 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2869 "%s: Internode connection sublink train request with no matching local endpoint\n",
2870 __FUNCTION__));
2871
2872 //
2873 // Couldn't find the endpoint registered in the core library. Release
2874 // the top-level lock and return
2875 //
2876 nvlink_lib_top_lock_release();
2877
2878 return NVL_BAD_ARGS;
2879 }
2880
2881 nvlink_core_get_internode_conn(localLink, &interConn);
2882 if (interConn == NULL)
2883 {
2884 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2885 "%s: No Internode connection exists for local endpoint %s: %s.\n",
2886 __FUNCTION__, localLink->dev->deviceName, localLink->linkName));
2887
2888 //
2889 // Couldn't find an associated connection for the endpoint. Release
2890 // the top-level lock and return
2891 //
2892 nvlink_lib_top_lock_release();
2893
2894 return NVL_BAD_ARGS;
2895 }
2896
2897 // Acquire the per-link lock
2898 status = nvlink_lib_link_locks_acquire(&localLink, 1);
2899 if (status != NVL_SUCCESS)
2900 {
2901 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
2902 "%s: Failed to acquire per-link locks\n",
2903 __FUNCTION__));
2904
2905 // Release the top-level lock
2906 nvlink_lib_top_lock_release();
2907
2908 return status;
2909 }
2910
2911 //
2912 // All the required per-link locks are now successfully acquired
2913 // Release the top level-lock
2914 //
2915 nvlink_lib_top_lock_release();
2916
2917 switch (subLinkParams->trainTo)
2918 {
2919 case nvlink_train_sublink_off_to_safe:
2920 {
2921 // OFF to SAFE is part of initialization sequence as of now.
2922 status = NVL_BAD_ARGS;
2923 break;
2924 }
2925 case nvlink_train_sublink_safe_to_hs:
2926 {
2927 // NVLink 3.0 onwards this is handled through INITOPTIMIZE
2928 if (localLink->version >= NVLINK_DEVICE_VERSION_30)
2929 {
2930 return NVL_ERR_NOT_SUPPORTED;
2931 }
2932 status = nvlink_core_train_internode_conn_sublink_from_safe_to_hs(
2933 interConn, NVLINK_STATE_CHANGE_SYNC);
2934 break;
2935 }
2936 case nvlink_train_sublink_to_off:
2937 {
2938 // OFF state transitions are not supported/tested
2939 status = NVL_BAD_ARGS;
2940 break;
2941 }
2942 case nvlink_train_sublink_hs_to_safe:
2943 {
2944 // not implemented/supported now
2945 status = NVL_BAD_ARGS;
2946 break;
2947 }
2948 case nvlink_train_sublink_safe_to_off:
2949 {
2950 // OFF state transitions are not supported/tested
2951 status = NVL_BAD_ARGS;
2952 break;
2953 }
2954 default:
2955 {
2956 status = NVL_BAD_ARGS;
2957 break;
2958 }
2959 }
2960
2961 //
2962 // always get the latest link state values so that
2963 // user has additional information other than just the return value.
2964 //
2965 nvlink_core_get_endpoint_state(localLink, &subLinkParams->localEndState);
2966
2967 // Release the per-link lock
2968 nvlink_lib_link_locks_release(&localLink, 1);
2969
2970 return status;
2971 }
2972
2973 /**
2974 * Send INITOPTIMIZE on the given internode links
2975 *
2976 * @param[in] initoptimizeParams IOCTL params
2977 *
2978 * return NvlStatus
2979 */
2980 static NvlStatus
nvlink_lib_ctrl_train_internode_links_initoptimize(nvlink_train_internode_links_initoptimize * initoptimizeParams)2981 nvlink_lib_ctrl_train_internode_links_initoptimize
2982 (
2983 nvlink_train_internode_links_initoptimize *initoptimizeParams
2984 )
2985 {
2986 nvlink_link *endpoint = NULL;
2987 NvlStatus status = NVL_SUCCESS;
2988 NvU32 numLinks = 0;
2989 NvU32 i = 0;
2990
2991 nvlink_link **links = (nvlink_link **)nvlink_malloc(
2992 sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
2993 if (links == NULL)
2994 {
2995 return NVL_NO_MEM;
2996 }
2997
2998 if (initoptimizeParams->endPointCount > NVLINK_MAX_NVLINK_ENDPOINTS)
2999 {
3000 nvlink_free((void *)links);
3001 return NVL_BAD_ARGS;
3002 }
3003
3004 // Acquire the top-level lock
3005 status = nvlink_lib_top_lock_acquire();
3006 if (status != NVL_SUCCESS)
3007 {
3008 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3009 "%s: Failed to acquire top-level lock\n",
3010 __FUNCTION__));
3011
3012 nvlink_free((void *)links);
3013 return status;
3014 }
3015
3016 //
3017 // Top-level lock is now acquired. Proceed to traversing the device and
3018 // link lists and connections list
3019 //
3020
3021 for (i = 0; i < initoptimizeParams->endPointCount; i++)
3022 {
3023 endpoint = NULL;
3024 nvlink_core_get_link_by_endpoint(&initoptimizeParams->endPoints[i], &endpoint);
3025
3026 // we can't send INITOPTIMIZE if the endpoint is not found
3027 if (endpoint == NULL)
3028 {
3029 //
3030 // Couldn't find the endpoint registered in the core library. Release
3031 // the top-level lock and return
3032 //
3033 nvlink_lib_top_lock_release();
3034
3035 nvlink_free((void *)links);
3036 return NVL_BAD_ARGS;
3037 }
3038 else if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
3039 {
3040 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3041 "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
3042 __FUNCTION__));
3043
3044 nvlink_assert(0);
3045
3046 // Release the top-level lock and free links
3047 nvlink_lib_top_lock_release();
3048 nvlink_free((void *)links);
3049 return NVL_ERR_INVALID_STATE;
3050 }
3051
3052 links[numLinks] = endpoint;
3053 numLinks++;
3054 }
3055
3056 // Acquire the per-link locks
3057 status = nvlink_lib_link_locks_acquire(links, numLinks);
3058 if (status != NVL_SUCCESS)
3059 {
3060 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3061 "%s: Failed to acquire per-link locks\n",
3062 __FUNCTION__));
3063
3064 // Release the top-level lock
3065 nvlink_lib_top_lock_release();
3066
3067 nvlink_free((void *)links);
3068 return status;
3069 }
3070
3071 //
3072 // All the required per-link locks are now successfully acquired
3073 // Release the top level-lock
3074 //
3075 nvlink_lib_top_lock_release();
3076
3077 for (i = 0; i < numLinks; i++)
3078 {
3079 // INITOPTIMIZE is not supported before NVLink 3.0
3080 if (links[i]->version < NVLINK_DEVICE_VERSION_30)
3081 continue;
3082
3083 // Continue if the link is already active, nothing to do
3084 if ((nvlink_core_check_link_state(links[i], NVLINK_LINKSTATE_HS)) &&
3085 (nvlink_core_check_tx_sublink_state(links[i], NVLINK_SUBLINK_STATE_TX_HS)) &&
3086 (nvlink_core_check_rx_sublink_state(links[i], NVLINK_SUBLINK_STATE_RX_HS)))
3087 {
3088 continue;
3089 }
3090
3091 //
3092 // For INITOPTIMIZE, link should be in SWCFG, else flag error and continue
3093 // to next link
3094 //
3095 if (!((nvlink_core_check_link_state(links[i], NVLINK_LINKSTATE_SAFE)) &&
3096 (nvlink_core_check_tx_sublink_state(links[i], NVLINK_SUBLINK_STATE_TX_SAFE)) &&
3097 (nvlink_core_check_rx_sublink_state(links[i], NVLINK_SUBLINK_STATE_RX_SAFE))))
3098 {
3099 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3100 "%s: INITOPTIMIZE only works for links in SAFE %s:%s.\n",
3101 __FUNCTION__, links[i]->dev->deviceName, links[i]->linkName));
3102 continue;
3103 }
3104
3105 status = links[i]->link_handlers->set_dl_link_mode(links[i],
3106 NVLINK_LINKSTATE_INITOPTIMIZE,
3107 NVLINK_STATE_CHANGE_ASYNC);
3108
3109 // Although it failed we need to continue on other links.
3110 if (status != NVL_SUCCESS)
3111 {
3112 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3113 "%s: INITOPTIMIZE failed on Device:Link %s:%s\n",
3114 __FUNCTION__, links[i]->dev->deviceName, links[i]->linkName));
3115 }
3116 }
3117
3118 // Release the per-link locks
3119 nvlink_lib_link_locks_release(links, numLinks);
3120
3121 if (links != NULL)
3122 {
3123 nvlink_free((void *)links);
3124 }
3125 return NVL_SUCCESS;
3126 }
3127
3128 /**
3129 * Send POSTINITOPTIMIZE on the given internode links
3130 *
3131 * @param[in] initoptimizeParams IOCTL params
3132 *
3133 * return NvlStatus
3134 */
static NvlStatus
nvlink_lib_ctrl_train_internode_links_post_initoptimize
(
    nvlink_train_internode_links_post_initoptimize *postinitoptimizeParams
)
{
    nvlink_link *endpoint = NULL;
    NvlStatus status = NVL_SUCCESS;
    NvU32 numLinks = 0;
    NvU32 i = 0;

    // Scratch list of resolved link objects; sized for the system maximum
    nvlink_link **links = (nvlink_link **)nvlink_malloc(
        sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
    if (links == NULL)
    {
        return NVL_NO_MEM;
    }

    // Reject over-sized endpoint counts before taking any locks
    if (postinitoptimizeParams->endPointCount > NVLINK_MAX_NVLINK_ENDPOINTS)
    {
        nvlink_free((void *)links);
        return NVL_BAD_ARGS;
    }

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        nvlink_free((void *)links);
        return status;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the device and
    // link lists and connections list
    //

    for (i = 0; i < postinitoptimizeParams->endPointCount; i++)
    {
        endpoint = NULL;
        nvlink_core_get_link_by_endpoint(&postinitoptimizeParams->endPoints[i], &endpoint);

        // we can't send POST_INITOPTIMIZE if the endpoint is not found
        if (endpoint == NULL)
        {
            //
            // Couldn't find the endpoint registered in the core library. Release
            // the top-level lock and return
            //
            nvlink_lib_top_lock_release();

            nvlink_free((void *)links);
            return NVL_BAD_ARGS;
        }
        else if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
        {
            // Defensive bound check; should be unreachable given the count check above
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
                __FUNCTION__));

            nvlink_assert(0);

            // Release the top-level lock and free links
            nvlink_lib_top_lock_release();
            nvlink_free((void *)links);
            return NVL_ERR_INVALID_STATE;
        }

        links[numLinks] = endpoint;
        numLinks++;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    for (i = 0; i < numLinks; i++)
    {
        // POST_INITOPTIMIZE is not supported before NVLink 3.0
        if (links[i]->version < NVLINK_DEVICE_VERSION_30)
            continue;

        // Continue if the link is already active, nothing to do
        if ((nvlink_core_check_link_state(links[i], NVLINK_LINKSTATE_HS)) &&
            (nvlink_core_check_tx_sublink_state(links[i], NVLINK_SUBLINK_STATE_TX_HS)) &&
            (nvlink_core_check_rx_sublink_state(links[i], NVLINK_SUBLINK_STATE_RX_HS)))
        {
            continue;
        }

        status = links[i]->link_handlers->set_dl_link_mode(links[i],
                                                           NVLINK_LINKSTATE_POST_INITOPTIMIZE,
                                                           NVLINK_STATE_CHANGE_ASYNC);

        // Although it failed we need to continue on other links.
        if (status != NVL_SUCCESS)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: POST_INITOPTIMIZE failed on Device:Link %s:%s\n",
                __FUNCTION__, links[i]->dev->deviceName, links[i]->linkName));
        }
    }

    // Release the per-link locks
    nvlink_lib_link_locks_release(links, numLinks);

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }
    // Best-effort: per-link failures are logged above, not propagated
    return NVL_SUCCESS;
}
3268
3269 /**
3270 * Train the internode connection links to the target state
3271 *
3272 * @param[in] linkParams IOCTL params
3273 *
3274 * return NvlStatus
3275 */
3276 static NvlStatus
nvlink_lib_ctrl_train_internode_conns_parallel(nvlink_train_internode_conns_parallel * linkParams)3277 nvlink_lib_ctrl_train_internode_conns_parallel
3278 (
3279 nvlink_train_internode_conns_parallel *linkParams
3280 )
3281 {
3282 nvlink_link *localLink = NULL;
3283 NvlStatus status = NVL_SUCCESS;
3284 NvU32 numLinks = 0;
3285 NvU32 i = 0;
3286 nvlink_link **links = NULL;
3287 nvlink_internode_conn **interConns = NULL;
3288
3289 links = (nvlink_link **)nvlink_malloc(
3290 sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
3291 if (links == NULL)
3292 {
3293 status = NVL_NO_MEM;
3294 goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
3295 }
3296
3297 interConns = (nvlink_internode_conn **)nvlink_malloc(
3298 sizeof(nvlink_internode_conn *) * NVLINK_MAX_SYSTEM_LINK_NUM);
3299 if (interConns == NULL)
3300 {
3301 status = NVL_NO_MEM;
3302 goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
3303 }
3304
3305 if (linkParams->localEndPointCount > NVLINK_MAX_PARALLEL_CONNS_TRAIN_COUNT)
3306 {
3307 status = NVL_BAD_ARGS;
3308 goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
3309 }
3310
3311 // Acquire the top-level lock
3312 status = nvlink_lib_top_lock_acquire();
3313 if (status != NVL_SUCCESS)
3314 {
3315 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3316 "%s: Failed to acquire top-level lock\n",
3317 __FUNCTION__));
3318
3319 goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
3320 }
3321
3322 //
3323 // Top-level lock is now acquired. Proceed to traversing the device and
3324 // link lists and connections list
3325 //
3326 for (i = 0; i < linkParams->localEndPointCount; i++)
3327 {
3328 //
3329 // look-up the nvlink link objects. Look-up will fail if there is a
3330 // fabric node id mismatch. So an explicit check against self
3331 // node id is not required.
3332 //
3333 nvlink_core_get_link_by_endpoint(&linkParams->localEndPoints[i], &localLink);
3334
3335 // user specified link is not available
3336 if (localLink == NULL)
3337 {
3338 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3339 "%s: Internode connection link train request with no matching local endpoint\n",
3340 __FUNCTION__));
3341
3342 //
3343 // Couldn't find the endpoint registered in the core library. Release
3344 // the top-level lock and return
3345 //
3346 nvlink_lib_top_lock_release();
3347
3348 status = NVL_BAD_ARGS;
3349 goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
3350 }
3351
3352 nvlink_core_get_internode_conn(localLink, &(interConns[i]));
3353 if (interConns[i] == NULL)
3354 {
3355 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3356 "%s: No Internode connection exists for local endpoint %s: %s.\n",
3357 __FUNCTION__, localLink->dev->deviceName, localLink->linkName));
3358
3359 //
3360 // Couldn't find an associated connection for the endpoint. Release
3361 // the top-level lock and return
3362 //
3363 nvlink_lib_top_lock_release();
3364
3365 status = NVL_BAD_ARGS;
3366 goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
3367 }
3368
3369 links[numLinks] = localLink;
3370 numLinks++;
3371 }
3372
3373 // Acquire the per-link lock
3374 status = nvlink_lib_link_locks_acquire(links, numLinks);
3375 if (status != NVL_SUCCESS)
3376 {
3377 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3378 "%s: Failed to acquire per-link locks\n",
3379 __FUNCTION__));
3380
3381 // Release the top-level lock
3382 nvlink_lib_top_lock_release();
3383
3384 goto nvlink_lib_ctrl_train_internode_conns_parallel_end;
3385 }
3386
3387 //
3388 // All the required per-link locks are now successfully acquired
3389 // Release the top level-lock
3390 //
3391 nvlink_lib_top_lock_release();
3392
3393 switch (linkParams->trainTo)
3394 {
3395 case nvlink_train_link_off_to_swcfg:
3396 {
3397 // OFF to SAFE is part of initialization sequence as of now.
3398 status = NVL_BAD_ARGS;
3399 break;
3400 }
3401 case nvlink_train_link_swcfg_to_active:
3402 {
3403 status = nvlink_core_train_internode_conns_from_swcfg_to_active(
3404 interConns, numLinks, linkParams->isMasterEnd, NVLINK_STATE_CHANGE_SYNC);
3405 break;
3406 }
3407 case nvlink_train_link_to_off:
3408 {
3409 // OFF state transitions are not supported/tested
3410 status = NVL_BAD_ARGS;
3411 break;
3412 }
3413 case nvlink_train_link_active_to_swcfg:
3414 {
3415 // not implemented/supported now
3416 status = NVL_BAD_ARGS;
3417 break;
3418 }
3419 case nvlink_train_link_swcfg_to_off:
3420 {
3421 // OFF state transitions are not supported/tested
3422 status = NVL_BAD_ARGS;
3423 break;
3424 }
3425 default:
3426 {
3427 status = NVL_BAD_ARGS;
3428 break;
3429 }
3430 }
3431
3432 for (i = 0; i < numLinks; i++)
3433 {
3434
3435 //
3436 // always get the latest link state values so that
3437 // user has additional information other than just the return value.
3438 //
3439 nvlink_core_get_endpoint_state(links[i], &linkParams->localEndStates[i]);
3440 }
3441
3442 // Release the per-link lock
3443 nvlink_lib_link_locks_release(links, numLinks);
3444
3445 nvlink_lib_ctrl_train_internode_conns_parallel_end:
3446
3447 if (links != NULL)
3448 {
3449 nvlink_free((void *)links);
3450 }
3451 if (interConns != NULL)
3452 {
3453 nvlink_free((void *)interConns);
3454 }
3455
3456 return status;
3457 }
3458
3459 /**
3460 * Get the device information for all registered devices
3461 *
3462 * @param[in] infoParams IOCTL params
3463 *
3464 * return NvlStatus
3465 */
3466 static NvlStatus
nvlink_lib_ctrl_get_devices_info(nvlink_get_devices_info * infoParams)3467 nvlink_lib_ctrl_get_devices_info
3468 (
3469 nvlink_get_devices_info *infoParams
3470 )
3471 {
3472 nvlink_device *dev = NULL;
3473 NvlStatus status = NVL_SUCCESS;
3474 NvU32 numDevices = 0;
3475
3476 // Initialize number of devices to 0
3477 infoParams->numDevice = 0;
3478
3479 // Acquire the top-level lock
3480 status = nvlink_lib_top_lock_acquire();
3481 if (status != NVL_SUCCESS)
3482 {
3483 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3484 "%s: Failed to acquire top-level lock\n",
3485 __FUNCTION__));
3486
3487 return status;
3488 }
3489
3490 //
3491 // Top-level lock is now acquired. Proceed to traversing the device and
3492 // link lists and connections list
3493 //
3494
3495 FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
3496 {
3497 // total number of devices should be within NVLINK_DEVICE_INSTANCE_MAX
3498 if (numDevices >= NVLINK_DEVICE_INSTANCE_MAX)
3499 {
3500 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3501 "%s: numDevices >= NVLINK_DEVICE_INSTANCE_MAX",
3502 __FUNCTION__));
3503
3504 nvlink_assert(0);
3505 nvlink_lib_top_lock_release();
3506 return NVL_ERR_INVALID_STATE;
3507 }
3508
3509 // copy device information
3510 nvlink_core_copy_device_info(dev, &infoParams->devInfo[numDevices]);
3511 numDevices++;
3512 }
3513
3514 infoParams->numDevice = numDevices;
3515
3516 // Release the top-level lock
3517 nvlink_lib_top_lock_release();
3518
3519 return status;
3520 }
3521
3522 static NvlStatus
nvlink_lib_ctrl_acquire_capability(nvlink_ioctrl_params * ctrlParams,nvlink_acquire_capability * capParams)3523 nvlink_lib_ctrl_acquire_capability
3524 (
3525 nvlink_ioctrl_params *ctrlParams,
3526 nvlink_acquire_capability *capParams
3527 )
3528 {
3529 NvlStatus status;
3530
3531 if (capParams == NULL)
3532 {
3533 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3534 "%s: Bad ioctl capability ctrl params specified.\n",
3535 __FUNCTION__));
3536 return NVL_BAD_ARGS;
3537 }
3538
3539 status = nvlink_acquire_fabric_mgmt_cap(ctrlParams->osPrivate,
3540 capParams->capDescriptor);
3541 if (status != NVL_SUCCESS)
3542 {
3543 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3544 "%s: Failed to acquire fabric mgmt capability.\n",
3545 __FUNCTION__));
3546 return status;
3547 }
3548
3549 return NVL_SUCCESS;
3550 }
3551
nvlink_lib_ctrl_get_link_state(nvlink_get_link_state * linkParams)3552 static NvlStatus nvlink_lib_ctrl_get_link_state
3553 (
3554 nvlink_get_link_state *linkParams
3555 )
3556 {
3557 nvlink_link *endpoint = NULL;
3558 NvlStatus status = NVL_SUCCESS;
3559 NvU32 numLinks = 0;
3560 NvU32 i = 0;
3561
3562 ct_assert(NVLINK_MAX_SYSTEM_LINK_NUM == NVLINK_MAX_NVLINK_ENDPOINTS);
3563
3564 nvlink_link **links = (nvlink_link **)nvlink_malloc(
3565 sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
3566 if (links == NULL)
3567 {
3568 return NVL_NO_MEM;
3569 }
3570
3571 if (linkParams->endPointCount > NVLINK_MAX_NVLINK_ENDPOINTS)
3572 {
3573 nvlink_free((void *)links);
3574 return NVL_BAD_ARGS;
3575 }
3576
3577 // Acquire the top-level lock
3578 status = nvlink_lib_top_lock_acquire();
3579 if (status != NVL_SUCCESS)
3580 {
3581 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3582 "%s: Failed to acquire top-level lock\n",
3583 __FUNCTION__));
3584
3585 nvlink_free((void *)links);
3586 return status;
3587 }
3588
3589 //
3590 // Top-level lock is now acquired. Proceed to traversing the device and
3591 // link lists and connections list
3592 //
3593
3594 for (i = 0; i < linkParams->endPointCount; i++)
3595 {
3596 endpoint = NULL;
3597 nvlink_core_get_link_by_endpoint(&linkParams->endPoints[i], &endpoint);
3598
3599 // we can't send this command if the endpoint is not found
3600 if (endpoint == NULL)
3601 {
3602 //
3603 // Couldn't find the endpoint registered in the core library. Release
3604 // the top-level lock and return
3605 //
3606 nvlink_lib_top_lock_release();
3607
3608 nvlink_free((void *)links);
3609 return NVL_BAD_ARGS;
3610 }
3611 else if (numLinks >= NVLINK_MAX_NVLINK_ENDPOINTS)
3612 {
3613 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3614 "%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
3615 __FUNCTION__));
3616
3617 nvlink_assert(0);
3618
3619 // Release the top-level lock and free links
3620 nvlink_lib_top_lock_release();
3621 nvlink_free((void *)links);
3622 return NVL_ERR_INVALID_STATE;
3623 }
3624
3625 links[numLinks] = endpoint;
3626 numLinks++;
3627 }
3628
3629 // Acquire the per-link locks
3630 status = nvlink_lib_link_locks_acquire(links, numLinks);
3631 if (status != NVL_SUCCESS)
3632 {
3633 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3634 "%s: Failed to acquire per-link locks\n",
3635 __FUNCTION__));
3636
3637 // Release the top-level lock
3638 nvlink_lib_top_lock_release();
3639
3640 nvlink_free((void *)links);
3641 return status;
3642 }
3643
3644 //
3645 // All the required per-link locks are now successfully acquired
3646 // Release the top level-lock
3647 //
3648 nvlink_lib_top_lock_release();
3649
3650 for (i = 0; i < numLinks; i++)
3651 {
3652 // Wait for the link state to change.
3653 status = nvlink_core_poll_link_state(links[i],
3654 NVLINK_LINKSTATE_HS,
3655 NVLINK_TRANSITION_POST_HS_TIMEOUT);
3656 if (status != NVL_SUCCESS)
3657 {
3658 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
3659 "%s: Unable to set link state to ACTIVE for link"
3660 " %s:%s \n",
3661 __FUNCTION__,
3662 links[i]->dev->deviceName, links[i]->linkName));
3663 }
3664 else
3665 {
3666 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_SETUP,
3667 "%s: Successfully able to set link state to ACTIVE for link"
3668 " %s:%s \n",
3669 __FUNCTION__,
3670 links[i]->dev->deviceName, links[i]->linkName));
3671 }
3672
3673 nvlink_core_get_endpoint_state(links[i], &linkParams->endState[i]);
3674 }
3675
3676 // Release the per-link locks
3677 nvlink_lib_link_locks_release(links, numLinks);
3678
3679 if (links != NULL)
3680 {
3681 nvlink_free((void *)links);
3682 }
3683
3684 return NVL_SUCCESS;
3685 }
3686
/**
 * Get the endpoint states of all registered links of a given device
 *
 * Results are written to params->endStates[] indexed by hardware link
 * number (not packed), so slots for unregistered link numbers remain
 * zero-initialized.
 *
 * @param[in,out]  params  IOCTL params naming the device; per-link endpoint
 *                         states and a capture timestamp are written back
 *
 * return NvlStatus
 */
static NvlStatus
nvlink_lib_ctrl_get_device_link_states
(
    nvlink_get_device_link_states *params
)
{
    nvlink_link *endpoint = NULL;
    nvlink_device *dev = NULL;
    NvlStatus status = NVL_SUCCESS;
    NvU32 numLinks = 0;
    NvU32 i = 0;
    NvU8 linkNumber;

    // Scratch array of link pointers, sized for the per-device maximum
    nvlink_link **links = (nvlink_link **)nvlink_malloc(
        sizeof(nvlink_link *) * NVLINK_MAX_DEVICE_CONN);

    // Get current monotonic time in seconds.nanoseconds
    params->time = nvlink_get_platform_time();

    if (links == NULL)
    {
        return NVL_NO_MEM;
    }

    //
    // Zero the whole output array up front: endStates[] is indexed by link
    // number below, so slots for unregistered links stay zeroed
    //
    nvlink_memset(params->endStates, 0x0, sizeof(params->endStates));

    // Acquire the top-level lock
    status = nvlink_lib_top_lock_acquire();
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        nvlink_free((void *)links);
        return status;
    }

    // look-up user requested nvlink device object
    nvlink_core_get_device_by_devinfo(&params->devInfo, &dev);
    if (dev == NULL)
    {
        //
        // Couldn't find the device ptr in the core library. Release the
        // top-level lock and return
        //
        nvlink_lib_top_lock_release();

        nvlink_free((void *)links);
        return NVL_BAD_ARGS;
    }

    //
    // Top-level lock is now acquired. Proceed to traversing the list
    // of devices and list of links to lock all links
    //
    FOR_EACH_LINK_REGISTERED(endpoint, dev, node)
    {
        // Guard the fixed-size links[] scratch array
        if (numLinks >= NVLINK_MAX_DEVICE_CONN)
        {
            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
                "%s: numLinks >= NVLINK_MAX_DEVICE_CONN",
                __FUNCTION__));

            nvlink_assert(0);

            // Release the top-level lock and free links
            nvlink_lib_top_lock_release();
            nvlink_free((void *)links);
            return NVL_ERR_INVALID_STATE;
        }
        links[numLinks] = endpoint;
        numLinks++;
    }

    // Acquire the per-link locks
    status = nvlink_lib_link_locks_acquire(links, numLinks);
    if (status != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire per-link locks\n",
            __FUNCTION__));

        // Release the top-level lock
        nvlink_lib_top_lock_release();
        nvlink_free((void *)links);
        return status;
    }

    //
    // All the required per-link locks are now successfully acquired
    // Release the top level-lock
    //
    nvlink_lib_top_lock_release();

    // NOTE(review): a registered device with zero links would trip this
    // assert yet continue; confirm such devices cannot reach this ioctl
    nvlink_assert((links != NULL) && (numLinks > 0));

    for (i = 0; i < numLinks; ++i)
    {
        // Output slot is selected by hardware link number, not loop index
        linkNumber = links[i]->linkNumber;

        nvlink_assert(linkNumber < NVLINK_MAX_DEVICE_CONN);

        // Get the endpoint states of the link
        nvlink_core_get_endpoint_state(links[i], &(params->endStates[linkNumber]));

        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_INFO,
            "%s: link 0x%x -- linkMode 0x%x,\n",
            __FUNCTION__, linkNumber, params->endStates[linkNumber].linkMode));
    }

    // This is done to preserve client behavior that uses endStatesCount to iterate across endStates array
    params->endStatesCount = NVLINK_MAX_DEVICE_CONN;

    // Release the per-link locks
    nvlink_lib_link_locks_release(links, numLinks);

    if (links != NULL)
    {
        nvlink_free((void *)links);
    }

    return status;
}
3811