1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 2019-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 * SPDX-License-Identifier: MIT
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "nvlink.h"
25 #include "nvlink_export.h"
26 #include "nvlink_os.h"
27 #include "../nvlink_ctx.h"
28 #include "../nvlink_helper.h"
29 #include "nvlink_lock.h"
30
31 /**
32 * TODO: Rework this function to acquire locks and update callers
33 *
34 * Check if the device has no links registered
35 *
36 * @param[in] dev NVLink Device pointer
37 *
38 * return NV_TRUE if the device has no links registered
39 */
40 NvBool
nvlink_lib_is_link_list_empty(nvlink_device * dev)41 nvlink_lib_is_link_list_empty
42 (
43 nvlink_device *dev
44 )
45 {
46 NvBool isEmpty = NV_TRUE;
47
48 isEmpty = nvListIsEmpty(&dev->link_list);
49
50 return isEmpty;
51 }
52
53 /**
54 * Get the link associated with the given link id.
55 *
56 * @param[in] device NVLink Device Pointer
57 * @param[in] link_id Link Id of the given link
58 * @param[out] link NVLink Link pointer
59 *
60 * return NVL_SUCCESS on success
61 */
62 NvlStatus
nvlink_lib_get_link(nvlink_device * device,NvU32 link_id,nvlink_link ** link)63 nvlink_lib_get_link
64 (
65 nvlink_device *device,
66 NvU32 link_id,
67 nvlink_link **link
68 )
69 {
70 nvlink_link *cur = NULL;
71 NvlStatus status = -NVL_NOT_FOUND;
72
73 if (device == NULL || link == NULL)
74 {
75 return -NVL_BAD_ARGS;
76 }
77
78 *link = NULL;
79
80 // Acquire the top-level lock
81 status = nvlink_lib_top_lock_acquire();
82 if (status != NVL_SUCCESS)
83 {
84 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
85 "%s: Failed to acquire top-level lock\n",
86 __FUNCTION__));
87
88 return status;
89 }
90
91 //
92 // Top-level lock is now acquired. Proceed to traversing the
93 // link list for the device
94 //
95
96 // Reset status to -NVL_NOT_FOUND
97 status = -NVL_NOT_FOUND;
98
99 FOR_EACH_LINK_REGISTERED(cur, device, node)
100 {
101 if (cur->linkNumber == link_id)
102 {
103 *link = cur;
104 status = NVL_SUCCESS;
105 break;
106 }
107 }
108
109 // Release the top level-lock
110 nvlink_lib_top_lock_release();
111
112 return status;
113 }
114
115 /**
116 * Set the given link as the link master.
117 * This requires that the remote end of the link is known, and that it
118 * hasn't set itself to be the master.
119 *
120 * Note: This function is used by RM to set master attribute to a link
121 * in order to handle GPU lock inversion problem while servicing
122 * link interrupts(re-training). With external fabric management
123 * enabled, we don't have the issue. Also we don't have to worry
124 * about the inter-node connections which are managed by FM.
125 *
126 * @param[in] link NVLink Link pointer
127 *
128 * return NVL_SUCCESS if the master was set
129 */
130 NvlStatus
nvlink_lib_set_link_master(nvlink_link * link)131 nvlink_lib_set_link_master
132 (
133 nvlink_link *link
134 )
135 {
136 nvlink_link *remote_end = NULL;
137 NvlStatus status = NVL_SUCCESS;
138 nvlink_intranode_conn *conn = NULL;
139 nvlink_link *links[2] = {0};
140 NvU32 numLinks = 0;
141
142 if (link == NULL)
143 {
144 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
145 "%s: Bad link pointer specified.\n",
146 __FUNCTION__));
147 return NVL_ERR_GENERIC;
148 }
149
150 // Acquire the top-level lock
151 status = nvlink_lib_top_lock_acquire();
152 if (status != NVL_SUCCESS)
153 {
154 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
155 "%s: Failed to acquire top-level lock\n",
156 __FUNCTION__));
157
158 return status;
159 }
160
161 //
162 // Top-level lock is now acquired. Proceed to traversing the
163 // connection list
164 //
165
166 links[numLinks] = link;
167 numLinks++;
168
169 nvlink_core_get_intranode_conn(link, &conn);
170 if (conn != NULL)
171 {
172 remote_end = (conn->end0 == link ? conn->end1 : conn->end0);
173 links[numLinks] = remote_end;
174 numLinks++;
175 }
176
177 // Acquire the per-link locks for all links captured
178 status = nvlink_lib_link_locks_acquire(links, numLinks);
179 if (status != NVL_SUCCESS)
180 {
181 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
182 "%s: Failed to acquire per-link locks\n",
183 __FUNCTION__));
184
185 // Release the top-level lock
186 nvlink_lib_top_lock_release();
187
188 return status;
189 }
190
191 //
192 // All the required per-link locks are successfully acquired
193 // The connection list traversal is also complete now
194 // Release the top level-lock
195 //
196 nvlink_lib_top_lock_release();
197
198 // Early return if we've already done this
199 if (link->master)
200 {
201 status = NVL_SUCCESS;
202 }
203 else
204 {
205 // Make sure the remote end exists and hasn't claimed the master yet
206 if (remote_end == NULL || remote_end->master)
207 {
208 status = NVL_ERR_INVALID_STATE;
209 }
210 else
211 {
212 link->master = NV_TRUE;
213 }
214 }
215
216 // Release the per-link locks
217 nvlink_lib_link_locks_release(links, numLinks);
218
219 return status;
220 }
221
222 /**
223 * Get the link master associated with the given link.
224 * This may be the given link, or it may be the remote end. In the case
225 * when no master is assigned or the remote end is not known, this will
226 * return an error.
227 *
228 * @param[in] link NVLink Link pointer
229 * @param[out] master Master endpoint for the link
230 *
231 * return NVL_SUCCESS if the master was found
232 */
233 NvlStatus
nvlink_lib_get_link_master(nvlink_link * link,nvlink_link ** master)234 nvlink_lib_get_link_master
235 (
236 nvlink_link *link,
237 nvlink_link **master
238 )
239 {
240 nvlink_link *remote_end = NULL;
241 nvlink_intranode_conn *conn = NULL;
242 NvlStatus status = NVL_SUCCESS;
243 nvlink_link *links[2] = {0};
244 NvU32 numLinks = 0;
245
246 if (link == NULL || master == NULL)
247 {
248 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
249 "%s: Bad link pointer specified.\n",
250 __FUNCTION__));
251 return NVL_ERR_GENERIC;
252 }
253
254 // Acquire the top-level lock
255 status = nvlink_lib_top_lock_acquire();
256 if (status != NVL_SUCCESS)
257 {
258 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
259 "%s: Failed to acquire top-level lock\n",
260 __FUNCTION__));
261
262 return status;
263 }
264
265 //
266 // Top-level lock is now acquired. Proceed to traversing the
267 // connection list
268 //
269
270 links[numLinks] = link;
271 numLinks++;
272
273 nvlink_core_get_intranode_conn(link, &conn);
274 if (conn != NULL)
275 {
276 remote_end = (conn->end0 == link ? conn->end1 : conn->end0);
277 links[numLinks] = remote_end;
278 numLinks++;
279 }
280
281 // Acquire the per-link locks for all links captured
282 status = nvlink_lib_link_locks_acquire(links, numLinks);
283 if (status != NVL_SUCCESS)
284 {
285 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
286 "%s: Failed to acquire per-link locks\n",
287 __FUNCTION__));
288
289 // Release the top-level lock
290 nvlink_lib_top_lock_release();
291
292 return status;
293 }
294
295 //
296 // All the required per-link locks are successfully acquired
297 // The connection list traversal is also complete now
298 // Release the top level-lock
299 //
300 nvlink_lib_top_lock_release();
301
302 if (link->master)
303 {
304 *master = link;
305 }
306 else
307 {
308 // Make sure the remote end exists and hasn't claimed the master yet
309 if (remote_end == NULL)
310 {
311 status = NVL_ERR_INVALID_STATE;
312 }
313
314 *master = remote_end;
315 }
316
317 // Release the per-link locks
318 nvlink_lib_link_locks_release(links, numLinks);
319
320 return status;
321 }
322
323 /**
324 * Set whether the link is using ALI for training.
325 *
326 * @param[in] link NVLink Link pointer
327 * @param[in] enableALI Boolean for whether the link is using
328 * ALI to train the link
329 *
330 * return NvlSuccess if setting the variable was successful.
331 */
332 NvlStatus
nvlink_lib_link_set_training_mode(nvlink_link * link,NvBool enableALI)333 nvlink_lib_link_set_training_mode
334 (
335 nvlink_link *link,
336 NvBool enableALI
337 )
338 {
339 NvlStatus status = NVL_SUCCESS;
340 if (link == NULL)
341 {
342 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
343 "%s: Bad link pointer specified.\n",
344 __FUNCTION__));
345 return NVL_ERR_GENERIC;
346 }
347
348 // Acquire the top-level lock
349 status = nvlink_lib_top_lock_acquire();
350 if (status != NVL_SUCCESS)
351 {
352 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
353 "%s: Failed to acquire top-level lock\n",
354 __FUNCTION__));
355
356 return status;
357 }
358
359
360 // Acquire the per-link lock
361 status = nvlink_lib_link_locks_acquire(&link, 1);
362 if (status != NVL_SUCCESS)
363 {
364 NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
365 "%s: Failed to acquire per-link locks\n",
366 __FUNCTION__));
367
368 // Release the top-level lock
369 nvlink_lib_top_lock_release();
370
371 return status;
372 }
373
374 //
375 // All the required per-link locks are successfully acquired
376 // The connection list traversal is also complete now
377 // Release the top level-lock
378 //
379 nvlink_lib_top_lock_release();
380
381 // TODO: Add Setter for per-link enableALI state variable
382
383 // Release the per-link lock
384 nvlink_lib_link_locks_release(&link, 1);
385
386 return status;
387 }
388