1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "nvlink.h"
25 #include "nvlink_export.h"
26 #include "nvlink_os.h"
27 #include "../nvlink_ctx.h"
28 #include "../nvlink_helper.h"
29 #include "nvlink_lock.h"
30 
31 /**
32  * TODO: Rework this function to acquire locks and update callers
33  *
34  * Check if the device has no links registered
35  *
36  * @param[in]  dev  NVLink Device pointer
37  *
38  * return NV_TRUE if the device has no links registered
39  */
40 NvBool
nvlink_lib_is_link_list_empty(nvlink_device * dev)41 nvlink_lib_is_link_list_empty
42 (
43     nvlink_device *dev
44 )
45 {
46     NvBool isEmpty = NV_TRUE;
47 
48     isEmpty = nvListIsEmpty(&dev->link_list);
49 
50     return isEmpty;
51 }
52 
53 /**
54  * Get the link associated with the given link id.
55  *
56  * @param[in]   device   NVLink Device Pointer
57  * @param[in]   link_id  Link Id of the given link
58  * @param[out]  link     NVLink Link pointer
59  *
60  * return NVL_SUCCESS on success
61  */
62 NvlStatus
nvlink_lib_get_link(nvlink_device * device,NvU32 link_id,nvlink_link ** link)63 nvlink_lib_get_link
64 (
65     nvlink_device  *device,
66     NvU32           link_id,
67     nvlink_link   **link
68 )
69 {
70     nvlink_link *cur    = NULL;
71     NvlStatus    status = -NVL_NOT_FOUND;
72 
73     if (device == NULL || link == NULL)
74     {
75         return -NVL_BAD_ARGS;
76     }
77 
78     *link = NULL;
79 
80     // Acquire the top-level lock
81     status = nvlink_lib_top_lock_acquire();
82     if (status != NVL_SUCCESS)
83     {
84         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
85             "%s: Failed to acquire top-level lock\n",
86             __FUNCTION__));
87 
88         return status;
89     }
90 
91     //
92     // Top-level lock is now acquired. Proceed to traversing the
93     // link list for the device
94     //
95 
96     // Reset status to -NVL_NOT_FOUND
97     status = -NVL_NOT_FOUND;
98 
99     FOR_EACH_LINK_REGISTERED(cur, device, node)
100     {
101         if (cur->linkNumber == link_id)
102         {
103             *link  = cur;
104             status = NVL_SUCCESS;
105             break;
106         }
107     }
108 
109     // Release the top level-lock
110     nvlink_lib_top_lock_release();
111 
112     return status;
113 }
114 
115 /**
116  * Set the given link as the link master.
117  *   This requires that the remote end of the link is known, and that it
118  *   hasn't set itself to be the master.
119  *
120  *   Note: This function is used by RM to set master attribute to a link
121  *         in order to handle GPU lock inversion problem while servicing
122  *         link interrupts(re-training). With external fabric management
123  *         enabled, we don't have the issue. Also we don't have to worry
124  *         about the inter-node connections which are managed by FM.
125  *
126  * @param[in]  link  NVLink Link pointer
127  *
128  * return NVL_SUCCESS if the master was set
129  */
130 NvlStatus
nvlink_lib_set_link_master(nvlink_link * link)131 nvlink_lib_set_link_master
132 (
133     nvlink_link *link
134 )
135 {
136     nvlink_link           *remote_end = NULL;
137     NvlStatus              status     = NVL_SUCCESS;
138     nvlink_intranode_conn *conn       = NULL;
139     nvlink_link           *links[2]   = {0};
140     NvU32                  numLinks   = 0;
141 
142     if (link == NULL)
143     {
144         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
145             "%s: Bad link pointer specified.\n",
146             __FUNCTION__));
147         return NVL_ERR_GENERIC;
148     }
149 
150     // Acquire the top-level lock
151     status = nvlink_lib_top_lock_acquire();
152     if (status != NVL_SUCCESS)
153     {
154         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
155             "%s: Failed to acquire top-level lock\n",
156             __FUNCTION__));
157 
158         return status;
159     }
160 
161     //
162     // Top-level lock is now acquired. Proceed to traversing the
163     // connection list
164     //
165 
166     links[numLinks] = link;
167     numLinks++;
168 
169     nvlink_core_get_intranode_conn(link, &conn);
170     if (conn != NULL)
171     {
172         remote_end      = (conn->end0 == link ? conn->end1 : conn->end0);
173         links[numLinks] = remote_end;
174         numLinks++;
175     }
176 
177     // Acquire the per-link locks for all links captured
178     status = nvlink_lib_link_locks_acquire(links, numLinks);
179     if (status != NVL_SUCCESS)
180     {
181         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
182             "%s: Failed to acquire per-link locks\n",
183             __FUNCTION__));
184 
185         // Release the top-level lock
186         nvlink_lib_top_lock_release();
187 
188         return status;
189     }
190 
191     //
192     // All the required per-link locks are successfully acquired
193     // The connection list traversal is also complete now
194     // Release the top level-lock
195     //
196     nvlink_lib_top_lock_release();
197 
198     // Early return if we've already done this
199     if (link->master)
200     {
201         status = NVL_SUCCESS;
202     }
203     else
204     {
205         // Make sure the remote end exists and hasn't claimed the master yet
206         if (remote_end == NULL || remote_end->master)
207         {
208             status = NVL_ERR_INVALID_STATE;
209         }
210         else
211         {
212             link->master = NV_TRUE;
213         }
214     }
215 
216     // Release the per-link locks
217     nvlink_lib_link_locks_release(links, numLinks);
218 
219     return status;
220 }
221 
222 /**
223  * Get the link master associated with the given link.
224  *   This may be the given link, or it may be the remote end. In the case
225  *   when no master is assigned or the remote end is not known, this will
226  *   return an error.
227  *
228  * @param[in]  link    NVLink Link pointer
229  * @param[out] master  Master endpoint for the link
230  *
231  * return NVL_SUCCESS if the master was found
232  */
233 NvlStatus
nvlink_lib_get_link_master(nvlink_link * link,nvlink_link ** master)234 nvlink_lib_get_link_master
235 (
236     nvlink_link  *link,
237     nvlink_link **master
238 )
239 {
240     nvlink_link           *remote_end = NULL;
241     nvlink_intranode_conn *conn       = NULL;
242     NvlStatus              status     = NVL_SUCCESS;
243     nvlink_link           *links[2]   = {0};
244     NvU32                  numLinks   = 0;
245 
246     if (link == NULL || master == NULL)
247     {
248         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
249             "%s: Bad link pointer specified.\n",
250             __FUNCTION__));
251         return NVL_ERR_GENERIC;
252     }
253 
254     // Acquire the top-level lock
255     status = nvlink_lib_top_lock_acquire();
256     if (status != NVL_SUCCESS)
257     {
258         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
259             "%s: Failed to acquire top-level lock\n",
260             __FUNCTION__));
261 
262         return status;
263     }
264 
265     //
266     // Top-level lock is now acquired. Proceed to traversing the
267     // connection list
268     //
269 
270     links[numLinks] = link;
271     numLinks++;
272 
273     nvlink_core_get_intranode_conn(link, &conn);
274     if (conn != NULL)
275     {
276         remote_end      = (conn->end0 == link ? conn->end1 : conn->end0);
277         links[numLinks] = remote_end;
278         numLinks++;
279     }
280 
281     // Acquire the per-link locks for all links captured
282     status = nvlink_lib_link_locks_acquire(links, numLinks);
283     if (status != NVL_SUCCESS)
284     {
285         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
286             "%s: Failed to acquire per-link locks\n",
287             __FUNCTION__));
288 
289         // Release the top-level lock
290         nvlink_lib_top_lock_release();
291 
292         return status;
293     }
294 
295     //
296     // All the required per-link locks are successfully acquired
297     // The connection list traversal is also complete now
298     // Release the top level-lock
299     //
300     nvlink_lib_top_lock_release();
301 
302     if (link->master)
303     {
304         *master = link;
305     }
306     else
307     {
308         // Make sure the remote end exists and hasn't claimed the master yet
309         if (remote_end == NULL)
310         {
311             status = NVL_ERR_INVALID_STATE;
312         }
313 
314         *master = remote_end;
315     }
316 
317     // Release the per-link locks
318     nvlink_lib_link_locks_release(links, numLinks);
319 
320     return status;
321 }
322 
323 /**
324  * Set whether the link is using ALI for training.
325  *
326  * @param[in]  link       NVLink Link pointer
327  * @param[in]  enableALI  Boolean for whether the link is using
328  *                        ALI to train the link
329  *
330  * return NvlSuccess if setting the variable was successful.
331  */
332 NvlStatus
nvlink_lib_link_set_training_mode(nvlink_link * link,NvBool enableALI)333 nvlink_lib_link_set_training_mode
334 (
335     nvlink_link  *link,
336     NvBool        enableALI
337 )
338 {
339     NvlStatus status = NVL_SUCCESS;
340     if (link == NULL)
341     {
342         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
343             "%s: Bad link pointer specified.\n",
344             __FUNCTION__));
345         return NVL_ERR_GENERIC;
346     }
347 
348     // Acquire the top-level lock
349     status = nvlink_lib_top_lock_acquire();
350     if (status != NVL_SUCCESS)
351     {
352         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
353             "%s: Failed to acquire top-level lock\n",
354             __FUNCTION__));
355 
356         return status;
357     }
358 
359 
360     // Acquire the per-link lock
361     status = nvlink_lib_link_locks_acquire(&link, 1);
362     if (status != NVL_SUCCESS)
363     {
364         NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
365             "%s: Failed to acquire per-link locks\n",
366             __FUNCTION__));
367 
368         // Release the top-level lock
369         nvlink_lib_top_lock_release();
370 
371         return status;
372     }
373 
374     //
375     // All the required per-link locks are successfully acquired
376     // The connection list traversal is also complete now
377     // Release the top level-lock
378     //
379     nvlink_lib_top_lock_release();
380 
381     // TODO: Add Setter for per-link enableALI state variable
382 
383     // Release the per-link lock
384     nvlink_lib_link_locks_release(&link, 1);
385 
386     return status;
387 }
388