1 /*******************************************************************************
2     Copyright (c) 2013-2023 NVIDIA Corporation
3 
4     This program is free software; you can redistribute it and/or
5     modify it under the terms of the GNU General Public License
6     as published by the Free Software Foundation; either version 2
7     of the License, or (at your option) any later version.
8 
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13 
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17     02110-1301, USA.
18 *******************************************************************************/
19 
20 #include "uvm_common.h"
21 #include "uvm_linux.h"
22 #include "uvm_forward_decl.h"
23 
// Thresholds consulted by uvm_spin_loop():
//  - SCHEDULE_TIMEOUT_NS: spin duration, in nanoseconds, after which the loop
//    starts yielding the CPU.
//  - PRINT_TIMEOUT_SEC: interval, in seconds, between timeout notifications.
//
// TODO: Bug 1710855: Tweak this number through benchmarks
#define UVM_SPIN_LOOP_SCHEDULE_TIMEOUT_NS   (10ULL * 1000ULL)
#define UVM_SPIN_LOOP_PRINT_TIMEOUT_SEC     (30ULL)
27 
28 // Default to debug prints being enabled for debug and develop builds and
29 // disabled for release builds.
30 static int uvm_debug_prints = UVM_IS_DEBUG() || UVM_IS_DEVELOP();
31 
32 // Make the module param writable so that prints can be enabled or disabled at
33 // any time by modifying the module parameter.
34 module_param(uvm_debug_prints, int, S_IRUGO|S_IWUSR);
35 MODULE_PARM_DESC(uvm_debug_prints, "Enable uvm debug prints.");
36 
uvm_debug_prints_enabled(void)37 bool uvm_debug_prints_enabled(void)
38 {
39     return uvm_debug_prints != 0;
40 }
41 
42 // This parameter allows a program in user mode to call the kernel tests
43 // defined in this module. This parameter should only be used for testing and
44 // must not be set to true otherwise since it breaks security when it is
45 // enabled. By default and for safety reasons this parameter is set to false.
46 int uvm_enable_builtin_tests __read_mostly = 0;
47 module_param(uvm_enable_builtin_tests, int, S_IRUGO);
48 MODULE_PARM_DESC(uvm_enable_builtin_tests,
49                  "Enable the UVM built-in tests. (This is a security risk)");
50 
51 // Default to release asserts being enabled.
52 int uvm_release_asserts __read_mostly = 1;
53 
54 // Make the module param writable so that release asserts can be enabled or
55 // disabled at any time by modifying the module parameter.
56 module_param(uvm_release_asserts, int, S_IRUGO|S_IWUSR);
57 MODULE_PARM_DESC(uvm_release_asserts, "Enable uvm asserts included in release builds.");
58 
59 // Default to failed release asserts not dumping stack.
60 int uvm_release_asserts_dump_stack __read_mostly = 0;
61 
62 // Make the module param writable so that dumping the stack can be enabled and
63 // disabled at any time by modifying the module parameter.
64 module_param(uvm_release_asserts_dump_stack, int, S_IRUGO|S_IWUSR);
65 MODULE_PARM_DESC(uvm_release_asserts_dump_stack, "dump_stack() on failed UVM release asserts.");
66 
67 // Default to failed release asserts not setting the global UVM error.
68 int uvm_release_asserts_set_global_error __read_mostly = 0;
69 
70 // Make the module param writable so that setting the global fatal error can be
71 // enabled and disabled at any time by modifying the module parameter.
72 module_param(uvm_release_asserts_set_global_error, int, S_IRUGO|S_IWUSR);
73 MODULE_PARM_DESC(uvm_release_asserts_set_global_error, "Set UVM global fatal error on failed release asserts.");
74 
75 // A separate flag to enable setting global error, to be used by tests only.
76 bool uvm_release_asserts_set_global_error_for_tests __read_mostly = false;
77 
78 //
79 // Convert kernel errno codes to corresponding NV_STATUS
80 //
errno_to_nv_status(int errnoCode)81 NV_STATUS errno_to_nv_status(int errnoCode)
82 {
83     if (errnoCode < 0)
84         errnoCode = -errnoCode;
85 
86     switch (errnoCode)
87     {
88         case 0:
89             return NV_OK;
90 
91         case E2BIG:
92         case EINVAL:
93             return NV_ERR_INVALID_ARGUMENT;
94 
95         case EACCES:
96             return NV_ERR_INVALID_ACCESS_TYPE;
97 
98         case EADDRINUSE:
99         case EADDRNOTAVAIL:
100             return NV_ERR_UVM_ADDRESS_IN_USE;
101 
102         case EFAULT:
103             return NV_ERR_INVALID_ADDRESS;
104 
105         case EOVERFLOW:
106             return NV_ERR_OUT_OF_RANGE;
107 
108         case EINTR:
109         case EBUSY:
110         case EAGAIN:
111             return NV_ERR_BUSY_RETRY;
112 
113         case ENXIO:
114         case ENODEV:
115             return NV_ERR_MODULE_LOAD_FAILED;
116 
117         case ENOMEM:
118             return NV_ERR_NO_MEMORY;
119 
120         case EPERM:
121             return NV_ERR_INSUFFICIENT_PERMISSIONS;
122 
123         case ESRCH:
124             return NV_ERR_PID_NOT_FOUND;
125 
126         case ETIMEDOUT:
127             return NV_ERR_TIMEOUT;
128 
129         case EEXIST:
130             return NV_ERR_IN_USE;
131 
132         case ENOSYS:
133         case EOPNOTSUPP:
134             return NV_ERR_NOT_SUPPORTED;
135 
136         case ENOENT:
137             return NV_ERR_NO_VALID_PATH;
138 
139         case EIO:
140             return NV_ERR_RC_ERROR;
141 
142         case ENODATA:
143             return NV_ERR_OBJECT_NOT_FOUND;
144 
145         default:
146             return NV_ERR_GENERIC;
147     };
148 }
149 
150 // Returns POSITIVE errno
nv_status_to_errno(NV_STATUS status)151 int nv_status_to_errno(NV_STATUS status)
152 {
153     switch (status) {
154         case NV_OK:
155             return 0;
156 
157         case NV_ERR_BUSY_RETRY:
158             return EAGAIN;
159 
160         case NV_ERR_INSUFFICIENT_PERMISSIONS:
161             return EPERM;
162 
163         case NV_ERR_GPU_UUID_NOT_FOUND:
164             return ENODEV;
165 
166         case NV_ERR_INSUFFICIENT_RESOURCES:
167         case NV_ERR_NO_MEMORY:
168             return ENOMEM;
169 
170         case NV_ERR_INVALID_ACCESS_TYPE:
171             return EACCES;
172 
173         case NV_ERR_INVALID_ADDRESS:
174             return EFAULT;
175 
176         case NV_ERR_INVALID_ARGUMENT:
177         case NV_ERR_INVALID_DEVICE:
178         case NV_ERR_INVALID_PARAMETER:
179         case NV_ERR_INVALID_REQUEST:
180         case NV_ERR_INVALID_STATE:
181             return EINVAL;
182 
183         case NV_ERR_NOT_SUPPORTED:
184             return ENOSYS;
185 
186         case NV_ERR_OBJECT_NOT_FOUND:
187             return ENODATA;
188 
189         case NV_ERR_MODULE_LOAD_FAILED:
190             return ENXIO;
191 
192         case NV_ERR_OVERLAPPING_UVM_COMMIT:
193         case NV_ERR_UVM_ADDRESS_IN_USE:
194             return EADDRINUSE;
195 
196         case NV_ERR_PID_NOT_FOUND:
197             return ESRCH;
198 
199         case NV_ERR_TIMEOUT:
200         case NV_ERR_TIMEOUT_RETRY:
201             return ETIMEDOUT;
202 
203         case NV_ERR_IN_USE:
204             return EEXIST;
205 
206         case NV_ERR_NO_VALID_PATH:
207             return ENOENT;
208 
209         case NV_ERR_RC_ERROR:
210         case NV_ERR_ECC_ERROR:
211             return EIO;
212 
213         case NV_ERR_OUT_OF_RANGE:
214             return EOVERFLOW;
215 
216         default:
217             UVM_ASSERT_MSG(0, "No errno conversion set up for NV_STATUS %s\n", nvstatusToString(status));
218             return EINVAL;
219     }
220 }
221 
222 //
223 // This routine retrieves the process ID of current, but makes no attempt to
224 // refcount or lock the pid in place.
225 //
uvm_get_stale_process_id(void)226 unsigned uvm_get_stale_process_id(void)
227 {
228     return (unsigned)task_tgid_vnr(current);
229 }
230 
uvm_get_stale_thread_id(void)231 unsigned uvm_get_stale_thread_id(void)
232 {
233     return (unsigned)task_pid_vnr(current);
234 }
235 
void on_uvm_test_fail(void)
{
    // Intentionally a no-op.
    // NOTE(review): presumably called from test-failure paths elsewhere so a
    // debugger has a stable place for a breakpoint -- confirm against callers.
    (void)NULL;
}
240 
void on_uvm_assert(void)
{
    // No-op in regular builds.
    // NOTE(review): presumably invoked by the UVM assert macros (not visible
    // here) as a convenient breakpoint location -- confirm against callers.
    (void)NULL;
#ifdef __COVERITY__
    // Coverity builds only: tell the analyzer that execution does not continue
    // past a failed assert. The statement needs its terminating semicolon,
    // otherwise this function fails to parse whenever __COVERITY__ is defined.
    __coverity_panic__();
#endif
}
248 
uvm_spin_loop(uvm_spin_loop_t * spin)249 NV_STATUS uvm_spin_loop(uvm_spin_loop_t *spin)
250 {
251     NvU64 curr = NV_GETTIME();
252 
253     // This schedule() is required for functionality, not just system
254     // performance. It allows RM to run and unblock the UVM driver:
255     //
256     // - UVM must service faults in order for RM to idle/preempt a context
257     // - RM must service interrupts which stall UVM (SW methods, stalling CE
258     //   interrupts, etc) in order for UVM to service faults
259     //
260     // Even though UVM's bottom half is preemptable, we have encountered cases
261     // in which a user thread running in RM won't preempt the UVM driver's
262     // thread unless the UVM driver thread gives up its timeslice. This is also
263     // theoretically possible if the RM thread has a low nice priority.
264     //
265     // TODO: Bug 1710855: Look into proper prioritization of these threads as a longer-term
266     //       solution.
267     if (curr - spin->start_time_ns >= UVM_SPIN_LOOP_SCHEDULE_TIMEOUT_NS && NV_MAY_SLEEP()) {
268         schedule();
269         curr = NV_GETTIME();
270     }
271 
272     cpu_relax();
273 
274     // TODO: Bug 1710855: Also check fatal_signal_pending() here if the caller can handle it.
275 
276     if (curr - spin->print_time_ns >= 1000*1000*1000*UVM_SPIN_LOOP_PRINT_TIMEOUT_SEC) {
277         spin->print_time_ns = curr;
278         return NV_ERR_TIMEOUT_RETRY;
279     }
280 
281     return NV_OK;
282 }
283 
284 // This formats a GPU UUID, in a UVM-friendly way. That is, nearly the same as
285 // what nvidia-smi reports.  It will always prefix the UUID with UVM-GPU so
286 // that we know that we have a real, binary formatted UUID that will work in
287 // the UVM APIs.
288 //
289 // It comes out like this:
290 //
291 //     UVM-GPU-d802726c-df8d-a3c3-ec53-48bdec201c27
292 //
293 //  This routine will always null-terminate the string for you. This is true
294 //  even if the buffer was too small!
295 //
296 //  Return value is the number of non-null characters written.
297 //
298 // Note that if you were to let the NV2080_CTRL_CMD_GPU_GET_GID_INFO command
// return its default format, which is ascii, not binary, then you would get
300 // this back:
301 //
302 //     GPU-d802726c-df8d-a3c3-ec53-48bdec201c27
303 //
304 //  ...which is actually a character string, and won't work for UVM API calls.
305 //  So it's very important to be able to see the difference.
306 //
static char uvm_digit_to_hex(unsigned value)
{
    // Values below 10 are offset from '0'; everything else is offset from 'a',
    // so 10..15 produce 'a'..'f'. The input is not range-checked.
    const unsigned base = (value < 10) ? '0' : 'a' - 10;

    return base + value;
}
314 
format_uuid_to_buffer(char * buffer,unsigned bufferLength,const NvProcessorUuid * pUuidStruct)315 int format_uuid_to_buffer(char *buffer, unsigned bufferLength, const NvProcessorUuid *pUuidStruct)
316 {
317     char *str = buffer+8;
318     unsigned i;
319     unsigned dashMask = 1 << 4 | 1 << 6 | 1 << 8 | 1 << 10;
320 
321     if (bufferLength < (8 /*prefix*/+ 16 * 2 /*digits*/ + 4 * 1 /*dashes*/ + 1 /*null*/))
322         return *buffer = 0;
323 
324     memcpy(buffer, "UVM-GPU-", 8);
325 
326     for (i = 0; i < 16; i++) {
327         *str++ = uvm_digit_to_hex(pUuidStruct->uuid[i] >> 4);
328         *str++ = uvm_digit_to_hex(pUuidStruct->uuid[i] & 0xF);
329 
330         if (dashMask & (1 << (i+1)))
331             *str++ = '-';
332     }
333 
334     *str = 0;
335 
336     return (int)(str-buffer);
337 }
338 
339