1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #define  __NO_VERSION__
25 
26 #include "os-interface.h"
27 #include "nv-linux.h"
28 #include "nv-caps-imex.h"
29 
30 #include "nv-time.h"
31 
32 #include <linux/mmzone.h>
33 #include <linux/numa.h>
34 #include <linux/cpuset.h>
35 
36 #include <linux/pid.h>
37 #if defined(CONFIG_LOCKDEP)
38 #include <linux/lockdep.h>
39 #endif // CONFIG_LOCKDEP
40 
41 extern char *NVreg_TemporaryFilePath;
42 
43 #define MAX_ERROR_STRING 528
44 static char nv_error_string[MAX_ERROR_STRING];
45 static NV_DEFINE_SPINLOCK(nv_error_string_lock);
46 
47 extern nv_linux_state_t nv_ctl_device;
48 
49 extern nv_kthread_q_t nv_kthread_q;
50 
51 NvU32 os_page_size  = PAGE_SIZE;
52 NvU64 os_page_mask  = NV_PAGE_MASK;
53 NvU8  os_page_shift = PAGE_SHIFT;
54 NvBool os_cc_enabled = 0;
55 NvBool os_cc_tdx_enabled = 0;
56 
57 #if defined(CONFIG_DMA_SHARED_BUFFER)
58 NvBool os_dma_buf_enabled = NV_TRUE;
59 #else
60 NvBool os_dma_buf_enabled = NV_FALSE;
61 #endif // CONFIG_DMA_SHARED_BUFFER
62 
63 NvBool os_imex_channel_is_supported = NV_TRUE;
64 
os_disable_console_access(void)65 void NV_API_CALL os_disable_console_access(void)
66 {
67     console_lock();
68 }
69 
os_enable_console_access(void)70 void NV_API_CALL os_enable_console_access(void)
71 {
72     console_unlock();
73 }
74 
75 typedef struct semaphore os_mutex_t;
76 
77 //
78 // os_alloc_mutex - Allocate the RM mutex
79 //
80 //  ppMutex - filled in with pointer to opaque structure to mutex data type
81 //
os_alloc_mutex(void ** ppMutex)82 NV_STATUS NV_API_CALL os_alloc_mutex
83 (
84     void **ppMutex
85 )
86 {
87     NV_STATUS rmStatus;
88     os_mutex_t *os_mutex;
89 
90     rmStatus = os_alloc_mem(ppMutex, sizeof(os_mutex_t));
91     if (rmStatus != NV_OK)
92     {
93         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate mutex!\n");
94         return rmStatus;
95     }
96     os_mutex = (os_mutex_t *)*ppMutex;
97     NV_INIT_MUTEX(os_mutex);
98 
99     return NV_OK;
100 }
101 
102 //
103 // os_free_mutex - Free resources associated with mutex allocated
104 //                via os_alloc_mutex above.
105 //
106 //  pMutex - Pointer to opaque structure to mutex data type
107 //
os_free_mutex(void * pMutex)108 void NV_API_CALL os_free_mutex
109 (
110     void  *pMutex
111 )
112 {
113     os_mutex_t *os_mutex = (os_mutex_t *)pMutex;
114 
115     if (os_mutex != NULL)
116     {
117         os_free_mem(pMutex);
118     }
119 }
120 
121 //
122 //  pMutex - Pointer to opaque structure to mutex data type
123 //
124 
os_acquire_mutex(void * pMutex)125 NV_STATUS NV_API_CALL os_acquire_mutex
126 (
127     void  *pMutex
128 )
129 {
130     os_mutex_t *os_mutex = (os_mutex_t *)pMutex;
131 
132     if (!NV_MAY_SLEEP())
133     {
134         return NV_ERR_INVALID_REQUEST;
135     }
136     down(os_mutex);
137 
138     return NV_OK;
139 }
140 
os_cond_acquire_mutex(void * pMutex)141 NV_STATUS NV_API_CALL os_cond_acquire_mutex
142 (
143     void * pMutex
144 )
145 {
146     os_mutex_t *os_mutex = (os_mutex_t *)pMutex;
147     if (!NV_MAY_SLEEP())
148     {
149         return NV_ERR_INVALID_REQUEST;
150     }
151 
152     if (down_trylock(os_mutex))
153     {
154         return NV_ERR_TIMEOUT_RETRY;
155     }
156 
157     return NV_OK;
158 }
159 
160 
os_release_mutex(void * pMutex)161 void NV_API_CALL os_release_mutex
162 (
163     void *pMutex
164 )
165 {
166     os_mutex_t *os_mutex = (os_mutex_t *)pMutex;
167     up(os_mutex);
168 }
169 
170 typedef struct semaphore os_semaphore_t;
171 
172 
os_alloc_semaphore(NvU32 initialValue)173 void* NV_API_CALL os_alloc_semaphore
174 (
175     NvU32 initialValue
176 )
177 {
178     NV_STATUS rmStatus;
179     os_semaphore_t *os_sema;
180 
181     rmStatus = os_alloc_mem((void *)&os_sema, sizeof(os_semaphore_t));
182     if (rmStatus != NV_OK)
183     {
184         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate semaphore!\n");
185         return NULL;
186     }
187 
188     sema_init(os_sema, initialValue);
189 
190     return (void *)os_sema;
191 }
192 
os_free_semaphore(void * pSema)193 void NV_API_CALL os_free_semaphore
194 (
195     void *pSema
196 )
197 {
198     os_semaphore_t *os_sema = (os_semaphore_t *)pSema;
199 
200     os_free_mem(os_sema);
201 }
202 
os_acquire_semaphore(void * pSema)203 NV_STATUS NV_API_CALL os_acquire_semaphore
204 (
205     void *pSema
206 )
207 {
208     os_semaphore_t *os_sema = (os_semaphore_t *)pSema;
209 
210     if (!NV_MAY_SLEEP())
211     {
212         return NV_ERR_INVALID_REQUEST;
213     }
214     down(os_sema);
215     return NV_OK;
216 }
217 
os_cond_acquire_semaphore(void * pSema)218 NV_STATUS NV_API_CALL os_cond_acquire_semaphore
219 (
220     void * pSema
221 )
222 {
223     os_semaphore_t *os_sema = (os_semaphore_t *)pSema;
224     //
225     // NOTE: down_trylock() is safe to call from IRQ, se we don't need an
226     // NV_MAY_SLEEP() check here. We do check it in os_cond_acquire_mutex(),
227     // even though it is also calling down_trylock(), since that keeps it
228     // in line with the kernel's 'struct mutex' API.
229     //
230     if (down_trylock(os_sema))
231     {
232         return NV_ERR_TIMEOUT_RETRY;
233     }
234 
235     return NV_OK;
236 }
237 
os_release_semaphore(void * pSema)238 NV_STATUS NV_API_CALL os_release_semaphore
239 (
240     void *pSema
241 )
242 {
243     os_semaphore_t *os_sema = (os_semaphore_t *)pSema;
244     up(os_sema);
245     return NV_OK;
246 }
247 
248 typedef struct
249 {
250     struct rw_semaphore sem;
251 
252 #if defined(CONFIG_LOCKDEP)
253     /**
254      * A key of lock class. It would be registered to Lockdep validator so all
255      * instances' usages and dependencies will contribute to constructing correct
256      * locking rules and this lock will be tracked by the Lockdep validator.
257      *
258      */
259     struct lock_class_key key;
260 #endif // CONFIG_LOCKDEP
261 } os_rwlock_t;
262 
os_alloc_rwlock(void)263 void* NV_API_CALL os_alloc_rwlock(void)
264 {
265     os_rwlock_t *os_rwlock = NULL;
266 
267     NV_STATUS rmStatus = os_alloc_mem((void *)&os_rwlock, sizeof(os_rwlock_t));
268     if (rmStatus != NV_OK)
269     {
270         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate a struct os_rwlock_t!\n");
271         return NULL;
272     }
273 
274     init_rwsem(&os_rwlock->sem);
275 
276 #if defined(CONFIG_LOCKDEP)
277     // Register the dynamically allocated key to Lockdep.
278     lockdep_register_key(&os_rwlock->key);
279     lockdep_set_class(&os_rwlock->sem, &os_rwlock->key);
280 #endif // CONFIG_LOCKDEP
281 
282     return os_rwlock;
283 }
284 
os_free_rwlock(void * pRwLock)285 void NV_API_CALL os_free_rwlock(void *pRwLock)
286 {
287     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
288 
289 #if defined(CONFIG_LOCKDEP)
290     // Unregister the dynamically allocated key.
291     lockdep_unregister_key(&os_rwlock->key);
292 #endif // CONFIG_LOCKDEP
293 
294     os_free_mem(os_rwlock);
295 }
296 
os_acquire_rwlock_read(void * pRwLock)297 NV_STATUS NV_API_CALL os_acquire_rwlock_read(void *pRwLock)
298 {
299     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
300 
301     if (!NV_MAY_SLEEP())
302     {
303         return NV_ERR_INVALID_REQUEST;
304     }
305     down_read(&os_rwlock->sem);
306     return NV_OK;
307 }
308 
os_acquire_rwlock_write(void * pRwLock)309 NV_STATUS NV_API_CALL os_acquire_rwlock_write(void *pRwLock)
310 {
311     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
312 
313     if (!NV_MAY_SLEEP())
314     {
315         return NV_ERR_INVALID_REQUEST;
316     }
317     down_write(&os_rwlock->sem);
318     return NV_OK;
319 }
320 
os_cond_acquire_rwlock_read(void * pRwLock)321 NV_STATUS NV_API_CALL os_cond_acquire_rwlock_read(void *pRwLock)
322 {
323     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
324 
325     if (down_read_trylock(&os_rwlock->sem))
326     {
327         return NV_ERR_TIMEOUT_RETRY;
328     }
329 
330     return NV_OK;
331 }
332 
os_cond_acquire_rwlock_write(void * pRwLock)333 NV_STATUS NV_API_CALL os_cond_acquire_rwlock_write(void *pRwLock)
334 {
335     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
336 
337     if (down_write_trylock(&os_rwlock->sem))
338     {
339         return NV_ERR_TIMEOUT_RETRY;
340     }
341 
342     return NV_OK;
343 }
344 
os_release_rwlock_read(void * pRwLock)345 void NV_API_CALL os_release_rwlock_read(void *pRwLock)
346 {
347     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
348     up_read(&os_rwlock->sem);
349 }
350 
os_release_rwlock_write(void * pRwLock)351 void NV_API_CALL os_release_rwlock_write(void *pRwLock)
352 {
353     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
354     up_write(&os_rwlock->sem);
355 }
356 
os_semaphore_may_sleep(void)357 NvBool NV_API_CALL os_semaphore_may_sleep(void)
358 {
359     return NV_MAY_SLEEP();
360 }
361 
os_is_isr(void)362 NvBool NV_API_CALL os_is_isr(void)
363 {
364     return (in_irq());
365 }
366 
367 // return TRUE if the caller is the super-user
os_is_administrator(void)368 NvBool NV_API_CALL os_is_administrator(void)
369 {
370     return NV_IS_SUSER();
371 }
372 
os_allow_priority_override(void)373 NvBool NV_API_CALL os_allow_priority_override(void)
374 {
375     return capable(CAP_SYS_NICE);
376 }
377 
os_string_copy(char * dst,const char * src)378 char* NV_API_CALL os_string_copy(
379     char *dst,
380     const char *src
381 )
382 {
383     return strcpy(dst, src);
384 }
385 
os_string_length(const char * str)386 NvU32 NV_API_CALL os_string_length(
387     const char* str
388 )
389 {
390     return strlen(str);
391 }
392 
os_strtoul(const char * str,char ** endp,NvU32 base)393 NvU32 NV_API_CALL os_strtoul(const char *str, char **endp, NvU32 base)
394 {
395     return (NvU32)simple_strtoul(str, endp, base);
396 }
397 
os_string_compare(const char * str1,const char * str2)398 NvS32 NV_API_CALL os_string_compare(const char *str1, const char *str2)
399 {
400     return strcmp(str1, str2);
401 }
402 
os_mem_copy_custom(void * dstPtr,const void * srcPtr,NvU32 length)403 void *os_mem_copy_custom(
404     void       *dstPtr,
405     const void *srcPtr,
406     NvU32       length
407 )
408 {
409     void *ret = dstPtr;
410     NvU32 dwords, bytes = length;
411     NvU8 *dst = dstPtr;
412     const NvU8 *src = srcPtr;
413 
414     if ((length >= 128) &&
415         (((NvUPtr)dst & 3) == 0) & (((NvUPtr)src & 3) == 0))
416     {
417         dwords = (length / sizeof(NvU32));
418         bytes = (length % sizeof(NvU32));
419 
420         while (dwords != 0)
421         {
422             *(NvU32 *)dst = *(const NvU32 *)src;
423             dst += sizeof(NvU32);
424             src += sizeof(NvU32);
425             dwords--;
426         }
427     }
428 
429     while (bytes != 0)
430     {
431         *dst = *src;
432         dst++;
433         src++;
434         bytes--;
435     }
436 
437     return ret;
438 }
439 
os_mem_copy(void * dst,const void * src,NvU32 length)440 void *NV_API_CALL os_mem_copy(
441     void       *dst,
442     const void *src,
443     NvU32       length
444 )
445 {
446 #if defined(NVCPU_AARCH64)
447     /*
448      * TODO: Remove once memset/memcpy restructure is complete
449      *
450      * When performing memcpy for memory mapped as device, memcpy_[to/from]io
451      * must be used. WAR to check the source and destination to determine the
452      * correct memcpy_io to use.
453      *
454      * This WAR is limited to just aarch64 for now because the address range used
455      * to map ioremap and vmalloc is different on ppc64le, and is_vmalloc_addr()
456      * does not correctly handle this. is_ioremap_addr() is needed instead. This
457      * will have to be addressed when reorganizing RM to use the new memset model.
458      */
459     if (is_vmalloc_addr(dst) && !is_vmalloc_addr(src))
460     {
461         memcpy_toio(dst, src, length);
462         return dst;
463     }
464     else if (!is_vmalloc_addr(dst) && is_vmalloc_addr(src))
465     {
466         memcpy_fromio(dst, src, length);
467         return dst;
468     }
469     else if (is_vmalloc_addr(dst) && is_vmalloc_addr(src))
470     {
471         return os_mem_copy_custom(dst, src, length);
472     }
473     else
474 #endif
475     {
476 #if defined(CONFIG_CC_OPTIMIZE_FOR_SIZE)
477         /*
478          * When the kernel is configured with CC_OPTIMIZE_FOR_SIZE=y, Kbuild uses
479          * -Os universally. With -Os, GCC will aggressively inline builtins, even
480          * if -fno-builtin is specified, including memcpy with a tiny byte-copy
481          * loop on x86 (rep movsb). This is horrible for performance - a strict
482          * dword copy is much faster - so when we detect this case, just provide
483          * our own implementation.
484          */
485         return os_mem_copy_custom(dst, src, length);
486 #else
487         /*
488          * Generally speaking, the kernel-provided memcpy will be the fastest,
489          * (optimized much better for the target architecture than the above
490          * loop), so we want to use that whenever we can get to it.
491          */
492         return memcpy(dst, src, length);
493 #endif
494     }
495 }
496 
os_memcpy_from_user(void * to,const void * from,NvU32 n)497 NV_STATUS NV_API_CALL os_memcpy_from_user(
498     void       *to,
499     const void *from,
500     NvU32       n
501 )
502 {
503     return (NV_COPY_FROM_USER(to, from, n) ? NV_ERR_INVALID_ADDRESS : NV_OK);
504 }
505 
os_memcpy_to_user(void * to,const void * from,NvU32 n)506 NV_STATUS NV_API_CALL os_memcpy_to_user(
507     void       *to,
508     const void *from,
509     NvU32       n
510 )
511 {
512     return (NV_COPY_TO_USER(to, from, n) ? NV_ERR_INVALID_ADDRESS : NV_OK);
513 }
514 
os_mem_set(void * dst,NvU8 c,NvU32 length)515 void* NV_API_CALL os_mem_set(
516     void  *dst,
517     NvU8   c,
518     NvU32  length
519 )
520 {
521 #if defined(NVCPU_AARCH64)
522     /*
523      * TODO: Remove once memset/memcpy restructure is complete
524      *
525      * WAR to check the destination to determine if the memory is of type Device
526      * or Normal, and use the correct memset.
527      *
528      * This WAR is limited to just aarch64 for now because the address range used
529      * to map ioremap and vmalloc is different on ppc64le, and is_vmalloc_addr()
530      * does not correctly handle this. is_ioremap_addr() is needed instead. This
531      * will have to be addressed when reorganizing RM to use the new memset model.
532      */
533     if (is_vmalloc_addr(dst))
534     {
535         memset_io(dst, (int)c, length);
536         return dst;
537     }
538     else
539 #endif
540        return memset(dst, (int)c, length);
541 }
542 
os_mem_cmp(const NvU8 * buf0,const NvU8 * buf1,NvU32 length)543 NvS32 NV_API_CALL os_mem_cmp(
544     const NvU8 *buf0,
545     const NvU8* buf1,
546     NvU32 length
547 )
548 {
549     return memcmp(buf0, buf1, length);
550 }
551 
552 
553 /*
554  * Operating System Memory Functions
555  *
556  * There are 2 interesting aspects of resource manager memory allocations
557  * that need special consideration on Linux:
558  *
559  * 1. They are typically very large, (e.g. single allocations of 164KB)
560  *
561  * 2. The resource manager assumes that it can safely allocate memory in
562  *    interrupt handlers.
563  *
564  * The first requires that we call vmalloc, the second kmalloc. We decide
565  * which one to use at run time, based on the size of the request and the
566  * context. Allocations larger than 128KB require vmalloc, in the context
567  * of an ISR they fail.
568  */
569 
570 #if defined(NV_VGX_HYPER)
571 /*
572  * Citrix Hypervisor-8.0 Dom0 sysmem ends up getting fragmented because
573  * of which high-order kmalloc allocations fail. We try to avoid it by
574  * requesting allocations not larger than 8K.
575  *
576  * KVM will be affected low memory pressure situation a lot,
577  * particularly if hugetlbfs hugepages are being used. Hence, 8K applies
578  * here too.
579  */
580 #define KMALLOC_LIMIT 8192
581 #else
582 #define KMALLOC_LIMIT 131072
583 #endif
584 
585 #define VMALLOC_ALLOCATION_SIZE_FLAG (1 << 0)
586 
os_alloc_mem(void ** address,NvU64 size)587 NV_STATUS NV_API_CALL os_alloc_mem(
588     void **address,
589     NvU64 size
590 )
591 {
592     NvU64 original_size = size;
593     unsigned long alloc_size;
594 
595     if (address == NULL)
596         return NV_ERR_INVALID_ARGUMENT;
597 
598     *address = NULL;
599     NV_MEM_TRACKING_PAD_SIZE(size);
600 
601     // check for integer overflow on size
602     if (size < original_size)
603         return NV_ERR_INVALID_ARGUMENT;
604 
605     //
606     // NV_KMALLOC, nv_vmalloc take an input of 4 bytes in x86. To avoid
607     // truncation and wrong allocation, below check is required.
608     //
609     alloc_size = size;
610 
611     if (alloc_size != size)
612         return NV_ERR_INVALID_PARAMETER;
613 
614     if (!NV_MAY_SLEEP())
615     {
616         if (alloc_size <= KMALLOC_LIMIT)
617             NV_KMALLOC_ATOMIC(*address, alloc_size);
618     }
619     else
620     {
621         if (alloc_size <= KMALLOC_LIMIT)
622         {
623             NV_KMALLOC_NO_OOM(*address, alloc_size);
624         }
625         if (*address == NULL)
626         {
627             *address = nv_vmalloc(alloc_size);
628             alloc_size |= VMALLOC_ALLOCATION_SIZE_FLAG;
629         }
630     }
631 
632     NV_MEM_TRACKING_HIDE_SIZE(address, alloc_size);
633 
634     return ((*address != NULL) ? NV_OK : NV_ERR_NO_MEMORY);
635 }
636 
os_free_mem(void * address)637 void NV_API_CALL os_free_mem(void *address)
638 {
639     NvU64 size;
640 
641     NV_MEM_TRACKING_RETRIEVE_SIZE(address, size);
642 
643     if (size & VMALLOC_ALLOCATION_SIZE_FLAG)
644     {
645         size &= ~VMALLOC_ALLOCATION_SIZE_FLAG;
646         nv_vfree(address, size);
647     }
648     else
649         NV_KFREE(address, size);
650 }
651 
652 
653 /*****************************************************************************
654 *
655 *   Name: osGetCurrentTime
656 *
657 *****************************************************************************/
658 
os_get_current_time(NvU32 * seconds,NvU32 * useconds)659 NV_STATUS NV_API_CALL os_get_current_time(
660     NvU32 *seconds,
661     NvU32 *useconds
662 )
663 {
664     struct timespec64 tm;
665 
666     ktime_get_real_ts64(&tm);
667 
668     *seconds = tm.tv_sec;
669     *useconds = tm.tv_nsec / NSEC_PER_USEC;
670 
671     return NV_OK;
672 }
673 
674 //
675 // Get the High resolution tick count of the system uptime
676 //
os_get_current_tick_hr(void)677 NvU64 NV_API_CALL os_get_current_tick_hr(void)
678 {
679     struct timespec64 tm;
680     ktime_get_raw_ts64(&tm);
681     return (NvU64) timespec64_to_ns(&tm);
682 }
683 
684 #if BITS_PER_LONG >= 64
685 
os_get_current_tick(void)686 NvU64 NV_API_CALL os_get_current_tick(void)
687 {
688 #if defined(NV_JIFFIES_TO_TIMESPEC_PRESENT)
689     struct timespec ts;
690     jiffies_to_timespec(jiffies, &ts);
691     return (NvU64) timespec_to_ns(&ts);
692 #else
693     struct timespec64 ts;
694     jiffies_to_timespec64(jiffies, &ts);
695     return (NvU64) timespec64_to_ns(&ts);
696 #endif
697 }
698 
os_get_tick_resolution(void)699 NvU64 NV_API_CALL os_get_tick_resolution(void)
700 {
701     return (NvU64)jiffies_to_usecs(1) * NSEC_PER_USEC;
702 }
703 
704 #else
705 
os_get_current_tick(void)706 NvU64 NV_API_CALL os_get_current_tick(void)
707 {
708     /*
709      * 'jiffies' overflows regularly on 32-bit builds (unsigned long is 4 bytes
710      * instead of 8 bytes), so it's unwise to build a tick counter on it, since
711      * the rest of the Resman assumes the 'tick' returned from this function is
712      * monotonically increasing and never overflows.
713      *
714      * Instead, use the previous implementation that we've lived with since the
715      * beginning, which uses system clock time to calculate the tick. This is
716      * subject to problems if the system clock time changes dramatically
717      * (more than a second or so) while the Resman is actively tracking a
718      * timeout.
719      */
720     NvU32 seconds, useconds;
721 
722     (void) os_get_current_time(&seconds, &useconds);
723 
724     return ((NvU64)seconds * NSEC_PER_SEC +
725                  (NvU64)useconds * NSEC_PER_USEC);
726 }
727 
os_get_tick_resolution(void)728 NvU64 NV_API_CALL os_get_tick_resolution(void)
729 {
730     /*
731      * os_get_current_tick() uses os_get_current_time(), which has
732      * microsecond resolution.
733      */
734     return 1000ULL;
735 }
736 
737 #endif
738 
739 //---------------------------------------------------------------------------
740 //
741 //  Misc services.
742 //
743 //---------------------------------------------------------------------------
744 
os_delay_us(NvU32 MicroSeconds)745 NV_STATUS NV_API_CALL os_delay_us(NvU32 MicroSeconds)
746 {
747     return nv_sleep_us(MicroSeconds);
748 }
749 
os_delay(NvU32 MilliSeconds)750 NV_STATUS NV_API_CALL os_delay(NvU32 MilliSeconds)
751 {
752     return nv_sleep_ms(MilliSeconds);
753 }
754 
os_get_cpu_frequency(void)755 NvU64 NV_API_CALL os_get_cpu_frequency(void)
756 {
757     NvU64 cpu_hz = 0;
758 #if defined(CONFIG_CPU_FREQ)
759     cpu_hz = (cpufreq_get(0) * 1000);
760 #elif defined(NVCPU_X86_64)
761     NvU64 tsc[2];
762 
763     tsc[0] = nv_rdtsc();
764     mdelay(250);
765     tsc[1] = nv_rdtsc();
766 
767     cpu_hz = ((tsc[1] - tsc[0]) * 4);
768 #endif
769     return cpu_hz;
770 }
771 
os_get_current_process(void)772 NvU32 NV_API_CALL os_get_current_process(void)
773 {
774     return NV_GET_CURRENT_PROCESS();
775 }
776 
os_get_current_process_name(char * buf,NvU32 len)777 void NV_API_CALL os_get_current_process_name(char *buf, NvU32 len)
778 {
779     task_lock(current);
780     strncpy(buf, current->comm, len - 1);
781     buf[len - 1] = '\0';
782     task_unlock(current);
783 }
784 
os_get_current_thread(NvU64 * threadId)785 NV_STATUS NV_API_CALL os_get_current_thread(NvU64 *threadId)
786 {
787     if (in_interrupt())
788         *threadId = 0;
789     else
790         *threadId = (NvU64) current->pid;
791 
792     return NV_OK;
793 }
794 
795 /*******************************************************************************/
796 /*                                                                             */
797 /* Debug and logging utilities follow                                          */
798 /*                                                                             */
799 /*******************************************************************************/
800 
801 // The current debug display level (default to maximum debug level)
802 NvU32 cur_debuglevel = 0xffffffff;
803 
804 /*
805  * The binary core of RM (nv-kernel.o) calls both out_string, and nv_printf.
806  */
out_string(const char * str)807 inline void NV_API_CALL out_string(const char *str)
808 {
809     printk("%s", str);
810 }
811 
812 /*
813  * nv_printf() prints to the kernel log for the driver.
814  * Returns the number of characters written.
815  */
nv_printf(NvU32 debuglevel,const char * printf_format,...)816 int NV_API_CALL nv_printf(NvU32 debuglevel, const char *printf_format, ...)
817 {
818     va_list arglist;
819     int chars_written = 0;
820 
821     if (debuglevel >= ((cur_debuglevel >> 4) & 0x3))
822     {
823         size_t length;
824         unsigned long flags;
825 
826         // When printk is called to extend the output of the previous line
827         // (i.e. when the previous line did not end in \n), the printk call
828         // must contain KERN_CONT.  Older kernels still print the line
829         // correctly, but KERN_CONT was technically always required.
830 
831         // This means that every call to printk() needs to have a KERN_xxx
832         // prefix.  The only way to get this is to rebuild the format string
833         // into a new buffer, with a KERN_xxx prefix prepended.
834 
835         // Unfortunately, we can't guarantee that two calls to nv_printf()
836         // won't be interrupted by a printk from another driver.  So to be
837         // safe, we always append KERN_CONT.  It's still technically wrong,
838         // but it works.
839 
840         // The long-term fix is to modify all NV_PRINTF-ish calls so that the
841         // string always contains only one \n (at the end) and NV_PRINTF_EX
842         // is deleted.  But that is unlikely to ever happen.
843 
844         length = strlen(printf_format);
845         if (length < 1)
846             return 0;
847 
848         NV_SPIN_LOCK_IRQSAVE(&nv_error_string_lock, flags);
849 
850         // KERN_CONT changed in the 3.6 kernel, so we can't assume its
851         // composition or size.
852         memcpy(nv_error_string, KERN_CONT, sizeof(KERN_CONT) - 1);
853         memcpy(nv_error_string + sizeof(KERN_CONT) - 1, printf_format, length + 1);
854 
855         va_start(arglist, printf_format);
856         chars_written = vprintk(nv_error_string, arglist);
857         va_end(arglist);
858 
859         NV_SPIN_UNLOCK_IRQRESTORE(&nv_error_string_lock, flags);
860     }
861 
862     return chars_written;
863 }
864 
os_snprintf(char * buf,NvU32 size,const char * fmt,...)865 NvS32 NV_API_CALL os_snprintf(char *buf, NvU32 size, const char *fmt, ...)
866 {
867     va_list arglist;
868     int chars_written;
869 
870     va_start(arglist, fmt);
871     chars_written = vsnprintf(buf, size, fmt, arglist);
872     va_end(arglist);
873 
874     return chars_written;
875 }
876 
os_vsnprintf(char * buf,NvU32 size,const char * fmt,va_list arglist)877 NvS32 NV_API_CALL os_vsnprintf(char *buf, NvU32 size, const char *fmt, va_list arglist)
878 {
879     return vsnprintf(buf, size, fmt, arglist);
880 }
881 
os_log_error(const char * fmt,va_list ap)882 void NV_API_CALL os_log_error(const char *fmt, va_list ap)
883 {
884     unsigned long flags;
885 
886     NV_SPIN_LOCK_IRQSAVE(&nv_error_string_lock, flags);
887 
888     vsnprintf(nv_error_string, MAX_ERROR_STRING, fmt, ap);
889     nv_error_string[MAX_ERROR_STRING - 1] = 0;
890     printk(KERN_ERR "%s", nv_error_string);
891 
892     NV_SPIN_UNLOCK_IRQRESTORE(&nv_error_string_lock, flags);
893 }
894 
os_io_write_byte(NvU32 address,NvU8 value)895 void NV_API_CALL os_io_write_byte(
896     NvU32 address,
897     NvU8 value
898 )
899 {
900     outb(value, address);
901 }
902 
os_io_write_word(NvU32 address,NvU16 value)903 void NV_API_CALL os_io_write_word(
904     NvU32 address,
905     NvU16 value
906 )
907 {
908     outw(value, address);
909 }
910 
os_io_write_dword(NvU32 address,NvU32 value)911 void NV_API_CALL os_io_write_dword(
912     NvU32 address,
913     NvU32 value
914 )
915 {
916     outl(value, address);
917 }
918 
os_io_read_byte(NvU32 address)919 NvU8 NV_API_CALL os_io_read_byte(
920     NvU32 address
921 )
922 {
923     return inb(address);
924 }
925 
os_io_read_word(NvU32 address)926 NvU16 NV_API_CALL os_io_read_word(
927     NvU32 address
928 )
929 {
930     return inw(address);
931 }
932 
os_io_read_dword(NvU32 address)933 NvU32 NV_API_CALL os_io_read_dword(
934     NvU32 address
935 )
936 {
937     return inl(address);
938 }
939 
940 
xen_support_fully_virtualized_kernel(void)941 static NvBool NV_API_CALL xen_support_fully_virtualized_kernel(void)
942 {
943 #if defined(NV_XEN_SUPPORT_FULLY_VIRTUALIZED_KERNEL)
944     return (os_is_vgx_hyper());
945 #endif
946     return NV_FALSE;
947 }
948 
os_map_kernel_space(NvU64 start,NvU64 size_bytes,NvU32 mode)949 void* NV_API_CALL os_map_kernel_space(
950     NvU64 start,
951     NvU64 size_bytes,
952     NvU32 mode
953 )
954 {
955     void *vaddr;
956 
957     if (!xen_support_fully_virtualized_kernel() && start == 0)
958     {
959         if (mode != NV_MEMORY_CACHED)
960         {
961             nv_printf(NV_DBG_ERRORS,
962                 "NVRM: os_map_kernel_space: won't map address 0x%0llx UC!\n", start);
963             return NULL;
964         }
965         else
966             return (void *)PAGE_OFFSET;
967     }
968 
969     if (!NV_MAY_SLEEP())
970     {
971         nv_printf(NV_DBG_ERRORS,
972             "NVRM: os_map_kernel_space: can't map 0x%0llx, invalid context!\n", start);
973         os_dbg_breakpoint();
974         return NULL;
975     }
976 
977     switch (mode)
978     {
979         case NV_MEMORY_CACHED:
980             vaddr = nv_ioremap_cache(start, size_bytes);
981             break;
982         case NV_MEMORY_WRITECOMBINED:
983             vaddr = rm_disable_iomap_wc() ?
984                     nv_ioremap_nocache(start, size_bytes) :
985                     nv_ioremap_wc(start, size_bytes);
986             break;
987         case NV_MEMORY_UNCACHED:
988         case NV_MEMORY_DEFAULT:
989             vaddr = nv_ioremap_nocache(start, size_bytes);
990             break;
991         default:
992             nv_printf(NV_DBG_ERRORS,
993                 "NVRM: os_map_kernel_space: unsupported mode!\n");
994             return NULL;
995     }
996 
997     return vaddr;
998 }
999 
os_unmap_kernel_space(void * addr,NvU64 size_bytes)1000 void NV_API_CALL os_unmap_kernel_space(
1001     void *addr,
1002     NvU64 size_bytes
1003 )
1004 {
1005     if (addr == (void *)PAGE_OFFSET)
1006         return;
1007 
1008     nv_iounmap(addr, size_bytes);
1009 }
1010 
1011 #if NVCPU_IS_AARCH64
1012 
nv_flush_cache_cpu(void * info)1013 static inline void nv_flush_cache_cpu(void *info)
1014 {
1015     if (!nvos_is_chipset_io_coherent())
1016     {
1017 #if defined(NV_FLUSH_CACHE_ALL_PRESENT)
1018         flush_cache_all();
1019 #else
1020         WARN_ONCE(0, "kernel does not provide flush_cache_all()\n");
1021 #endif
1022     }
1023 }
1024 
1025 // flush the cache of all cpus
os_flush_cpu_cache_all(void)1026 NV_STATUS NV_API_CALL os_flush_cpu_cache_all(void)
1027 {
1028     on_each_cpu(nv_flush_cache_cpu, NULL, 1);
1029     return NV_OK;
1030 }
1031 
os_flush_user_cache(void)1032 NV_STATUS NV_API_CALL os_flush_user_cache(void)
1033 {
1034     if (!NV_MAY_SLEEP())
1035     {
1036         return NV_ERR_NOT_SUPPORTED;
1037     }
1038 
1039     //
1040     // The Linux kernel does not export an interface for flushing a range,
1041     // although it is possible. For now, just flush the entire cache to be
1042     // safe.
1043     //
1044     on_each_cpu(nv_flush_cache_cpu, NULL, 1);
1045     return NV_OK;
1046 }
1047 
1048 #else // NVCPU_IS_AARCH64
1049 
os_flush_cpu_cache_all(void)1050 NV_STATUS NV_API_CALL os_flush_cpu_cache_all(void)
1051 {
1052     return NV_ERR_NOT_SUPPORTED;
1053 }
1054 
os_flush_user_cache(void)1055 NV_STATUS NV_API_CALL os_flush_user_cache(void)
1056 {
1057     return NV_ERR_NOT_SUPPORTED;
1058 }
1059 
1060 #endif
1061 
os_flush_cpu_write_combine_buffer(void)1062 void NV_API_CALL os_flush_cpu_write_combine_buffer(void)
1063 {
1064 #if defined(NVCPU_X86_64)
1065     asm volatile("sfence" ::: "memory");
1066 #elif defined(NVCPU_PPC64LE)
1067     __asm__ __volatile__ ("sync" : : : "memory");
1068 #elif defined(NVCPU_AARCH64)
1069     asm volatile("dsb st" : : : "memory");
1070 #else
1071     mb();
1072 #endif
1073 }
1074 
1075 // override initial debug level from registry
os_dbg_init(void)1076 void NV_API_CALL os_dbg_init(void)
1077 {
1078     NvU32 new_debuglevel;
1079     nvidia_stack_t *sp = NULL;
1080 
1081     if (nv_kmem_cache_alloc_stack(&sp) != 0)
1082     {
1083         return;
1084     }
1085 
1086     if (NV_OK == rm_read_registry_dword(sp, NULL,
1087                                         "ResmanDebugLevel",
1088                                         &new_debuglevel))
1089     {
1090         if (new_debuglevel != (NvU32)~0)
1091             cur_debuglevel = new_debuglevel;
1092     }
1093 
1094     nv_kmem_cache_free_stack(sp);
1095 }
1096 
os_dbg_set_level(NvU32 new_debuglevel)1097 void NV_API_CALL os_dbg_set_level(NvU32 new_debuglevel)
1098 {
1099     nv_printf(NV_DBG_SETUP, "NVRM: Changing debuglevel from 0x%x to 0x%x\n",
1100         cur_debuglevel, new_debuglevel);
1101     cur_debuglevel = new_debuglevel;
1102 }
1103 
os_get_max_user_va(void)1104 NvU64 NV_API_CALL os_get_max_user_va(void)
1105 {
1106     return TASK_SIZE;
1107 }
1108 
os_schedule(void)1109 NV_STATUS NV_API_CALL os_schedule(void)
1110 {
1111     if (NV_MAY_SLEEP())
1112     {
1113         set_current_state(TASK_INTERRUPTIBLE);
1114         schedule_timeout(1);
1115         return NV_OK;
1116     }
1117     else
1118     {
1119         nv_printf(NV_DBG_ERRORS, "NVRM: os_schedule: Attempted to yield"
1120                                  " the CPU while in atomic or interrupt"
1121                                  " context\n");
1122         return NV_ERR_ILLEGAL_ACTION;
1123     }
1124 }
1125 
1126 typedef struct {
1127     nv_kthread_q_item_t item;
1128     void *data;
1129 } os_queue_data_t;
1130 
os_execute_work_item(void * _oqd)1131 static void os_execute_work_item(void *_oqd)
1132 {
1133     os_queue_data_t *oqd = _oqd;
1134     nvidia_stack_t *sp = NULL;
1135     void *data = oqd->data;
1136 
1137     NV_KFREE(oqd, sizeof(os_queue_data_t));
1138 
1139     if (nv_kmem_cache_alloc_stack(&sp) != 0)
1140     {
1141         return;
1142     }
1143 
1144     rm_execute_work_item(sp, data);
1145 
1146     nv_kmem_cache_free_stack(sp);
1147 }
1148 
os_queue_work_item(struct os_work_queue * queue,void * data)1149 NV_STATUS NV_API_CALL os_queue_work_item(struct os_work_queue *queue, void *data)
1150 {
1151     os_queue_data_t *oqd;
1152     nv_kthread_q_t *kthread;
1153 
1154     /* Use the global queue unless a valid queue was provided */
1155     kthread = queue ? &queue->nvk : &nv_kthread_q;
1156 
1157     /* Make sure the kthread is active */
1158     if (unlikely(!kthread->q_kthread)) {
1159         nv_printf(NV_DBG_ERRORS, "NVRM: queue is not enabled\n");
1160         return NV_ERR_NOT_READY;
1161     }
1162 
1163     /* Allocate atomically just in case we're called in atomic context. */
1164     NV_KMALLOC_ATOMIC(oqd, sizeof(os_queue_data_t));
1165     if (!oqd)
1166         return NV_ERR_NO_MEMORY;
1167 
1168     nv_kthread_q_item_init(&oqd->item, os_execute_work_item, oqd);
1169     oqd->data = data;
1170 
1171     nv_kthread_q_schedule_q_item(kthread, &oqd->item);
1172 
1173     return NV_OK;
1174 }
1175 
os_flush_work_queue(struct os_work_queue * queue)1176 NV_STATUS NV_API_CALL os_flush_work_queue(struct os_work_queue *queue)
1177 {
1178     nv_kthread_q_t *kthread;
1179 
1180     /* Use the global queue unless a valid queue was provided */
1181     kthread = queue ? &queue->nvk : &nv_kthread_q;
1182 
1183     if (NV_MAY_SLEEP())
1184     {
1185         if (kthread->q_kthread)
1186             nv_kthread_q_flush(kthread);
1187 
1188         return NV_OK;
1189     }
1190     else
1191     {
1192         nv_printf(NV_DBG_ERRORS,
1193                   "NVRM: os_flush_work_queue: attempted to execute passive"
1194                   "work from an atomic or interrupt context.\n");
1195         return NV_ERR_ILLEGAL_ACTION;
1196     }
1197 }
1198 
1199 extern NvU32 NVreg_EnableDbgBreakpoint;
1200 
os_dbg_breakpoint(void)1201 void NV_API_CALL os_dbg_breakpoint(void)
1202 {
1203     if (NVreg_EnableDbgBreakpoint == 0)
1204     {
1205         return;
1206     }
1207 
1208 #if defined(CONFIG_X86_REMOTE_DEBUG) || defined(CONFIG_KGDB) || defined(CONFIG_XMON)
1209   #if defined(NVCPU_X86_64)
1210     __asm__ __volatile__ ("int $3");
1211   #elif defined(NVCPU_ARM)
1212     __asm__ __volatile__ (".word %c0" :: "i" (KGDB_COMPILED_BREAK));
1213   #elif defined(NVCPU_AARCH64)
1214     # warning "Need to implement os_dbg_breakpoint() for aarch64"
1215   #elif defined(NVCPU_PPC64LE)
1216     __asm__ __volatile__ ("trap");
1217   #endif // NVCPU_*
1218 #elif defined(CONFIG_KDB)
1219     KDB_ENTER();
1220 #endif // CONFIG_X86_REMOTE_DEBUG || CONFIG_KGDB || CONFIG_XMON
1221 }
1222 
os_get_cpu_number(void)1223 NvU32 NV_API_CALL os_get_cpu_number(void)
1224 {
1225     NvU32 cpu_id = get_cpu();
1226     put_cpu();
1227     return cpu_id;
1228 }
1229 
os_get_cpu_count(void)1230 NvU32 NV_API_CALL os_get_cpu_count(void)
1231 {
1232     return NV_NUM_CPUS();
1233 }
1234 
os_pat_supported(void)1235 NvBool NV_API_CALL os_pat_supported(void)
1236 {
1237     return (nv_pat_mode != NV_PAT_MODE_DISABLED);
1238 }
1239 
os_is_efi_enabled(void)1240 NvBool NV_API_CALL os_is_efi_enabled(void)
1241 {
1242     return efi_enabled(EFI_BOOT);
1243 }
1244 
os_dump_stack(void)1245 void NV_API_CALL os_dump_stack(void)
1246 {
1247     dump_stack();
1248 }
1249 
1250 typedef struct os_spinlock_s
1251 {
1252     nv_spinlock_t      lock;
1253     unsigned long      eflags;
1254 } os_spinlock_t;
1255 
os_alloc_spinlock(void ** ppSpinlock)1256 NV_STATUS NV_API_CALL os_alloc_spinlock(void **ppSpinlock)
1257 {
1258     NV_STATUS rmStatus;
1259     os_spinlock_t *os_spinlock;
1260 
1261     rmStatus = os_alloc_mem(ppSpinlock, sizeof(os_spinlock_t));
1262     if (rmStatus != NV_OK)
1263     {
1264         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate spinlock!\n");
1265         return rmStatus;
1266     }
1267 
1268     os_spinlock = (os_spinlock_t *)*ppSpinlock;
1269     NV_SPIN_LOCK_INIT(&os_spinlock->lock);
1270     os_spinlock->eflags = 0;
1271     return NV_OK;
1272 }
1273 
os_free_spinlock(void * pSpinlock)1274 void NV_API_CALL os_free_spinlock(void *pSpinlock)
1275 {
1276     os_free_mem(pSpinlock);
1277 }
1278 
os_acquire_spinlock(void * pSpinlock)1279 NvU64 NV_API_CALL os_acquire_spinlock(void *pSpinlock)
1280 {
1281     os_spinlock_t *os_spinlock = (os_spinlock_t *)pSpinlock;
1282     unsigned long eflags;
1283 
1284     NV_SPIN_LOCK_IRQSAVE(&os_spinlock->lock, eflags);
1285     os_spinlock->eflags = eflags;
1286 
1287 #if defined(NVCPU_X86_64)
1288     eflags &= X86_EFLAGS_IF;
1289 #elif defined(NVCPU_AARCH64)
1290     eflags &= PSR_I_BIT;
1291 #endif
1292     return eflags;
1293 }
1294 
os_release_spinlock(void * pSpinlock,NvU64 oldIrql)1295 void NV_API_CALL os_release_spinlock(void *pSpinlock, NvU64 oldIrql)
1296 {
1297     os_spinlock_t *os_spinlock = (os_spinlock_t *)pSpinlock;
1298     unsigned long eflags;
1299 
1300     eflags = os_spinlock->eflags;
1301     os_spinlock->eflags = 0;
1302     NV_SPIN_UNLOCK_IRQRESTORE(&os_spinlock->lock, eflags);
1303 }
1304 
1305 #define NV_KERNEL_RELEASE    ((LINUX_VERSION_CODE >> 16) & 0x0ff)
1306 #define NV_KERNEL_VERSION    ((LINUX_VERSION_CODE >> 8)  & 0x0ff)
1307 #define NV_KERNEL_SUBVERSION ((LINUX_VERSION_CODE)       & 0x0ff)
1308 
os_get_version_info(os_version_info * pOsVersionInfo)1309 NV_STATUS NV_API_CALL os_get_version_info(os_version_info * pOsVersionInfo)
1310 {
1311     NV_STATUS status      = NV_OK;
1312 
1313     pOsVersionInfo->os_major_version = NV_KERNEL_RELEASE;
1314     pOsVersionInfo->os_minor_version = NV_KERNEL_VERSION;
1315     pOsVersionInfo->os_build_number  = NV_KERNEL_SUBVERSION;
1316 
1317 #if defined(UTS_RELEASE)
1318     pOsVersionInfo->os_build_version_str = UTS_RELEASE;
1319 #endif
1320 
1321 #if defined(UTS_VERSION)
1322     pOsVersionInfo->os_build_date_plus_str = UTS_VERSION;
1323 #endif
1324 
1325     return status;
1326 }
1327 
os_is_xen_dom0(void)1328 NvBool NV_API_CALL os_is_xen_dom0(void)
1329 {
1330 #if defined(NV_DOM0_KERNEL_PRESENT)
1331     return NV_TRUE;
1332 #else
1333     return NV_FALSE;
1334 #endif
1335 }
1336 
os_is_vgx_hyper(void)1337 NvBool NV_API_CALL os_is_vgx_hyper(void)
1338 {
1339 #if defined(NV_VGX_HYPER)
1340     return NV_TRUE;
1341 #else
1342     return NV_FALSE;
1343 #endif
1344 }
1345 
os_inject_vgx_msi(NvU16 guestID,NvU64 msiAddr,NvU32 msiData)1346 NV_STATUS NV_API_CALL os_inject_vgx_msi(NvU16 guestID, NvU64 msiAddr, NvU32 msiData)
1347 {
1348 #if defined(NV_VGX_HYPER) && defined(NV_DOM0_KERNEL_PRESENT) && \
1349     defined(NV_XEN_IOEMU_INJECT_MSI)
1350     int rc = 0;
1351     rc = xen_ioemu_inject_msi(guestID, msiAddr, msiData);
1352     if (rc)
1353     {
1354         nv_printf(NV_DBG_ERRORS,
1355             "NVRM: %s: can't inject MSI to guest:%d, addr:0x%x, data:0x%x, err:%d\n",
1356             __FUNCTION__, guestID, msiAddr, msiData, rc);
1357         return NV_ERR_OPERATING_SYSTEM;
1358     }
1359     return NV_OK;
1360 #else
1361     return NV_ERR_NOT_SUPPORTED;
1362 #endif
1363 }
1364 
os_is_grid_supported(void)1365 NvBool NV_API_CALL os_is_grid_supported(void)
1366 {
1367 #if defined(NV_GRID_BUILD)
1368     return NV_TRUE;
1369 #else
1370     return NV_FALSE;
1371 #endif
1372 }
1373 
os_get_grid_csp_support(void)1374 NvU32 NV_API_CALL os_get_grid_csp_support(void)
1375 {
1376 #if defined(NV_GRID_BUILD_CSP)
1377     return NV_GRID_BUILD_CSP;
1378 #else
1379     return 0;
1380 #endif
1381 }
1382 
os_bug_check(NvU32 bugCode,const char * bugCodeStr)1383 void NV_API_CALL os_bug_check(NvU32 bugCode, const char *bugCodeStr)
1384 {
1385     panic(bugCodeStr);
1386 }
1387 
os_get_euid(NvU32 * pSecToken)1388 NV_STATUS NV_API_CALL os_get_euid(NvU32 *pSecToken)
1389 {
1390     *pSecToken = NV_CURRENT_EUID();
1391     return NV_OK;
1392 }
1393 
1394 #if defined(NVCPU_X86_64) || defined(NVCPU_AARCH64)
1395 
os_verify_checksum(const NvU8 * pMappedAddr,NvU32 length)1396 static NvBool os_verify_checksum(const NvU8 *pMappedAddr, NvU32 length)
1397 {
1398     NvU8 sum = 0;
1399     NvU32 iter = 0;
1400 
1401     for (iter = 0; iter < length; iter++)
1402         sum += pMappedAddr[iter];
1403 
1404     return sum == 0;
1405 }
1406 
1407 #define _VERIFY_SMBIOS3(_pMappedAddr)                        \
1408         _pMappedAddr &&                                      \
1409         (os_mem_cmp(_pMappedAddr, "_SM3_", 5) == 0  &&       \
1410         _pMappedAddr[6] < 32 &&                              \
1411         _pMappedAddr[6] > 0 &&                               \
1412         os_verify_checksum(_pMappedAddr, _pMappedAddr[6]))
1413 
1414 #define OS_VERIFY_SMBIOS3(pMappedAddr) _VERIFY_SMBIOS3((pMappedAddr))
1415 
1416 #define _VERIFY_SMBIOS(_pMappedAddr)                           \
1417         _pMappedAddr &&                                        \
1418         (os_mem_cmp(_pMappedAddr, "_SM_", 4) == 0  &&          \
1419         _pMappedAddr[5] < 32 &&                                \
1420         _pMappedAddr[5] > 0 &&                                 \
1421         os_verify_checksum(_pMappedAddr, _pMappedAddr[5]) &&   \
1422         os_mem_cmp((_pMappedAddr + 16), "_DMI_", 5) == 0  &&   \
1423         os_verify_checksum((_pMappedAddr + 16), 15))
1424 
1425 #define OS_VERIFY_SMBIOS(pMappedAddr) _VERIFY_SMBIOS((pMappedAddr))
1426 
1427 #define SMBIOS_LEGACY_BASE 0xF0000
1428 #define SMBIOS_LEGACY_SIZE 0x10000
1429 
os_get_smbios_header_legacy(NvU64 * pSmbsAddr)1430 static NV_STATUS os_get_smbios_header_legacy(NvU64 *pSmbsAddr)
1431 {
1432 #if !defined(NVCPU_X86_64)
1433     return NV_ERR_NOT_SUPPORTED;
1434 #else
1435     NV_STATUS status = NV_ERR_OPERATING_SYSTEM;
1436     NvU8 *pMappedAddr = NULL;
1437     NvU8 *pIterAddr = NULL;
1438 
1439     pMappedAddr = (NvU8*)os_map_kernel_space(SMBIOS_LEGACY_BASE,
1440                                              SMBIOS_LEGACY_SIZE,
1441                                              NV_MEMORY_CACHED);
1442     if (pMappedAddr == NULL)
1443     {
1444         return NV_ERR_INSUFFICIENT_RESOURCES;
1445     }
1446 
1447     pIterAddr = pMappedAddr;
1448 
1449     for (; pIterAddr < (pMappedAddr + SMBIOS_LEGACY_SIZE); pIterAddr += 16)
1450     {
1451         if (OS_VERIFY_SMBIOS3(pIterAddr))
1452         {
1453             *pSmbsAddr = SMBIOS_LEGACY_BASE + (pIterAddr - pMappedAddr);
1454             status = NV_OK;
1455             break;
1456         }
1457 
1458         if (OS_VERIFY_SMBIOS(pIterAddr))
1459         {
1460             *pSmbsAddr = SMBIOS_LEGACY_BASE + (pIterAddr - pMappedAddr);
1461             status = NV_OK;
1462             break;
1463         }
1464     }
1465 
1466     os_unmap_kernel_space(pMappedAddr, SMBIOS_LEGACY_SIZE);
1467 
1468     return status;
1469 #endif
1470 }
1471 
1472 // This function is needed only if "efi" is enabled.
1473 #if (defined(NV_LINUX_EFI_H_PRESENT) && defined(CONFIG_EFI))
os_verify_smbios_header_uefi(NvU64 smbsAddr)1474 static NV_STATUS os_verify_smbios_header_uefi(NvU64 smbsAddr)
1475 {
1476     NV_STATUS status = NV_ERR_OBJECT_NOT_FOUND;
1477     NvU64 start= 0, offset =0 , size = 32;
1478     NvU8 *pMappedAddr = NULL, *pBufAddr = NULL;
1479 
1480     start = smbsAddr;
1481     offset = (start & ~os_page_mask);
1482     start &= os_page_mask;
1483     size = ((size + offset + ~os_page_mask) & os_page_mask);
1484 
1485     pBufAddr = (NvU8*)os_map_kernel_space(start,
1486                                           size,
1487                                           NV_MEMORY_CACHED);
1488     if (pBufAddr == NULL)
1489     {
1490         return NV_ERR_INSUFFICIENT_RESOURCES;
1491     }
1492 
1493     pMappedAddr = pBufAddr + offset;
1494 
1495     if (OS_VERIFY_SMBIOS3(pMappedAddr))
1496     {
1497         status = NV_OK;
1498         goto done;
1499     }
1500 
1501     if (OS_VERIFY_SMBIOS(pMappedAddr))
1502     {
1503         status = NV_OK;
1504     }
1505 
1506 done:
1507     os_unmap_kernel_space(pBufAddr, size);
1508     return status;
1509 }
1510 #endif
1511 
os_get_smbios_header_uefi(NvU64 * pSmbsAddr)1512 static NV_STATUS os_get_smbios_header_uefi(NvU64 *pSmbsAddr)
1513 {
1514     NV_STATUS status = NV_ERR_OPERATING_SYSTEM;
1515 
1516 // Make sure that efi.h is present before using "struct efi".
1517 #if (defined(NV_LINUX_EFI_H_PRESENT) && defined(CONFIG_EFI))
1518 
1519 // Make sure that efi.h has SMBIOS3_TABLE_GUID present.
1520 #if defined(SMBIOS3_TABLE_GUID)
1521     if (efi.smbios3 != EFI_INVALID_TABLE_ADDR)
1522     {
1523         status = os_verify_smbios_header_uefi(efi.smbios3);
1524         if (status == NV_OK)
1525         {
1526             *pSmbsAddr = efi.smbios3;
1527             return NV_OK;
1528         }
1529     }
1530 #endif
1531 
1532     if (efi.smbios != EFI_INVALID_TABLE_ADDR)
1533     {
1534         status = os_verify_smbios_header_uefi(efi.smbios);
1535         if (status == NV_OK)
1536         {
1537             *pSmbsAddr = efi.smbios;
1538             return NV_OK;
1539         }
1540     }
1541 #endif
1542 
1543     return status;
1544 }
1545 
1546 #endif // defined(NVCPU_X86_64) || defined(NVCPU_AARCH64)
1547 
1548 // The function locates the SMBIOS entry point.
os_get_smbios_header(NvU64 * pSmbsAddr)1549 NV_STATUS NV_API_CALL os_get_smbios_header(NvU64 *pSmbsAddr)
1550 {
1551 
1552 #if !defined(NVCPU_X86_64) && !defined(NVCPU_AARCH64)
1553     return NV_ERR_NOT_SUPPORTED;
1554 #else
1555     NV_STATUS status = NV_OK;
1556 
1557     if (os_is_efi_enabled())
1558     {
1559         status = os_get_smbios_header_uefi(pSmbsAddr);
1560     }
1561     else
1562     {
1563         status = os_get_smbios_header_legacy(pSmbsAddr);
1564     }
1565 
1566     return status;
1567 #endif
1568 }
1569 
os_get_acpi_rsdp_from_uefi(NvU32 * pRsdpAddr)1570 NV_STATUS NV_API_CALL os_get_acpi_rsdp_from_uefi
1571 (
1572     NvU32  *pRsdpAddr
1573 )
1574 {
1575     NV_STATUS status = NV_ERR_NOT_SUPPORTED;
1576 
1577     if (pRsdpAddr == NULL)
1578     {
1579         return NV_ERR_INVALID_STATE;
1580     }
1581 
1582     *pRsdpAddr = 0;
1583 
1584 // Make sure that efi.h is present before using "struct efi".
1585 #if (defined(NV_LINUX_EFI_H_PRESENT) && defined(CONFIG_EFI))
1586 
1587     if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
1588     {
1589         *pRsdpAddr = efi.acpi20;
1590         status = NV_OK;
1591     }
1592     else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
1593     {
1594         *pRsdpAddr = efi.acpi;
1595         status = NV_OK;
1596     }
1597     else
1598     {
1599         nv_printf(NV_DBG_ERRORS, "NVRM: RSDP Not found!\n");
1600         status = NV_ERR_OPERATING_SYSTEM;
1601     }
1602 #endif
1603 
1604     return status;
1605 }
1606 
os_add_record_for_crashLog(void * pbuffer,NvU32 size)1607 void NV_API_CALL os_add_record_for_crashLog(void *pbuffer, NvU32 size)
1608 {
1609 }
1610 
os_delete_record_for_crashLog(void * pbuffer)1611 void NV_API_CALL os_delete_record_for_crashLog(void *pbuffer)
1612 {
1613 }
1614 
1615 #if !defined(NV_VGPU_KVM_BUILD)
os_call_vgpu_vfio(void * pvgpu_vfio_info,NvU32 cmd_type)1616 NV_STATUS NV_API_CALL os_call_vgpu_vfio(void *pvgpu_vfio_info, NvU32 cmd_type)
1617 {
1618     return NV_ERR_NOT_SUPPORTED;
1619 }
1620 #endif
1621 
os_alloc_pages_node(NvS32 nid,NvU32 size,NvU32 flag,NvU64 * pAddress)1622 NV_STATUS NV_API_CALL os_alloc_pages_node
1623 (
1624     NvS32  nid,
1625     NvU32  size,
1626     NvU32  flag,
1627     NvU64 *pAddress
1628 )
1629 {
1630     NV_STATUS status = NV_ERR_NOT_SUPPORTED;
1631 
1632 #if defined(__GFP_THISNODE) && defined(GFP_HIGHUSER_MOVABLE) && \
1633     defined(__GFP_COMP) && defined(__GFP_NORETRY) && defined(__GFP_NOWARN)
1634     gfp_t gfp_mask;
1635     struct page *alloc_addr;
1636     unsigned int order = get_order(size);
1637 
1638     /*
1639      * Explanation of flags used:
1640      *
1641      * 1. __GFP_THISNODE:           This will make sure the allocation happens
1642      *                              on the node specified by nid.
1643      *
1644      * 2. GFP_HIGHUSER_MOVABLE:     This makes allocations from ZONE_MOVABLE.
1645      *
1646      * 3. __GFP_COMP:               This will make allocations with compound
1647      *                              pages, which is needed in order to use
1648      *                              vm_insert_page API.
1649      *
1650      * 4. __GFP_NORETRY:            Used to avoid the Linux kernel OOM killer.
1651      *
1652      * 5. __GFP_NOWARN:             Used to avoid a WARN_ON in the slowpath if
1653      *                              the requested order is too large (just fail
1654      *                              instead).
1655      *
1656      * 6. (Optional) __GFP_RECLAIM: Used to allow/forbid reclaim.
1657      *                              This is part of GFP_USER and consequently
1658      *                              GFP_HIGHUSER_MOVABLE.
1659      *
1660      * Some of these flags are relatively more recent, with the last of them
1661      * (GFP_HIGHUSER_MOVABLE) having been added with this Linux kernel commit:
1662      *
1663      * 2007-07-17 769848c03895b63e5662eb7e4ec8c4866f7d0183
1664      *
1665      * Assume that this feature will only be used on kernels that support all
1666      * of the needed GFP flags.
1667      */
1668 
1669     gfp_mask = __GFP_THISNODE | GFP_HIGHUSER_MOVABLE | __GFP_COMP |
1670                __GFP_NORETRY | __GFP_NOWARN;
1671 
1672 #if defined(__GFP_RECLAIM)
1673     if (flag & NV_ALLOC_PAGES_NODE_SKIP_RECLAIM)
1674     {
1675         gfp_mask &= ~(__GFP_RECLAIM);
1676     }
1677 #endif // defined(__GFP_RECLAIM)
1678 
1679     alloc_addr = alloc_pages_node(nid, gfp_mask, order);
1680     if (alloc_addr == NULL)
1681     {
1682         nv_printf(NV_DBG_INFO,
1683             "NVRM: alloc_pages_node(node = %d, order = %u) failed\n",
1684             nid, order);
1685         status = NV_ERR_NO_MEMORY;
1686     }
1687     else if (page_to_nid(alloc_addr) != nid)
1688     {
1689         //
1690         // We can hit this case when a Linux kernel bug is not patched.
1691         // The needed patch is https://patchwork.kernel.org/patch/10427387/
1692         //
1693         nv_printf(NV_DBG_ERRORS,
1694             "NVRM: alloc_pages_node(node = %d, order = %u) wrong node ID.\n",
1695             nid, order);
1696         __free_pages(alloc_addr, order);
1697         status = NV_ERR_NO_MEMORY;
1698     }
1699     else
1700     {
1701         *pAddress = (NvU64)page_to_phys(alloc_addr);
1702         status = NV_OK;
1703     }
1704 #endif // GFP flags
1705 
1706     return status;
1707 }
1708 
os_get_page(NvU64 address)1709 NV_STATUS NV_API_CALL os_get_page
1710 (
1711     NvU64 address
1712 )
1713 {
1714     get_page(NV_GET_PAGE_STRUCT(address));
1715     return NV_OK;
1716 }
1717 
os_put_page(NvU64 address)1718 NV_STATUS NV_API_CALL os_put_page
1719 (
1720     NvU64 address
1721 )
1722 {
1723     put_page(NV_GET_PAGE_STRUCT(address));
1724     return NV_OK;
1725 }
1726 
os_get_page_refcount(NvU64 address)1727 NvU32 NV_API_CALL os_get_page_refcount
1728 (
1729     NvU64 address
1730 )
1731 {
1732     return NV_PAGE_COUNT(NV_GET_PAGE_STRUCT(address));
1733 }
1734 
os_count_tail_pages(NvU64 address)1735 NvU32 NV_API_CALL os_count_tail_pages
1736 (
1737     NvU64 address
1738 )
1739 {
1740     NvU32 order = compound_order(compound_head(NV_GET_PAGE_STRUCT(address)));
1741 
1742     return 1 << order;
1743 }
1744 
os_free_pages_phys(NvU64 address,NvU32 size)1745 void NV_API_CALL os_free_pages_phys
1746 (
1747     NvU64 address,
1748     NvU32 size
1749 )
1750 {
1751     __free_pages(NV_GET_PAGE_STRUCT(address), get_order(size));
1752 }
1753 
os_numa_memblock_size(NvU64 * memblock_size)1754 NV_STATUS NV_API_CALL os_numa_memblock_size
1755 (
1756     NvU64 *memblock_size
1757 )
1758 {
1759 #if NV_IS_EXPORT_SYMBOL_PRESENT_memory_block_size_bytes
1760     *memblock_size = memory_block_size_bytes();
1761     return NV_OK;
1762 #endif
1763     if (nv_ctl_device.numa_memblock_size == 0)
1764         return NV_ERR_INVALID_STATE;
1765     *memblock_size = nv_ctl_device.numa_memblock_size;
1766     return NV_OK;
1767 }
1768 
os_open_temporary_file(void ** ppFile)1769 NV_STATUS NV_API_CALL os_open_temporary_file
1770 (
1771     void **ppFile
1772 )
1773 {
1774 #if NV_FILESYSTEM_ACCESS_AVAILABLE
1775 #if defined(O_TMPFILE)
1776     struct file *file;
1777     const char *default_path = "/tmp";
1778     const int flags = O_TMPFILE | O_LARGEFILE | O_RDWR;
1779     const char *path = NVreg_TemporaryFilePath;
1780 
1781     /*
1782      * The filp_open() call below depends on the current task's fs_struct
1783      * (current->fs), which may already be NULL if this is called during
1784      * process teardown.
1785      */
1786     if (current->fs == NULL)
1787     {
1788         return NV_ERR_OPERATING_SYSTEM;
1789     }
1790 
1791     if (!path)
1792     {
1793         path = default_path;
1794     }
1795 
1796     file = filp_open(path, flags, 0);
1797     if (IS_ERR(file))
1798     {
1799         if ((path != default_path) && (PTR_ERR(file) == -ENOENT))
1800         {
1801             nv_printf(NV_DBG_ERRORS,
1802                       "NVRM: The temporary file path specified via the NVreg_TemporaryFilePath\n"
1803                       "NVRM: module parameter does not exist. Defaulting to /tmp.\n");
1804 
1805             file = filp_open(default_path, flags, 0);
1806         }
1807     }
1808 
1809     if (IS_ERR(file))
1810     {
1811         return NV_ERR_OPERATING_SYSTEM;
1812     }
1813 
1814     *ppFile = (void *)file;
1815 
1816     return NV_OK;
1817 #else
1818     return NV_ERR_NOT_SUPPORTED;
1819 #endif
1820 #else
1821     return NV_ERR_NOT_SUPPORTED;
1822 #endif
1823 }
1824 
os_close_file(void * pFile)1825 void NV_API_CALL os_close_file
1826 (
1827     void *pFile
1828 )
1829 {
1830 #if NV_FILESYSTEM_ACCESS_AVAILABLE
1831     filp_close(pFile, NULL);
1832 #endif
1833 }
1834 
1835 #define NV_MAX_NUM_FILE_IO_RETRIES 10
1836 
NV_STATUS NV_API_CALL os_write_file
(
    void *pFile,
    NvU8 *pBuffer,
    NvU64 size,
    NvU64 offset
)
{
#if NV_FILESYSTEM_ACCESS_AVAILABLE
    loff_t f_pos = offset;
    ssize_t num_written;
    int num_retries = NV_MAX_NUM_FILE_IO_RETRIES;

retry:
#if defined(NV_KERNEL_WRITE_HAS_POINTER_POS_ARG)
    num_written = kernel_write(pFile, pBuffer, size, &f_pos);
#else
    num_written = kernel_write(pFile, pBuffer, size, f_pos);
#endif
    if (num_written < 0)
    {
        return NV_ERR_OPERATING_SYSTEM;
    }
    else if (num_written < size)
    {
        if (num_written > 0)
        {
            pBuffer += num_written;
            size -= num_written;
        }
        if (--num_retries > 0)
        {
            cond_resched();
            goto retry;
        }
        return NV_ERR_OPERATING_SYSTEM;
    }

    return NV_OK;
#else
    return NV_ERR_NOT_SUPPORTED;
#endif
}

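//
// os_read_file - Read from the given file at the given offset into a buffer.
//                Short reads are retried (up to NV_MAX_NUM_FILE_IO_RETRIES
//                times) with the buffer and size advanced past the bytes
//                already read.
//
//  pFile   - opaque file handle
//  pBuffer - buffer to read into
//  size    - number of bytes to read
//  offset  - byte offset within the file
//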
NV_STATUS NV_API_CALL os_read_file
(
    void *pFile,
    NvU8 *pBuffer,
    NvU64 size,
    NvU64 offset
)
{
#if NV_FILESYSTEM_ACCESS_AVAILABLE
    loff_t f_pos = offset;
    ssize_t num_read;
    int num_retries = NV_MAX_NUM_FILE_IO_RETRIES;

retry:
#if defined(NV_KERNEL_READ_HAS_POINTER_POS_ARG)
    num_read = kernel_read(pFile, pBuffer, size, &f_pos);
#else
    num_read = kernel_read(pFile, f_pos, pBuffer, size);
#endif
    if (num_read < 0)
    {
        return NV_ERR_OPERATING_SYSTEM;
    }
    else if (num_read < size)
    {
        if (num_read > 0)
        {
            pBuffer += num_read;
            size -= num_read;
        }
        if (--num_retries > 0)
        {
            cond_resched();
            goto retry;
        }
        return NV_ERR_OPERATING_SYSTEM;
    }

    return NV_OK;
#else
    return NV_ERR_NOT_SUPPORTED;
#endif
}

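//
// os_open_readonly_file - Open an existing file read-only.
//
//  filename - path of the file to open
//  ppFile   - filled in with an opaque file handle on success
//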
NV_STATUS NV_API_CALL os_open_readonly_file
(
    const char  *filename,
    void       **ppFile
)
{
#if NV_FILESYSTEM_ACCESS_AVAILABLE
    struct file *file;

    /*
     * The filp_open() call below depends on the current task's fs_struct
     * (current->fs), which may already be NULL if this is called during
     * process teardown.
     */
    if (current->fs == NULL)
    {
        return NV_ERR_OPERATING_SYSTEM;
    }

    file = filp_open(filename, O_RDONLY, 0);
    if (IS_ERR(file))
    {
        return NV_ERR_OPERATING_SYSTEM;
    }

    *ppFile = (void *)file;

    return NV_OK;
#else
    return NV_ERR_NOT_SUPPORTED;
#endif
}

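//
// os_open_and_read_file - Convenience wrapper that opens a file read-only,
//                         reads up to 'count' bytes from offset 0, and
//                         closes the file.
//
//  filename - path of the file to read
//  buf      - buffer to read into
//  count    - number of bytes to read
//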
NV_STATUS NV_API_CALL os_open_and_read_file
(
    const char *filename,
    NvU8       *buf,
    NvU64       count
)
{
    void *fileHandle;
    NV_STATUS status;

    status = os_open_readonly_file(filename, &fileHandle);
    if (status != NV_OK)
    {
        return status;
    }

    status = os_read_file(fileHandle, buf, count, 0);

    os_close_file(fileHandle);

    return status;
}

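//
// os_is_nvswitch_present - Return NV_TRUE if any NVIDIA device with the
//                          "other bridge" PCI class (as used by NVSwitch)
//                          is present in the system.
//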
NvBool NV_API_CALL os_is_nvswitch_present(void)
{
    struct pci_device_id nvswitch_pci_table[] = {
        {
            PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID),
            .class      = PCI_CLASS_BRIDGE_OTHER << 8,
            .class_mask = PCI_ANY_ID
        },
        {0}
    };

    return !!pci_dev_present(nvswitch_pci_table);
}

/*
 * This function may sleep (interruptible).
 */
NV_STATUS NV_API_CALL os_get_random_bytes
(
    NvU8 *bytes,
    NvU16 numBytes
)
{
#if defined(NV_WAIT_FOR_RANDOM_BYTES_PRESENT)
    if (wait_for_random_bytes() < 0)
        return NV_ERR_NOT_READY;
#endif

    get_random_bytes(bytes, numBytes);
    return NV_OK;
}

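//
// os_alloc_wait_queue - Allocate a wait queue backed by a completion.
//                       Waiters block in os_wait_(un)interruptible() until
//                       os_wake_up() completes the queue; free the queue
//                       with os_free_wait_queue().
//
//  wq - filled in with pointer to the allocated wait queue
//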
NV_STATUS NV_API_CALL os_alloc_wait_queue
(
    os_wait_queue **wq
)
{
    NV_KMALLOC(*wq, sizeof(os_wait_queue));
    if (*wq == NULL)
        return NV_ERR_NO_MEMORY;

    init_completion(&(*wq)->q);

    return NV_OK;
}

void NV_API_CALL os_free_wait_queue
(
    os_wait_queue *wq
)
{
    NV_KFREE(wq, sizeof(os_wait_queue));
}

void NV_API_CALL os_wait_uninterruptible
(
    os_wait_queue *wq
)
{
    wait_for_completion(&wq->q);
}

void NV_API_CALL os_wait_interruptible
(
    os_wait_queue *wq
)
{
    wait_for_completion_interruptible(&wq->q);
}

void NV_API_CALL os_wake_up
(
    os_wait_queue *wq
)
{
    complete_all(&wq->q);
}

nv_cap_t* NV_API_CALL os_nv_cap_init
(
    const char *path
)
{
    return nv_cap_init(path);
}

nv_cap_t* NV_API_CALL os_nv_cap_create_dir_entry
(
    nv_cap_t *parent_cap,
    const char *name,
    int mode
)
{
    return nv_cap_create_dir_entry(parent_cap, name, mode);
}

nv_cap_t* NV_API_CALL os_nv_cap_create_file_entry
(
    nv_cap_t *parent_cap,
    const char *name,
    int mode
)
{
    return nv_cap_create_file_entry(parent_cap, name, mode);
}

void NV_API_CALL os_nv_cap_destroy_entry
(
    nv_cap_t *cap
)
{
    nv_cap_destroy_entry(cap);
}

int NV_API_CALL os_nv_cap_validate_and_dup_fd
(
    const nv_cap_t *cap,
    int fd
)
{
    return nv_cap_validate_and_dup_fd(cap, fd);
}

void NV_API_CALL os_nv_cap_close_fd
(
    int fd
)
{
    nv_cap_close_fd(fd);
}

NvS32 NV_API_CALL os_imex_channel_count
(
    void
)
{
    return nv_caps_imex_channel_count();
}

NvS32 NV_API_CALL os_imex_channel_get
(
    NvU64 descriptor
)
{
    return nv_caps_imex_channel_get((int)descriptor);
}

/*
 * Reads the total memory and free memory of a NUMA node from the kernel.
 */
NV_STATUS NV_API_CALL os_get_numa_node_memory_usage
(
    NvS32 node_id,
    NvU64 *free_memory_bytes,
    NvU64 *total_memory_bytes
)
{
    struct pglist_data *pgdat;
    struct zone *zone;
    NvU32 zone_id;

    if (node_id >= MAX_NUMNODES)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: Invalid NUMA node ID\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    pgdat = NODE_DATA(node_id);

    *free_memory_bytes = 0;
    *total_memory_bytes = 0;

    for (zone_id = 0; zone_id < MAX_NR_ZONES; zone_id++)
    {
        zone = &(pgdat->node_zones[zone_id]);
        if (!populated_zone(zone))
            continue;
        *free_memory_bytes += (zone_page_state_snapshot(zone, NR_FREE_PAGES) * PAGE_SIZE);
        *total_memory_bytes += (zone->present_pages * PAGE_SIZE);
    }

    return NV_OK;
}

typedef struct os_numa_gpu_mem_hotplug_notifier_s
{
    NvU64 start_pa;
    NvU64 size;
    nv_pci_info_t pci_info;
    struct notifier_block memory_notifier;
} os_numa_gpu_mem_hotplug_notifier_t;

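//
// os_numa_verify_gpu_memory_zone - Memory hotplug notifier callback that
//     rejects (NOTIFY_BAD) the onlining of any block within the registered
//     GPU memory range unless it is being onlined into ZONE_MOVABLE.
//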
static int os_numa_verify_gpu_memory_zone(struct notifier_block *nb,
                                          unsigned long action, void *data)
{
    os_numa_gpu_mem_hotplug_notifier_t *notifier = container_of(nb,
        os_numa_gpu_mem_hotplug_notifier_t,
        memory_notifier);
    struct memory_notify *mhp = data;
    NvU64 start_pa = PFN_PHYS(mhp->start_pfn);
    NvU64 size = PFN_PHYS(mhp->nr_pages);

    if (action == MEM_GOING_ONLINE)
    {
        // Check if the memory being onlined falls within the GPU memory range
        if ((start_pa >= notifier->start_pa) &&
            (start_pa + size) <= (notifier->start_pa + notifier->size))
        {
            /*
             * Verify that the GPU memory NUMA node has memory only in
             * ZONE_MOVABLE before onlining it, so that an incorrect
             * auto-online setting doesn't place the memory in a zone where
             * kernel allocations could happen. That would make the GPU
             * memory impossible to hot-unplug without a system reboot.
             */
            if (page_zonenum((pfn_to_page(mhp->start_pfn))) != ZONE_MOVABLE)
            {
                nv_printf(NV_DBG_ERRORS, "NVRM: Failing GPU memory onlining as the onlining zone "
                          "is not movable. pa: 0x%llx size: 0x%llx\n"
                          "NVRM: The NVIDIA GPU %04x:%02x:%02x.%x installed in the system\n"
                          "NVRM: requires auto onlining mode online_movable enabled in\n"
                          "NVRM: /sys/devices/system/memory/auto_online_blocks\n",
                          start_pa, size, notifier->pci_info.domain, notifier->pci_info.bus,
                          notifier->pci_info.slot, notifier->pci_info.function);
                return NOTIFY_BAD;
            }
        }
    }
    return NOTIFY_OK;
}

#define ADD_REMOVE_GPU_MEMORY_NUM_SEGMENTS 4

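//
// os_numa_add_gpu_memory - Hotplug coherent GPU memory onto its NUMA node
//     with add_memory_driver_managed(), in memblock-aligned segments so the
//     CPU can be yielded between segments. The registered memory notifier
//     enforces that the memory is onlined into ZONE_MOVABLE, and the zone
//     span is checked afterwards to confirm the whole range was onlined.
//
//     Illustrative segment sizing (values assumed for the example only):
//     with size = 64 GB, ADD_REMOVE_GPU_MEMORY_NUM_SEGMENTS = 4 and a 1 GB
//     memblock size, each segment is NV_ALIGN_UP(16 GB, 1 GB) = 16 GB.
//
//  handle - PCI device handle for the GPU
//  offset - offset into the coherent GPU memory aperture
//  size   - size of the memory to add, in bytes
//  nodeId - filled in with the NUMA node the memory was added to
//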
NV_STATUS NV_API_CALL os_numa_add_gpu_memory
(
    void *handle,
    NvU64 offset,
    NvU64 size,
    NvU32 *nodeId
)
{
#if defined(NV_ADD_MEMORY_DRIVER_MANAGED_PRESENT)
    int node = 0;
    nv_linux_state_t *nvl = pci_get_drvdata(handle);
    nv_state_t *nv = NV_STATE_PTR(nvl);
    NvU64 base = offset + nvl->coherent_link_info.gpu_mem_pa;
    int ret = 0;
    NvU64 memblock_size;
    NvU64 size_remaining;
    NvU64 calculated_segment_size;
    NvU64 segment_size;
    NvU64 segment_base;
    os_numa_gpu_mem_hotplug_notifier_t notifier =
    {
        .start_pa = base,
        .size = size,
        .pci_info = nv->pci_info,
        .memory_notifier.notifier_call = os_numa_verify_gpu_memory_zone,
    };

    if (nodeId == NULL)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    if (bitmap_empty(nvl->coherent_link_info.free_node_bitmap, MAX_NUMNODES))
    {
        return NV_ERR_IN_USE;
    }
    node = find_first_bit(nvl->coherent_link_info.free_node_bitmap, MAX_NUMNODES);
    if (node == MAX_NUMNODES)
    {
        return NV_ERR_INVALID_STATE;
    }

    NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_ONLINE_IN_PROGRESS);

    ret = register_memory_notifier(&notifier.memory_notifier);
    if (ret)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: Memory hotplug notifier registration failed\n");
        goto failed;
    }

    //
    // Adding all memory at once can take a long time. Split up memory into segments
    // with schedule() in between to prevent soft lockups. Memory segments for
    // add_memory_driver_managed() need to be aligned to memblock size.
    //
    // If there are any issues splitting into segments, then add all memory at once.
    //
    if (os_numa_memblock_size(&memblock_size) == NV_OK)
    {
        calculated_segment_size = NV_ALIGN_UP(size / ADD_REMOVE_GPU_MEMORY_NUM_SEGMENTS, memblock_size);
    }
    else
    {
        // Don't split into segments, add all memory at once
        calculated_segment_size = size;
    }

    segment_size = calculated_segment_size;
    segment_base = base;
    size_remaining = size;

    while ((size_remaining > 0) &&
           (ret == 0))
    {
        if (segment_size > size_remaining)
        {
            segment_size = size_remaining;
        }

#ifdef NV_ADD_MEMORY_DRIVER_MANAGED_HAS_MHP_FLAGS_ARG
        ret = add_memory_driver_managed(node, segment_base, segment_size, "System RAM (NVIDIA)", MHP_NONE);
#else
        ret = add_memory_driver_managed(node, segment_base, segment_size, "System RAM (NVIDIA)");
#endif
        nv_printf(NV_DBG_SETUP, "NVRM: add_memory_driver_managed() returns: %d for segment_base: 0x%llx, segment_size: 0x%llx\n",
                  ret, segment_base, segment_size);

        segment_base += segment_size;
        size_remaining -= segment_size;

        // Yield CPU to prevent soft lockups
        schedule();
    }
    unregister_memory_notifier(&notifier.memory_notifier);

    if (ret == 0)
    {
        struct zone *zone = &NODE_DATA(node)->node_zones[ZONE_MOVABLE];
        NvU64 start_pfn = base >> PAGE_SHIFT;
        NvU64 end_pfn = (base + size) >> PAGE_SHIFT;

        /* Verify the full GPU memory range passed in is onlined */
        if (zone->zone_start_pfn != start_pfn ||
            zone_end_pfn(zone) != end_pfn)
        {
            nv_printf(NV_DBG_ERRORS, "NVRM: GPU memory zone movable auto onlining failed!\n");

#ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT
            // Since zone movable auto onlining failed, the added memory needs to be removed.
            segment_size = calculated_segment_size;
            segment_base = base;
            size_remaining = size;

            while (size_remaining > 0)
            {
                if (segment_size > size_remaining)
                {
                    segment_size = size_remaining;
                }

#ifdef NV_REMOVE_MEMORY_HAS_NID_ARG
                ret = offline_and_remove_memory(node, segment_base, segment_size);
#else
                ret = offline_and_remove_memory(segment_base, segment_size);
#endif
                nv_printf(NV_DBG_SETUP, "NVRM: offline_and_remove_memory() returns: %d for segment_base: 0x%llx, segment_size: 0x%llx\n",
                          ret, segment_base, segment_size);

                segment_base += segment_size;
                size_remaining -= segment_size;

                // Yield CPU to prevent soft lockups
                schedule();
            }
#endif
            goto failed;
        }

        /*
         * On systems with the cpuset cgroup controller enabled, memory
         * allocation on this just-hotplugged GPU memory node can fail if
         * cpuset_hotplug_work has not been scheduled yet. cpuset_hotplug_work
         * is where current->mems_allowed is updated, in the path
         * cpuset_hotplug_workfn->update_tasks_nodemask. When cpuset is
         * enabled and current->mems_allowed has not been updated, memory
         * allocation with __GFP_THISNODE and this node id fails. The
         * cpuset_wait_for_hotplug kernel function could be used to wait for
         * the work to finish, but it is not exported. As a workaround, poll
         * for a bounded time until current->mems_allowed is updated, while
         * an upstream kernel fix is being explored. Bug 4385903
         */
        if (!node_isset(node, cpuset_current_mems_allowed))
        {
            unsigned long delay;

            delay = jiffies + (HZ / 10); // 100ms
            while (time_before(jiffies, delay) &&
                   !node_isset(node, cpuset_current_mems_allowed))
            {
                os_schedule();
            }

            if (!node_isset(node, cpuset_current_mems_allowed))
            {
                nv_printf(NV_DBG_ERRORS, "NVRM: Hotplugged GPU memory NUMA node: %d "
                          "not set in current->mems_allowed!\n", node);
            }
        }

        *nodeId = node;
        clear_bit(node, nvl->coherent_link_info.free_node_bitmap);
        NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_ONLINE);
        return NV_OK;
    }
    nv_printf(NV_DBG_ERRORS, "NVRM: Memory add failed. base: 0x%llx size: 0x%llx ret: %d\n",
              base, size, ret);
failed:
    NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_ONLINE_FAILED);
    return NV_ERR_OPERATING_SYSTEM;
#endif
    return NV_ERR_NOT_SUPPORTED;
}


typedef struct {
    NvU64 base;
    NvU64 size;
    NvU32 nodeId;
    int ret;
} remove_numa_memory_info_t;

static void offline_numa_memory_callback
(
    void *args
)
{
#ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT
    remove_numa_memory_info_t *pNumaInfo = (remove_numa_memory_info_t *)args;
    int ret = 0;
    NvU64 memblock_size;
    NvU64 size_remaining;
    NvU64 calculated_segment_size;
    NvU64 segment_size;
    NvU64 segment_base;

    //
    // Removing all memory at once can take a long time. Split up memory into segments
    // with schedule() in between to prevent soft lockups. Memory segments for
    // offline_and_remove_memory() need to be aligned to memblock size.
    //
    // If there are any issues splitting into segments, then remove all memory at once.
    //
    if (os_numa_memblock_size(&memblock_size) == NV_OK)
    {
        calculated_segment_size = NV_ALIGN_UP(pNumaInfo->size / ADD_REMOVE_GPU_MEMORY_NUM_SEGMENTS, memblock_size);
    }
    else
    {
        // Don't split into segments, remove all memory at once
        calculated_segment_size = pNumaInfo->size;
    }

    segment_size = calculated_segment_size;
    segment_base = pNumaInfo->base;
    size_remaining = pNumaInfo->size;

    while (size_remaining > 0)
    {
        if (segment_size > size_remaining)
        {
            segment_size = size_remaining;
        }

#ifdef NV_REMOVE_MEMORY_HAS_NID_ARG
        ret = offline_and_remove_memory(pNumaInfo->nodeId,
                                        segment_base,
                                        segment_size);
#else
        ret = offline_and_remove_memory(segment_base,
                                        segment_size);
#endif
        nv_printf(NV_DBG_SETUP, "NVRM: offline_and_remove_memory() returns: %d for segment_base: 0x%llx, segment_size: 0x%llx\n",
                  ret, segment_base, segment_size);
        pNumaInfo->ret |= ret;

        segment_base += segment_size;
        size_remaining -= segment_size;

        // Yield CPU to prevent soft lockups
        schedule();
    }
#endif
}

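//
// os_numa_remove_gpu_memory - Offline and remove previously hotplugged GPU
//     memory from the given NUMA node. The work is run on the per-GPU
//     remove_numa_memory_q kthread queue, which removes the memory in
//     memblock-aligned segments (see offline_numa_memory_callback above).
//
//  handle - PCI device handle for the GPU
//  offset - offset into the coherent GPU memory aperture
//  size   - size of the memory to remove, in bytes
//  nodeId - NUMA node the memory was added to
//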
NV_STATUS NV_API_CALL os_numa_remove_gpu_memory
(
    void *handle,
    NvU64 offset,
    NvU64 size,
    NvU32 nodeId
)
{
#ifdef NV_ADD_MEMORY_DRIVER_MANAGED_PRESENT
    nv_linux_state_t *nvl = pci_get_drvdata(handle);
#ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT
    NvU64 base = offset + nvl->coherent_link_info.gpu_mem_pa;
    remove_numa_memory_info_t numa_info;
    nv_kthread_q_item_t remove_numa_memory_q_item;
    int ret;
#endif

    if (nodeId >= MAX_NUMNODES)
    {
        return NV_ERR_INVALID_ARGUMENT;
    }
    if ((nodeId == NUMA_NO_NODE) || test_bit(nodeId, nvl->coherent_link_info.free_node_bitmap))
    {
        return NV_ERR_INVALID_ARGUMENT;
    }

    NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_OFFLINE_IN_PROGRESS);

#ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT
    numa_info.base   = base;
    numa_info.size   = size;
    numa_info.nodeId = nodeId;
    numa_info.ret    = 0;

    nv_kthread_q_item_init(&remove_numa_memory_q_item,
                           offline_numa_memory_callback,
                           &numa_info);
    nv_kthread_q_schedule_q_item(&nvl->remove_numa_memory_q,
                                 &remove_numa_memory_q_item);
    nv_kthread_q_flush(&nvl->remove_numa_memory_q);

    ret = numa_info.ret;

    if (ret == 0)
    {
        set_bit(nodeId, nvl->coherent_link_info.free_node_bitmap);

        NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_OFFLINE);
        return NV_OK;
    }

    nv_printf(NV_DBG_ERRORS, "NVRM: Memory remove failed. base: 0x%llx size: 0x%llx ret: %d\n",
              base, size, ret);
#endif
    NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_OFFLINE_FAILED);
    return NV_ERR_OPERATING_SYSTEM;
#endif
    return NV_ERR_NOT_SUPPORTED;
}

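//
// os_offline_page_at_address - Offline a single page by reporting it to the
//     kernel's memory_failure() machinery (marked MF_SW_SIMULATED where
//     supported, to distinguish it from a hardware memory failure).
//
//  address - address of the page to offline (translated to a PFN via its
//            struct page)
//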
NV_STATUS NV_API_CALL os_offline_page_at_address
(
    NvU64 address
)
{
#if defined(CONFIG_MEMORY_FAILURE)
    int flags = 0;
    int ret;
    NvU64 pfn;
    struct page *page = NV_GET_PAGE_STRUCT(address);

    if (page == NULL)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: Failed to get page struct for address: 0x%llx\n",
                  address);
        return NV_ERR_INVALID_ARGUMENT;
    }

    pfn = page_to_pfn(page);

#ifdef NV_MEMORY_FAILURE_MF_SW_SIMULATED_DEFINED
    //
    // Set MF_SW_SIMULATED flag so Linux kernel can differentiate this from a HW
    // memory failure. HW memory failures cannot be unset via unpoison_memory() API.
    //
    // Currently, RM does not use unpoison_memory(), so it makes no difference
    // whether or not MF_SW_SIMULATED is set. Regardless, it is semantically more
    // correct to set MF_SW_SIMULATED.
    //
    flags |= MF_SW_SIMULATED;
#endif

#ifdef NV_MEMORY_FAILURE_HAS_TRAPNO_ARG
    ret = memory_failure(pfn, 0, flags);
#else
    ret = memory_failure(pfn, flags);
#endif

    if (ret != 0)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: page offlining failed. address: 0x%llx pfn: 0x%llx ret: %d\n",
                  address, pfn, ret);
        return NV_ERR_OPERATING_SYSTEM;
    }

    return NV_OK;
#else // !defined(CONFIG_MEMORY_FAILURE)
    nv_printf(NV_DBG_ERRORS, "NVRM: memory_failure() not supported by kernel. page offlining failed. address: 0x%llx\n",
              address);
    return NV_ERR_NOT_SUPPORTED;
#endif
}

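//
// os_get_pid_info - Take a reference on the current task's struct pid and
//                   return it as an opaque pointer. Release it with
//                   os_put_pid_info(); translate it to a PID in the current
//                   namespace with os_find_ns_pid().
//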
void* NV_API_CALL os_get_pid_info(void)
{
    return get_task_pid(current, PIDTYPE_PID);
}

void NV_API_CALL os_put_pid_info(void *pid_info)
{
    if (pid_info != NULL)
        put_pid(pid_info);
}

NV_STATUS NV_API_CALL os_find_ns_pid(void *pid_info, NvU32 *ns_pid)
{
    if ((pid_info == NULL) || (ns_pid == NULL))
        return NV_ERR_INVALID_ARGUMENT;

    *ns_pid = pid_vnr((struct pid *)pid_info);

    // The call returns 0 if the PID is not found in the current ns
    if (*ns_pid == 0)
        return NV_ERR_OBJECT_NOT_FOUND;

    return NV_OK;
}

2607