1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #define  __NO_VERSION__
25 
26 #include "os-interface.h"
27 #include "nv-linux.h"
28 #include "nv-caps-imex.h"
29 
30 #include "nv-time.h"
31 
32 #include <linux/mmzone.h>
33 #include <linux/numa.h>
34 #include <linux/cpuset.h>
35 
36 #include <linux/pid.h>
37 #if defined(CONFIG_LOCKDEP)
38 #include <linux/lockdep.h>
39 #endif // CONFIG_LOCKDEP
40 
41 extern char *NVreg_TemporaryFilePath;
42 
43 #define MAX_ERROR_STRING 528
44 static char nv_error_string[MAX_ERROR_STRING];
45 static NV_DEFINE_SPINLOCK(nv_error_string_lock);
46 
47 extern nv_linux_state_t nv_ctl_device;
48 
49 extern nv_kthread_q_t nv_kthread_q;
50 
51 NvU32 os_page_size  = PAGE_SIZE;
52 NvU64 os_page_mask  = NV_PAGE_MASK;
53 NvU8  os_page_shift = PAGE_SHIFT;
54 NvBool os_cc_enabled = NV_FALSE;
55 NvBool os_cc_tdx_enabled = NV_FALSE;
56 
57 #if defined(CONFIG_DMA_SHARED_BUFFER)
58 NvBool os_dma_buf_enabled = NV_TRUE;
59 #else
60 NvBool os_dma_buf_enabled = NV_FALSE;
61 #endif // CONFIG_DMA_SHARED_BUFFER
62 
63 NvBool os_imex_channel_is_supported = NV_TRUE;
64 
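//
// Console access helpers. These map directly onto the kernel's
// console_lock()/console_unlock() and must be called in matching pairs
// from a context that is allowed to sleep.
//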
65 void NV_API_CALL os_disable_console_access(void)
66 {
67     console_lock();
68 }
69 
70 void NV_API_CALL os_enable_console_access(void)
71 {
72     console_unlock();
73 }
74 
75 typedef struct semaphore os_mutex_t;
76 
77 //
78 // os_alloc_mutex - Allocate the RM mutex
79 //
80 //  ppMutex - filled in with pointer to opaque structure to mutex data type
81 //
82 NV_STATUS NV_API_CALL os_alloc_mutex
83 (
84     void **ppMutex
85 )
86 {
87     NV_STATUS rmStatus;
88     os_mutex_t *os_mutex;
89 
90     rmStatus = os_alloc_mem(ppMutex, sizeof(os_mutex_t));
91     if (rmStatus != NV_OK)
92     {
93         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate mutex!\n");
94         return rmStatus;
95     }
96     os_mutex = (os_mutex_t *)*ppMutex;
97     NV_INIT_MUTEX(os_mutex);
98 
99     return NV_OK;
100 }
101 
102 //
103 // os_free_mutex - Free resources associated with mutex allocated
104 //                via os_alloc_mutex above.
105 //
106 //  pMutex - Pointer to opaque structure to mutex data type
107 //
108 void NV_API_CALL os_free_mutex
109 (
110     void  *pMutex
111 )
112 {
113     os_mutex_t *os_mutex = (os_mutex_t *)pMutex;
114 
115     if (os_mutex != NULL)
116     {
117         os_free_mem(pMutex);
118     }
119 }
120 
121 //
122 // os_acquire_mutex - Acquire the RM mutex allocated via os_alloc_mutex above.
123 //  pMutex - Pointer to opaque structure to mutex data type
124 
125 NV_STATUS NV_API_CALL os_acquire_mutex
126 (
127     void  *pMutex
128 )
129 {
130     os_mutex_t *os_mutex = (os_mutex_t *)pMutex;
131 
132     if (!NV_MAY_SLEEP())
133     {
134         return NV_ERR_INVALID_REQUEST;
135     }
136     down(os_mutex);
137 
138     return NV_OK;
139 }
140 
141 NV_STATUS NV_API_CALL os_cond_acquire_mutex
142 (
143     void * pMutex
144 )
145 {
146     os_mutex_t *os_mutex = (os_mutex_t *)pMutex;
147     if (!NV_MAY_SLEEP())
148     {
149         return NV_ERR_INVALID_REQUEST;
150     }
151 
152     if (down_trylock(os_mutex))
153     {
154         return NV_ERR_TIMEOUT_RETRY;
155     }
156 
157     return NV_OK;
158 }
159 
160 
161 void NV_API_CALL os_release_mutex
162 (
163     void *pMutex
164 )
165 {
166     os_mutex_t *os_mutex = (os_mutex_t *)pMutex;
167     up(os_mutex);
168 }
169 
170 typedef struct semaphore os_semaphore_t;
171 
172 
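//
// os_alloc_semaphore - Allocate and initialize a counting semaphore with the
//                      given initial value. Returns NULL on allocation
//                      failure; release with os_free_semaphore().
//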
173 void* NV_API_CALL os_alloc_semaphore
174 (
175     NvU32 initialValue
176 )
177 {
178     NV_STATUS rmStatus;
179     os_semaphore_t *os_sema;
180 
181     rmStatus = os_alloc_mem((void *)&os_sema, sizeof(os_semaphore_t));
182     if (rmStatus != NV_OK)
183     {
184         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate semaphore!\n");
185         return NULL;
186     }
187 
188     sema_init(os_sema, initialValue);
189 
190     return (void *)os_sema;
191 }
192 
193 void NV_API_CALL os_free_semaphore
194 (
195     void *pSema
196 )
197 {
198     os_semaphore_t *os_sema = (os_semaphore_t *)pSema;
199 
200     os_free_mem(os_sema);
201 }
202 
203 NV_STATUS NV_API_CALL os_acquire_semaphore
204 (
205     void *pSema
206 )
207 {
208     os_semaphore_t *os_sema = (os_semaphore_t *)pSema;
209 
210     if (!NV_MAY_SLEEP())
211     {
212         return NV_ERR_INVALID_REQUEST;
213     }
214     down(os_sema);
215     return NV_OK;
216 }
217 
218 NV_STATUS NV_API_CALL os_cond_acquire_semaphore
219 (
220     void * pSema
221 )
222 {
223     os_semaphore_t *os_sema = (os_semaphore_t *)pSema;
224     //
225     // NOTE: down_trylock() is safe to call from IRQ, so we don't need an
226     // NV_MAY_SLEEP() check here. We do check it in os_cond_acquire_mutex(),
227     // even though it is also calling down_trylock(), since that keeps it
228     // in line with the kernel's 'struct mutex' API.
229     //
230     if (down_trylock(os_sema))
231     {
232         return NV_ERR_TIMEOUT_RETRY;
233     }
234 
235     return NV_OK;
236 }
237 
238 NV_STATUS NV_API_CALL os_release_semaphore
239 (
240     void *pSema
241 )
242 {
243     os_semaphore_t *os_sema = (os_semaphore_t *)pSema;
244     up(os_sema);
245     return NV_OK;
246 }
247 
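//
// RM reader/writer lock, implemented on top of the kernel's rw_semaphore.
//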
248 typedef struct
249 {
250     struct rw_semaphore sem;
251 
252 #if defined(CONFIG_LOCKDEP)
253     /**
254      * Lock class key. It is registered with the lockdep validator so that the
255      * usages and dependencies of all instances contribute to building correct
256      * locking rules, and so that this lock is tracked by the lockdep validator.
257      *
258      */
259     struct lock_class_key key;
260 #endif // CONFIG_LOCKDEP
261 } os_rwlock_t;
262 
263 void* NV_API_CALL os_alloc_rwlock(void)
264 {
265     os_rwlock_t *os_rwlock = NULL;
266 
267     NV_STATUS rmStatus = os_alloc_mem((void *)&os_rwlock, sizeof(os_rwlock_t));
268     if (rmStatus != NV_OK)
269     {
270         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate a struct os_rwlock_t!\n");
271         return NULL;
272     }
273 
274     init_rwsem(&os_rwlock->sem);
275 
276 #if defined(CONFIG_LOCKDEP)
277     // Register the dynamically allocated key to Lockdep.
278     lockdep_register_key(&os_rwlock->key);
279     lockdep_set_class(&os_rwlock->sem, &os_rwlock->key);
280 #endif // CONFIG_LOCKDEP
281 
282     return os_rwlock;
283 }
284 
285 void NV_API_CALL os_free_rwlock(void *pRwLock)
286 {
287     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
288 
289 #if defined(CONFIG_LOCKDEP)
290     // Unregister the dynamically allocated key.
291     lockdep_unregister_key(&os_rwlock->key);
292 #endif // CONFIG_LOCKDEP
293 
294     os_free_mem(os_rwlock);
295 }
296 
297 NV_STATUS NV_API_CALL os_acquire_rwlock_read(void *pRwLock)
298 {
299     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
300 
301     if (!NV_MAY_SLEEP())
302     {
303         return NV_ERR_INVALID_REQUEST;
304     }
305     down_read(&os_rwlock->sem);
306     return NV_OK;
307 }
308 
309 NV_STATUS NV_API_CALL os_acquire_rwlock_write(void *pRwLock)
310 {
311     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
312 
313     if (!NV_MAY_SLEEP())
314     {
315         return NV_ERR_INVALID_REQUEST;
316     }
317     down_write(&os_rwlock->sem);
318     return NV_OK;
319 }
320 
321 NV_STATUS NV_API_CALL os_cond_acquire_rwlock_read(void *pRwLock)
322 {
323     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
324     // down_read_trylock() returns nonzero on success, zero on contention.
325     if (!down_read_trylock(&os_rwlock->sem))
326     {
327         return NV_ERR_TIMEOUT_RETRY;
328     }
329 
330     return NV_OK;
331 }
332 
333 NV_STATUS NV_API_CALL os_cond_acquire_rwlock_write(void *pRwLock)
334 {
335     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
336     // down_write_trylock() returns nonzero on success, zero on contention.
337     if (!down_write_trylock(&os_rwlock->sem))
338     {
339         return NV_ERR_TIMEOUT_RETRY;
340     }
341 
342     return NV_OK;
343 }
344 
345 void NV_API_CALL os_release_rwlock_read(void *pRwLock)
346 {
347     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
348     up_read(&os_rwlock->sem);
349 }
350 
351 void NV_API_CALL os_release_rwlock_write(void *pRwLock)
352 {
353     os_rwlock_t *os_rwlock = (os_rwlock_t *)pRwLock;
354     up_write(&os_rwlock->sem);
355 }
356 
357 NvBool NV_API_CALL os_semaphore_may_sleep(void)
358 {
359     return NV_MAY_SLEEP();
360 }
361 
362 NvBool NV_API_CALL os_is_isr(void)
363 {
364     return (in_irq());
365 }
366 
367 // return TRUE if the caller is the super-user
368 NvBool NV_API_CALL os_is_administrator(void)
369 {
370     return NV_IS_SUSER();
371 }
372 
373 NvBool NV_API_CALL os_allow_priority_override(void)
374 {
375     return capable(CAP_SYS_NICE);
376 }
377 
378 char* NV_API_CALL os_string_copy(
379     char *dst,
380     const char *src
381 )
382 {
383     return strcpy(dst, src);
384 }
385 
386 NvU32 NV_API_CALL os_string_length(
387     const char* str
388 )
389 {
390     return strlen(str);
391 }
392 
393 NvU32 NV_API_CALL os_strtoul(const char *str, char **endp, NvU32 base)
394 {
395     return (NvU32)simple_strtoul(str, endp, base);
396 }
397 
398 NvS32 NV_API_CALL os_string_compare(const char *str1, const char *str2)
399 {
400     return strcmp(str1, str2);
401 }
402 
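/*
 * os_mem_copy_custom() - simple open-coded copy used when the kernel's
 * memcpy() is not suitable (see os_mem_copy() below). For copies of at
 * least 128 bytes with 4-byte-aligned source and destination, it copies
 * 32-bit words and then finishes any remaining bytes one at a time.
 */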
403 void *os_mem_copy_custom(
404     void       *dstPtr,
405     const void *srcPtr,
406     NvU32       length
407 )
408 {
409     void *ret = dstPtr;
410     NvU32 dwords, bytes = length;
411     NvU8 *dst = dstPtr;
412     const NvU8 *src = srcPtr;
413 
414     if ((length >= 128) &&
415         (((NvUPtr)dst & 3) == 0) && (((NvUPtr)src & 3) == 0))
416     {
417         dwords = (length / sizeof(NvU32));
418         bytes = (length % sizeof(NvU32));
419 
420         while (dwords != 0)
421         {
422             *(NvU32 *)dst = *(const NvU32 *)src;
423             dst += sizeof(NvU32);
424             src += sizeof(NvU32);
425             dwords--;
426         }
427     }
428 
429     while (bytes != 0)
430     {
431         *dst = *src;
432         dst++;
433         src++;
434         bytes--;
435     }
436 
437     return ret;
438 }
439 
440 void *NV_API_CALL os_mem_copy(
441     void       *dst,
442     const void *src,
443     NvU32       length
444 )
445 {
446 #if defined(NVCPU_AARCH64)
447     /*
448      * TODO: Remove once memset/memcpy restructure is complete
449      *
450      * When performing memcpy for memory mapped as device, memcpy_[to/from]io
451      * must be used. WAR to check the source and destination to determine the
452      * correct memcpy_io to use.
453      *
454      * This WAR is limited to just aarch64 for now because the address range used
455      * to map ioremap and vmalloc is different on ppc64le, and is_vmalloc_addr()
456      * does not correctly handle this. is_ioremap_addr() is needed instead. This
457      * will have to be addressed when reorganizing RM to use the new memset model.
458      */
459     if (is_vmalloc_addr(dst) && !is_vmalloc_addr(src))
460     {
461         memcpy_toio(dst, src, length);
462         return dst;
463     }
464     else if (!is_vmalloc_addr(dst) && is_vmalloc_addr(src))
465     {
466         memcpy_fromio(dst, src, length);
467         return dst;
468     }
469     else if (is_vmalloc_addr(dst) && is_vmalloc_addr(src))
470     {
471         return os_mem_copy_custom(dst, src, length);
472     }
473     else
474 #endif
475     {
476 #if defined(CONFIG_CC_OPTIMIZE_FOR_SIZE)
477         /*
478          * When the kernel is configured with CC_OPTIMIZE_FOR_SIZE=y, Kbuild uses
479          * -Os universally. With -Os, GCC will aggressively inline builtins, even
480          * if -fno-builtin is specified, including memcpy with a tiny byte-copy
481          * loop on x86 (rep movsb). This is horrible for performance - a strict
482          * dword copy is much faster - so when we detect this case, just provide
483          * our own implementation.
484          */
485         return os_mem_copy_custom(dst, src, length);
486 #else
487         /*
488          * Generally speaking, the kernel-provided memcpy will be the fastest,
489          * (optimized much better for the target architecture than the above
490          * loop), so we want to use that whenever we can get to it.
491          */
492         return memcpy(dst, src, length);
493 #endif
494     }
495 }
496 
497 NV_STATUS NV_API_CALL os_memcpy_from_user(
498     void       *to,
499     const void *from,
500     NvU32       n
501 )
502 {
503     return (NV_COPY_FROM_USER(to, from, n) ? NV_ERR_INVALID_ADDRESS : NV_OK);
504 }
505 
506 NV_STATUS NV_API_CALL os_memcpy_to_user(
507     void       *to,
508     const void *from,
509     NvU32       n
510 )
511 {
512     return (NV_COPY_TO_USER(to, from, n) ? NV_ERR_INVALID_ADDRESS : NV_OK);
513 }
514 
515 void* NV_API_CALL os_mem_set(
516     void  *dst,
517     NvU8   c,
518     NvU32  length
519 )
520 {
521 #if defined(NVCPU_AARCH64)
522     /*
523      * TODO: Remove once memset/memcpy restructure is complete
524      *
525      * WAR to check the destination to determine if the memory is of type Device
526      * or Normal, and use the correct memset.
527      *
528      * This WAR is limited to just aarch64 for now because the address range used
529      * to map ioremap and vmalloc is different on ppc64le, and is_vmalloc_addr()
530      * does not correctly handle this. is_ioremap_addr() is needed instead. This
531      * will have to be addressed when reorganizing RM to use the new memset model.
532      */
533     if (is_vmalloc_addr(dst))
534     {
535         memset_io(dst, (int)c, length);
536         return dst;
537     }
538     else
539 #endif
540        return memset(dst, (int)c, length);
541 }
542 
543 NvS32 NV_API_CALL os_mem_cmp(
544     const NvU8 *buf0,
545     const NvU8* buf1,
546     NvU32 length
547 )
548 {
549     return memcmp(buf0, buf1, length);
550 }
551 
552 
553 /*
554  * Operating System Memory Functions
555  *
556  * There are 2 interesting aspects of resource manager memory allocations
557  * that need special consideration on Linux:
558  *
559  * 1. They are typically very large (e.g. single allocations of 164KB)
560  *
561  * 2. The resource manager assumes that it can safely allocate memory in
562  *    interrupt handlers.
563  *
564  * The first requires that we call vmalloc, the second kmalloc. We decide
565  * which one to use at run time, based on the size of the request and the
566  * context. Allocations larger than 128KB require vmalloc; in the context
567  * of an ISR they simply fail.
568  */
569 
570 #if defined(NV_VGX_HYPER)
571 /*
572  * Citrix Hypervisor-8.0 Dom0 system memory tends to become fragmented, which
573  * causes high-order kmalloc allocations to fail. We try to avoid that by not
574  * requesting allocations larger than 8K.
575  *
576  * KVM is also strongly affected by low-memory situations, particularly if
577  * hugetlbfs hugepages are being used. Hence, the 8K limit applies here
578  * too.
579  */
580 #define KMALLOC_LIMIT 8192
581 #else
582 #define KMALLOC_LIMIT 131072
583 #endif
584 
585 #define VMALLOC_ALLOCATION_SIZE_FLAG (1 << 0)
586 
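/*
 * os_alloc_mem() - allocate system memory for RM.
 *
 * Selection logic (see the comment block above):
 *   - atomic context, size <= KMALLOC_LIMIT: NV_KMALLOC_ATOMIC
 *   - atomic context, size >  KMALLOC_LIMIT: fail (returns NV_ERR_NO_MEMORY)
 *   - sleepable context, size <= KMALLOC_LIMIT: NV_KMALLOC_NO_OOM, falling
 *     back to nv_vmalloc() on failure
 *   - sleepable context, size >  KMALLOC_LIMIT: nv_vmalloc()
 *
 * vmalloc allocations are tagged with VMALLOC_ALLOCATION_SIZE_FLAG so that
 * os_free_mem() knows which free routine to use.
 */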
587 NV_STATUS NV_API_CALL os_alloc_mem(
588     void **address,
589     NvU64 size
590 )
591 {
592     NvU64 original_size = size;
593     unsigned long alloc_size;
594 
595     if (address == NULL)
596         return NV_ERR_INVALID_ARGUMENT;
597 
598     *address = NULL;
599     NV_MEM_TRACKING_PAD_SIZE(size);
600 
601     // check for integer overflow on size
602     if (size < original_size)
603         return NV_ERR_INVALID_ARGUMENT;
604 
605     //
606     // NV_KMALLOC and nv_vmalloc take a 4-byte size argument on 32-bit x86. The
607     // check below is required to avoid truncation and a wrongly sized allocation.
608     //
609     alloc_size = size;
610 
611     if (alloc_size != size)
612         return NV_ERR_INVALID_PARAMETER;
613 
614     if (!NV_MAY_SLEEP())
615     {
616         if (alloc_size <= KMALLOC_LIMIT)
617             NV_KMALLOC_ATOMIC(*address, alloc_size);
618     }
619     else
620     {
621         if (alloc_size <= KMALLOC_LIMIT)
622         {
623             NV_KMALLOC_NO_OOM(*address, alloc_size);
624         }
625         if (*address == NULL)
626         {
627             *address = nv_vmalloc(alloc_size);
628             alloc_size |= VMALLOC_ALLOCATION_SIZE_FLAG;
629         }
630     }
631 
632     NV_MEM_TRACKING_HIDE_SIZE(address, alloc_size);
633 
634     return ((*address != NULL) ? NV_OK : NV_ERR_NO_MEMORY);
635 }
636 
637 void NV_API_CALL os_free_mem(void *address)
638 {
639     NvU64 size;
640 
641     NV_MEM_TRACKING_RETRIEVE_SIZE(address, size);
642 
643     if (size & VMALLOC_ALLOCATION_SIZE_FLAG)
644     {
645         size &= ~VMALLOC_ALLOCATION_SIZE_FLAG;
646         nv_vfree(address, size);
647     }
648     else
649         NV_KFREE(address, size);
650 }
651 
652 
653 /*****************************************************************************
654 *
655 *   Name: osGetCurrentTime
656 *
657 *****************************************************************************/
658 
659 NV_STATUS NV_API_CALL os_get_current_time(
660     NvU32 *seconds,
661     NvU32 *useconds
662 )
663 {
664     struct timespec64 tm;
665 
666     ktime_get_real_ts64(&tm);
667 
668     *seconds = tm.tv_sec;
669     *useconds = tm.tv_nsec / NSEC_PER_USEC;
670 
671     return NV_OK;
672 }
673 
674 //
675 // Get the High resolution tick count of the system uptime
676 //
677 NvU64 NV_API_CALL os_get_current_tick_hr(void)
678 {
679     struct timespec64 tm;
680     ktime_get_raw_ts64(&tm);
681     return (NvU64) timespec64_to_ns(&tm);
682 }
683 
684 #if BITS_PER_LONG >= 64
685 
686 NvU64 NV_API_CALL os_get_current_tick(void)
687 {
688 #if defined(NV_JIFFIES_TO_TIMESPEC_PRESENT)
689     struct timespec ts;
690     jiffies_to_timespec(jiffies, &ts);
691     return (NvU64) timespec_to_ns(&ts);
692 #else
693     struct timespec64 ts;
694     jiffies_to_timespec64(jiffies, &ts);
695     return (NvU64) timespec64_to_ns(&ts);
696 #endif
697 }
698 
699 NvU64 NV_API_CALL os_get_tick_resolution(void)
700 {
701     return (NvU64)jiffies_to_usecs(1) * NSEC_PER_USEC;
702 }
703 
704 #else
705 
706 NvU64 NV_API_CALL os_get_current_tick(void)
707 {
708     /*
709      * 'jiffies' overflows regularly on 32-bit builds (unsigned long is 4 bytes
710      * instead of 8 bytes), so it's unwise to build a tick counter on it, since
711      * the rest of the Resman assumes the 'tick' returned from this function is
712      * monotonically increasing and never overflows.
713      *
714      * Instead, use the previous implementation that we've lived with since the
715      * beginning, which uses system clock time to calculate the tick. This is
716      * subject to problems if the system clock time changes dramatically
717      * (more than a second or so) while the Resman is actively tracking a
718      * timeout.
719      */
720     NvU32 seconds, useconds;
721 
722     (void) os_get_current_time(&seconds, &useconds);
723 
724     return ((NvU64)seconds * NSEC_PER_SEC +
725                  (NvU64)useconds * NSEC_PER_USEC);
726 }
727 
728 NvU64 NV_API_CALL os_get_tick_resolution(void)
729 {
730     /*
731      * os_get_current_tick() uses os_get_current_time(), which has
732      * microsecond resolution.
733      */
734     return 1000ULL;
735 }
736 
737 #endif
738 
739 //---------------------------------------------------------------------------
740 //
741 //  Misc services.
742 //
743 //---------------------------------------------------------------------------
744 
745 NV_STATUS NV_API_CALL os_delay_us(NvU32 MicroSeconds)
746 {
747     return nv_sleep_us(MicroSeconds);
748 }
749 
750 NV_STATUS NV_API_CALL os_delay(NvU32 MilliSeconds)
751 {
752     return nv_sleep_ms(MilliSeconds);
753 }
754 
755 NvU64 NV_API_CALL os_get_cpu_frequency(void)
756 {
757     NvU64 cpu_hz = 0;
758 #if defined(CONFIG_CPU_FREQ)
759     cpu_hz = (cpufreq_get(0) * 1000);
760 #elif defined(NVCPU_X86_64)
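    /*
     * Without cpufreq support, estimate the frequency by counting TSC ticks
     * across a 250 ms delay; the delta multiplied by 4 gives ticks per second.
     */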
761     NvU64 tsc[2];
762 
763     tsc[0] = nv_rdtsc();
764     mdelay(250);
765     tsc[1] = nv_rdtsc();
766 
767     cpu_hz = ((tsc[1] - tsc[0]) * 4);
768 #endif
769     return cpu_hz;
770 }
771 
772 NvU32 NV_API_CALL os_get_current_process(void)
773 {
774     return NV_GET_CURRENT_PROCESS();
775 }
776 
777 void NV_API_CALL os_get_current_process_name(char *buf, NvU32 len)
778 {
779     task_lock(current);
780     strncpy(buf, current->comm, len - 1);
781     buf[len - 1] = '\0';
782     task_unlock(current);
783 }
784 
785 NV_STATUS NV_API_CALL os_get_current_thread(NvU64 *threadId)
786 {
787     if (in_interrupt())
788         *threadId = 0;
789     else
790         *threadId = (NvU64) current->pid;
791 
792     return NV_OK;
793 }
794 
795 /*******************************************************************************/
796 /*                                                                             */
797 /* Debug and logging utilities follow                                          */
798 /*                                                                             */
799 /*******************************************************************************/
800 
801 // The current debug display level (default to maximum debug level)
802 NvU32 cur_debuglevel = 0xffffffff;
803 
804 /*
805  * The binary core of RM (nv-kernel.o) calls both out_string and nv_printf.
806  */
807 inline void NV_API_CALL out_string(const char *str)
808 {
809     printk("%s", str);
810 }
811 
812 /*
813  * nv_printf() prints to the kernel log for the driver.
814  * Returns the number of characters written.
815  */
816 int NV_API_CALL nv_printf(NvU32 debuglevel, const char *printf_format, ...)
817 {
818     va_list arglist;
819     int chars_written = 0;
820 
821     if (debuglevel >= ((cur_debuglevel >> 4) & 0x3))
822     {
823         size_t length;
824         unsigned long flags;
825 
826         // When printk is called to extend the output of the previous line
827         // (i.e. when the previous line did not end in \n), the printk call
828         // must contain KERN_CONT.  Older kernels still print the line
829         // correctly, but KERN_CONT was technically always required.
830 
831         // This means that every call to printk() needs to have a KERN_xxx
832         // prefix.  The only way to get this is to rebuild the format string
833         // into a new buffer, with a KERN_xxx prefix prepended.
834 
835         // Unfortunately, we can't guarantee that two calls to nv_printf()
836         // won't be interrupted by a printk from another driver.  So to be
837         // safe, we always append KERN_CONT.  It's still technically wrong,
838         // safe, we always prepend KERN_CONT.  It's still technically wrong,
839 
840         // The long-term fix is to modify all NV_PRINTF-ish calls so that the
841         // string always contains only one \n (at the end) and NV_PRINTF_EX
842         // is deleted.  But that is unlikely to ever happen.
843 
844         length = strlen(printf_format);
845         if (length < 1)
846             return 0;
847 
848         NV_SPIN_LOCK_IRQSAVE(&nv_error_string_lock, flags);
849 
850         // KERN_CONT changed in the 3.6 kernel, so we can't assume its
851         // composition or size.
852         memcpy(nv_error_string, KERN_CONT, sizeof(KERN_CONT) - 1);
853         memcpy(nv_error_string + sizeof(KERN_CONT) - 1, printf_format, length + 1);
854 
855         va_start(arglist, printf_format);
856         chars_written = vprintk(nv_error_string, arglist);
857         va_end(arglist);
858 
859         NV_SPIN_UNLOCK_IRQRESTORE(&nv_error_string_lock, flags);
860     }
861 
862     return chars_written;
863 }
864 
865 NvS32 NV_API_CALL os_snprintf(char *buf, NvU32 size, const char *fmt, ...)
866 {
867     va_list arglist;
868     int chars_written;
869 
870     va_start(arglist, fmt);
871     chars_written = vsnprintf(buf, size, fmt, arglist);
872     va_end(arglist);
873 
874     return chars_written;
875 }
876 
877 NvS32 NV_API_CALL os_vsnprintf(char *buf, NvU32 size, const char *fmt, va_list arglist)
878 {
879     return vsnprintf(buf, size, fmt, arglist);
880 }
881 
882 void NV_API_CALL os_log_error(const char *fmt, va_list ap)
883 {
884     unsigned long flags;
885 
886     NV_SPIN_LOCK_IRQSAVE(&nv_error_string_lock, flags);
887 
888     vsnprintf(nv_error_string, MAX_ERROR_STRING, fmt, ap);
889     nv_error_string[MAX_ERROR_STRING - 1] = 0;
890     printk(KERN_ERR "%s", nv_error_string);
891 
892     NV_SPIN_UNLOCK_IRQRESTORE(&nv_error_string_lock, flags);
893 }
894 
895 void NV_API_CALL os_io_write_byte(
896     NvU32 address,
897     NvU8 value
898 )
899 {
900     outb(value, address);
901 }
902 
903 void NV_API_CALL os_io_write_word(
904     NvU32 address,
905     NvU16 value
906 )
907 {
908     outw(value, address);
909 }
910 
911 void NV_API_CALL os_io_write_dword(
912     NvU32 address,
913     NvU32 value
914 )
915 {
916     outl(value, address);
917 }
918 
919 NvU8 NV_API_CALL os_io_read_byte(
920     NvU32 address
921 )
922 {
923     return inb(address);
924 }
925 
926 NvU16 NV_API_CALL os_io_read_word(
927     NvU32 address
928 )
929 {
930     return inw(address);
931 }
932 
933 NvU32 NV_API_CALL os_io_read_dword(
934     NvU32 address
935 )
936 {
937     return inl(address);
938 }
939 
940 
941 static NvBool NV_API_CALL xen_support_fully_virtualized_kernel(void)
942 {
943 #if defined(NV_XEN_SUPPORT_FULLY_VIRTUALIZED_KERNEL)
944     return (os_is_vgx_hyper());
945 #endif
946     return NV_FALSE;
947 }
948 
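/*
 * os_map_kernel_space() - map a physical address range into kernel virtual
 * address space with the requested caching mode, using the appropriate
 * ioremap variant. A cached request for physical address 0 (outside of fully
 * virtualized Xen kernels) is satisfied with the kernel's direct mapping
 * (PAGE_OFFSET) instead of a fresh mapping. Unmap with
 * os_unmap_kernel_space().
 */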
949 void* NV_API_CALL os_map_kernel_space(
950     NvU64 start,
951     NvU64 size_bytes,
952     NvU32 mode
953 )
954 {
955     void *vaddr;
956 
957     if (!xen_support_fully_virtualized_kernel() && start == 0)
958     {
959         if (mode != NV_MEMORY_CACHED)
960         {
961             nv_printf(NV_DBG_ERRORS,
962                 "NVRM: os_map_kernel_space: won't map address 0x%0llx UC!\n", start);
963             return NULL;
964         }
965         else
966             return (void *)PAGE_OFFSET;
967     }
968 
969     if (!NV_MAY_SLEEP())
970     {
971         nv_printf(NV_DBG_ERRORS,
972             "NVRM: os_map_kernel_space: can't map 0x%0llx, invalid context!\n", start);
973         os_dbg_breakpoint();
974         return NULL;
975     }
976 
977     switch (mode)
978     {
979         case NV_MEMORY_CACHED:
980             vaddr = nv_ioremap_cache(start, size_bytes);
981             break;
982         case NV_MEMORY_WRITECOMBINED:
983             vaddr = rm_disable_iomap_wc() ?
984                     nv_ioremap_nocache(start, size_bytes) :
985                     nv_ioremap_wc(start, size_bytes);
986             break;
987         case NV_MEMORY_UNCACHED:
988         case NV_MEMORY_DEFAULT:
989             vaddr = nv_ioremap_nocache(start, size_bytes);
990             break;
991         default:
992             nv_printf(NV_DBG_ERRORS,
993                 "NVRM: os_map_kernel_space: unsupported mode!\n");
994             return NULL;
995     }
996 
997     return vaddr;
998 }
999 
1000 void NV_API_CALL os_unmap_kernel_space(
1001     void *addr,
1002     NvU64 size_bytes
1003 )
1004 {
1005     if (addr == (void *)PAGE_OFFSET)
1006         return;
1007 
1008     nv_iounmap(addr, size_bytes);
1009 }
1010 
1011 #if NVCPU_IS_AARCH64
1012 
1013 static inline void nv_flush_cache_cpu(void *info)
1014 {
1015     if (!nvos_is_chipset_io_coherent())
1016     {
1017 #if defined(NV_FLUSH_CACHE_ALL_PRESENT)
1018         flush_cache_all();
1019 #else
1020         WARN_ONCE(0, "kernel does not provide flush_cache_all()\n");
1021 #endif
1022     }
1023 }
1024 
1025 // flush the cache of all cpus
1026 NV_STATUS NV_API_CALL os_flush_cpu_cache_all(void)
1027 {
1028     on_each_cpu(nv_flush_cache_cpu, NULL, 1);
1029     return NV_OK;
1030 }
1031 
1032 NV_STATUS NV_API_CALL os_flush_user_cache(void)
1033 {
1034     if (!NV_MAY_SLEEP())
1035     {
1036         return NV_ERR_NOT_SUPPORTED;
1037     }
1038 
1039     //
1040     // The Linux kernel does not export an interface for flushing a range,
1041     // although it is possible. For now, just flush the entire cache to be
1042     // safe.
1043     //
1044     on_each_cpu(nv_flush_cache_cpu, NULL, 1);
1045     return NV_OK;
1046 }
1047 
1048 #else // NVCPU_IS_AARCH64
1049 
1050 NV_STATUS NV_API_CALL os_flush_cpu_cache_all(void)
1051 {
1052     return NV_ERR_NOT_SUPPORTED;
1053 }
1054 
1055 NV_STATUS NV_API_CALL os_flush_user_cache(void)
1056 {
1057     return NV_ERR_NOT_SUPPORTED;
1058 }
1059 
1060 #endif
1061 
1062 void NV_API_CALL os_flush_cpu_write_combine_buffer(void)
1063 {
1064     wmb();
1065 }
1066 
1067 // override initial debug level from registry
1068 void NV_API_CALL os_dbg_init(void)
1069 {
1070     NvU32 new_debuglevel;
1071     nvidia_stack_t *sp = NULL;
1072 
1073     if (nv_kmem_cache_alloc_stack(&sp) != 0)
1074     {
1075         return;
1076     }
1077 
1078     if (NV_OK == rm_read_registry_dword(sp, NULL,
1079                                         "ResmanDebugLevel",
1080                                         &new_debuglevel))
1081     {
1082         if (new_debuglevel != (NvU32)~0)
1083             cur_debuglevel = new_debuglevel;
1084     }
1085 
1086     nv_kmem_cache_free_stack(sp);
1087 }
1088 
1089 void NV_API_CALL os_dbg_set_level(NvU32 new_debuglevel)
1090 {
1091     nv_printf(NV_DBG_SETUP, "NVRM: Changing debuglevel from 0x%x to 0x%x\n",
1092         cur_debuglevel, new_debuglevel);
1093     cur_debuglevel = new_debuglevel;
1094 }
1095 
1096 NvU64 NV_API_CALL os_get_max_user_va(void)
1097 {
1098     return TASK_SIZE;
1099 }
1100 
1101 NV_STATUS NV_API_CALL os_schedule(void)
1102 {
1103     if (NV_MAY_SLEEP())
1104     {
1105         set_current_state(TASK_INTERRUPTIBLE);
1106         schedule_timeout(1);
1107         return NV_OK;
1108     }
1109     else
1110     {
1111         nv_printf(NV_DBG_ERRORS, "NVRM: os_schedule: Attempted to yield"
1112                                  " the CPU while in atomic or interrupt"
1113                                  " context\n");
1114         return NV_ERR_ILLEGAL_ACTION;
1115     }
1116 }
1117 
1118 typedef struct {
1119     nv_kthread_q_item_t item;
1120     void *data;
1121 } os_queue_data_t;
1122 
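/*
 * Work item trampoline: runs on an nv_kthread_q thread, frees the
 * os_queue_data_t wrapper allocated by os_queue_work_item(), and then
 * executes the RM work item on a freshly allocated nvidia stack.
 */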
1123 static void os_execute_work_item(void *_oqd)
1124 {
1125     os_queue_data_t *oqd = _oqd;
1126     nvidia_stack_t *sp = NULL;
1127     void *data = oqd->data;
1128 
1129     NV_KFREE(oqd, sizeof(os_queue_data_t));
1130 
1131     if (nv_kmem_cache_alloc_stack(&sp) != 0)
1132     {
1133         return;
1134     }
1135 
1136     rm_execute_work_item(sp, data);
1137 
1138     nv_kmem_cache_free_stack(sp);
1139 }
1140 
1141 NV_STATUS NV_API_CALL os_queue_work_item(struct os_work_queue *queue, void *data)
1142 {
1143     os_queue_data_t *oqd;
1144     nv_kthread_q_t *kthread;
1145 
1146     /* Use the global queue unless a valid queue was provided */
1147     kthread = queue ? &queue->nvk : &nv_kthread_q;
1148 
1149     /* Make sure the kthread is active */
1150     if (unlikely(!kthread->q_kthread)) {
1151         nv_printf(NV_DBG_ERRORS, "NVRM: queue is not enabled\n");
1152         return NV_ERR_NOT_READY;
1153     }
1154 
1155     /* Allocate atomically just in case we're called in atomic context. */
1156     NV_KMALLOC_ATOMIC(oqd, sizeof(os_queue_data_t));
1157     if (!oqd)
1158         return NV_ERR_NO_MEMORY;
1159 
1160     nv_kthread_q_item_init(&oqd->item, os_execute_work_item, oqd);
1161     oqd->data = data;
1162 
1163     nv_kthread_q_schedule_q_item(kthread, &oqd->item);
1164 
1165     return NV_OK;
1166 }
1167 
1168 NV_STATUS NV_API_CALL os_flush_work_queue(struct os_work_queue *queue)
1169 {
1170     nv_kthread_q_t *kthread;
1171 
1172     /* Use the global queue unless a valid queue was provided */
1173     kthread = queue ? &queue->nvk : &nv_kthread_q;
1174 
1175     if (NV_MAY_SLEEP())
1176     {
1177         if (kthread->q_kthread)
1178             nv_kthread_q_flush(kthread);
1179 
1180         return NV_OK;
1181     }
1182     else
1183     {
1184         nv_printf(NV_DBG_ERRORS,
1185                   "NVRM: os_flush_work_queue: attempted to execute passive "
1186                   "work from an atomic or interrupt context.\n");
1187         return NV_ERR_ILLEGAL_ACTION;
1188     }
1189 }
1190 
1191 extern NvU32 NVreg_EnableDbgBreakpoint;
1192 
1193 void NV_API_CALL os_dbg_breakpoint(void)
1194 {
1195     if (NVreg_EnableDbgBreakpoint == 0)
1196     {
1197         return;
1198     }
1199 
1200 #if defined(CONFIG_X86_REMOTE_DEBUG) || defined(CONFIG_KGDB) || defined(CONFIG_XMON)
1201   #if defined(NVCPU_X86_64)
1202     __asm__ __volatile__ ("int $3");
1203   #elif defined(NVCPU_ARM)
1204     __asm__ __volatile__ (".word %c0" :: "i" (KGDB_COMPILED_BREAK));
1205   #elif defined(NVCPU_AARCH64)
1206     # warning "Need to implement os_dbg_breakpoint() for aarch64"
1207   #elif defined(NVCPU_PPC64LE)
1208     __asm__ __volatile__ ("trap");
1209   #endif // NVCPU_*
1210 #elif defined(CONFIG_KDB)
1211     KDB_ENTER();
1212 #endif // CONFIG_X86_REMOTE_DEBUG || CONFIG_KGDB || CONFIG_XMON
1213 }
1214 
1215 NvU32 NV_API_CALL os_get_cpu_number(void)
1216 {
1217     NvU32 cpu_id = get_cpu();
1218     put_cpu();
1219     return cpu_id;
1220 }
1221 
1222 NvU32 NV_API_CALL os_get_cpu_count(void)
1223 {
1224     return NV_NUM_CPUS();
1225 }
1226 
1227 NvBool NV_API_CALL os_pat_supported(void)
1228 {
1229     return (nv_pat_mode != NV_PAT_MODE_DISABLED);
1230 }
1231 
1232 NvBool NV_API_CALL os_is_efi_enabled(void)
1233 {
1234     return efi_enabled(EFI_BOOT);
1235 }
1236 
1237 void NV_API_CALL os_dump_stack(void)
1238 {
1239     dump_stack();
1240 }
1241 
1242 typedef struct os_spinlock_s
1243 {
1244     nv_spinlock_t      lock;
1245     unsigned long      eflags;
1246 } os_spinlock_t;
1247 
1248 NV_STATUS NV_API_CALL os_alloc_spinlock(void **ppSpinlock)
1249 {
1250     NV_STATUS rmStatus;
1251     os_spinlock_t *os_spinlock;
1252 
1253     rmStatus = os_alloc_mem(ppSpinlock, sizeof(os_spinlock_t));
1254     if (rmStatus != NV_OK)
1255     {
1256         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate spinlock!\n");
1257         return rmStatus;
1258     }
1259 
1260     os_spinlock = (os_spinlock_t *)*ppSpinlock;
1261     NV_SPIN_LOCK_INIT(&os_spinlock->lock);
1262     os_spinlock->eflags = 0;
1263     return NV_OK;
1264 }
1265 
1266 void NV_API_CALL os_free_spinlock(void *pSpinlock)
1267 {
1268     os_free_mem(pSpinlock);
1269 }
1270 
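/*
 * os_acquire_spinlock() - acquire the spinlock and disable local interrupts.
 * The saved flags are stashed in the lock itself and a copy, masked down to
 * the architecture's interrupt flag bit, is returned to the caller;
 * os_release_spinlock() restores the stashed value.
 */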
1271 NvU64 NV_API_CALL os_acquire_spinlock(void *pSpinlock)
1272 {
1273     os_spinlock_t *os_spinlock = (os_spinlock_t *)pSpinlock;
1274     unsigned long eflags;
1275 
1276     NV_SPIN_LOCK_IRQSAVE(&os_spinlock->lock, eflags);
1277     os_spinlock->eflags = eflags;
1278 
1279 #if defined(NVCPU_X86_64)
1280     eflags &= X86_EFLAGS_IF;
1281 #elif defined(NVCPU_AARCH64)
1282     eflags &= PSR_I_BIT;
1283 #endif
1284     return eflags;
1285 }
1286 
1287 void NV_API_CALL os_release_spinlock(void *pSpinlock, NvU64 oldIrql)
1288 {
1289     os_spinlock_t *os_spinlock = (os_spinlock_t *)pSpinlock;
1290     unsigned long eflags;
1291 
1292     eflags = os_spinlock->eflags;
1293     os_spinlock->eflags = 0;
1294     NV_SPIN_UNLOCK_IRQRESTORE(&os_spinlock->lock, eflags);
1295 }
1296 
1297 #define NV_KERNEL_RELEASE    ((LINUX_VERSION_CODE >> 16) & 0x0ff)
1298 #define NV_KERNEL_VERSION    ((LINUX_VERSION_CODE >> 8)  & 0x0ff)
1299 #define NV_KERNEL_SUBVERSION ((LINUX_VERSION_CODE)       & 0x0ff)
1300 
1301 NV_STATUS NV_API_CALL os_get_version_info(os_version_info * pOsVersionInfo)
1302 {
1303     NV_STATUS status      = NV_OK;
1304 
1305     pOsVersionInfo->os_major_version = NV_KERNEL_RELEASE;
1306     pOsVersionInfo->os_minor_version = NV_KERNEL_VERSION;
1307     pOsVersionInfo->os_build_number  = NV_KERNEL_SUBVERSION;
1308 
1309 #if defined(UTS_RELEASE)
1310     pOsVersionInfo->os_build_version_str = UTS_RELEASE;
1311 #endif
1312 
1313 #if defined(UTS_VERSION)
1314     pOsVersionInfo->os_build_date_plus_str = UTS_VERSION;
1315 #endif
1316 
1317     return status;
1318 }
1319 
1320 NvBool NV_API_CALL os_is_xen_dom0(void)
1321 {
1322 #if defined(NV_DOM0_KERNEL_PRESENT)
1323     return NV_TRUE;
1324 #else
1325     return NV_FALSE;
1326 #endif
1327 }
1328 
1329 NvBool NV_API_CALL os_is_vgx_hyper(void)
1330 {
1331 #if defined(NV_VGX_HYPER)
1332     return NV_TRUE;
1333 #else
1334     return NV_FALSE;
1335 #endif
1336 }
1337 
1338 NV_STATUS NV_API_CALL os_inject_vgx_msi(NvU16 guestID, NvU64 msiAddr, NvU32 msiData)
1339 {
1340 #if defined(NV_VGX_HYPER) && defined(NV_DOM0_KERNEL_PRESENT) && \
1341     defined(NV_XEN_IOEMU_INJECT_MSI)
1342     int rc = 0;
1343     rc = xen_ioemu_inject_msi(guestID, msiAddr, msiData);
1344     if (rc)
1345     {
1346         nv_printf(NV_DBG_ERRORS,
1347             "NVRM: %s: can't inject MSI to guest:%d, addr:0x%x, data:0x%x, err:%d\n",
1348             __FUNCTION__, guestID, msiAddr, msiData, rc);
1349         return NV_ERR_OPERATING_SYSTEM;
1350     }
1351     return NV_OK;
1352 #else
1353     return NV_ERR_NOT_SUPPORTED;
1354 #endif
1355 }
1356 
1357 NvBool NV_API_CALL os_is_grid_supported(void)
1358 {
1359 #if defined(NV_GRID_BUILD)
1360     return NV_TRUE;
1361 #else
1362     return NV_FALSE;
1363 #endif
1364 }
1365 
1366 NvU32 NV_API_CALL os_get_grid_csp_support(void)
1367 {
1368 #if defined(NV_GRID_BUILD_CSP)
1369     return NV_GRID_BUILD_CSP;
1370 #else
1371     return 0;
1372 #endif
1373 }
1374 
1375 void NV_API_CALL os_bug_check(NvU32 bugCode, const char *bugCodeStr)
1376 {
1377     panic(bugCodeStr);
1378 }
1379 
1380 NV_STATUS NV_API_CALL os_get_euid(NvU32 *pSecToken)
1381 {
1382     *pSecToken = NV_CURRENT_EUID();
1383     return NV_OK;
1384 }
1385 
1386 #if defined(NVCPU_X86_64) || defined(NVCPU_AARCH64)
1387 
1388 static NvBool os_verify_checksum(const NvU8 *pMappedAddr, NvU32 length)
1389 {
1390     NvU8 sum = 0;
1391     NvU32 iter = 0;
1392 
1393     for (iter = 0; iter < length; iter++)
1394         sum += pMappedAddr[iter];
1395 
1396     return sum == 0;
1397 }
1398 
1399 #define _VERIFY_SMBIOS3(_pMappedAddr)                        \
1400         _pMappedAddr &&                                      \
1401         (os_mem_cmp(_pMappedAddr, "_SM3_", 5) == 0  &&       \
1402         _pMappedAddr[6] < 32 &&                              \
1403         _pMappedAddr[6] > 0 &&                               \
1404         os_verify_checksum(_pMappedAddr, _pMappedAddr[6]))
1405 
1406 #define OS_VERIFY_SMBIOS3(pMappedAddr) _VERIFY_SMBIOS3((pMappedAddr))
1407 
1408 #define _VERIFY_SMBIOS(_pMappedAddr)                           \
1409         _pMappedAddr &&                                        \
1410         (os_mem_cmp(_pMappedAddr, "_SM_", 4) == 0  &&          \
1411         _pMappedAddr[5] < 32 &&                                \
1412         _pMappedAddr[5] > 0 &&                                 \
1413         os_verify_checksum(_pMappedAddr, _pMappedAddr[5]) &&   \
1414         os_mem_cmp((_pMappedAddr + 16), "_DMI_", 5) == 0  &&   \
1415         os_verify_checksum((_pMappedAddr + 16), 15))
1416 
1417 #define OS_VERIFY_SMBIOS(pMappedAddr) _VERIFY_SMBIOS((pMappedAddr))
1418 
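/*
 * On legacy (non-UEFI) x86 systems, the SMBIOS entry point lives in the
 * 0xF0000-0xFFFFF BIOS area on a 16-byte boundary. Scan that range for the
 * "_SM3_" (SMBIOS 3.x, 64-bit) or "_SM_"/"_DMI_" (32-bit) anchor strings and
 * validate their checksums.
 */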
1419 #define SMBIOS_LEGACY_BASE 0xF0000
1420 #define SMBIOS_LEGACY_SIZE 0x10000
1421 
1422 static NV_STATUS os_get_smbios_header_legacy(NvU64 *pSmbsAddr)
1423 {
1424 #if !defined(NVCPU_X86_64)
1425     return NV_ERR_NOT_SUPPORTED;
1426 #else
1427     NV_STATUS status = NV_ERR_OPERATING_SYSTEM;
1428     NvU8 *pMappedAddr = NULL;
1429     NvU8 *pIterAddr = NULL;
1430 
1431     pMappedAddr = (NvU8*)os_map_kernel_space(SMBIOS_LEGACY_BASE,
1432                                              SMBIOS_LEGACY_SIZE,
1433                                              NV_MEMORY_CACHED);
1434     if (pMappedAddr == NULL)
1435     {
1436         return NV_ERR_INSUFFICIENT_RESOURCES;
1437     }
1438 
1439     pIterAddr = pMappedAddr;
1440 
1441     for (; pIterAddr < (pMappedAddr + SMBIOS_LEGACY_SIZE); pIterAddr += 16)
1442     {
1443         if (OS_VERIFY_SMBIOS3(pIterAddr))
1444         {
1445             *pSmbsAddr = SMBIOS_LEGACY_BASE + (pIterAddr - pMappedAddr);
1446             status = NV_OK;
1447             break;
1448         }
1449 
1450         if (OS_VERIFY_SMBIOS(pIterAddr))
1451         {
1452             *pSmbsAddr = SMBIOS_LEGACY_BASE + (pIterAddr - pMappedAddr);
1453             status = NV_OK;
1454             break;
1455         }
1456     }
1457 
1458     os_unmap_kernel_space(pMappedAddr, SMBIOS_LEGACY_SIZE);
1459 
1460     return status;
1461 #endif
1462 }
1463 
1464 // This function is needed only if "efi" is enabled.
1465 #if (defined(NV_LINUX_EFI_H_PRESENT) && defined(CONFIG_EFI))
1466 static NV_STATUS os_verify_smbios_header_uefi(NvU64 smbsAddr)
1467 {
1468     NV_STATUS status = NV_ERR_OBJECT_NOT_FOUND;
1469     NvU64 start = 0, offset = 0, size = 32;
1470     NvU8 *pMappedAddr = NULL, *pBufAddr = NULL;
1471 
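    /*
     * Round the mapping down to a page boundary and pad the size so the whole
     * 32-byte entry point is covered, then verify the header at the original
     * offset within the mapping.
     */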
1472     start = smbsAddr;
1473     offset = (start & ~os_page_mask);
1474     start &= os_page_mask;
1475     size = ((size + offset + ~os_page_mask) & os_page_mask);
1476 
1477     pBufAddr = (NvU8*)os_map_kernel_space(start,
1478                                           size,
1479                                           NV_MEMORY_CACHED);
1480     if (pBufAddr == NULL)
1481     {
1482         return NV_ERR_INSUFFICIENT_RESOURCES;
1483     }
1484 
1485     pMappedAddr = pBufAddr + offset;
1486 
1487     if (OS_VERIFY_SMBIOS3(pMappedAddr))
1488     {
1489         status = NV_OK;
1490         goto done;
1491     }
1492 
1493     if (OS_VERIFY_SMBIOS(pMappedAddr))
1494     {
1495         status = NV_OK;
1496     }
1497 
1498 done:
1499     os_unmap_kernel_space(pBufAddr, size);
1500     return status;
1501 }
1502 #endif
1503 
1504 static NV_STATUS os_get_smbios_header_uefi(NvU64 *pSmbsAddr)
1505 {
1506     NV_STATUS status = NV_ERR_OPERATING_SYSTEM;
1507 
1508 // Make sure that efi.h is present before using "struct efi".
1509 #if (defined(NV_LINUX_EFI_H_PRESENT) && defined(CONFIG_EFI))
1510 
1511 // Make sure that efi.h has SMBIOS3_TABLE_GUID present.
1512 #if defined(SMBIOS3_TABLE_GUID)
1513     if (efi.smbios3 != EFI_INVALID_TABLE_ADDR)
1514     {
1515         status = os_verify_smbios_header_uefi(efi.smbios3);
1516         if (status == NV_OK)
1517         {
1518             *pSmbsAddr = efi.smbios3;
1519             return NV_OK;
1520         }
1521     }
1522 #endif
1523 
1524     if (efi.smbios != EFI_INVALID_TABLE_ADDR)
1525     {
1526         status = os_verify_smbios_header_uefi(efi.smbios);
1527         if (status == NV_OK)
1528         {
1529             *pSmbsAddr = efi.smbios;
1530             return NV_OK;
1531         }
1532     }
1533 #endif
1534 
1535     return status;
1536 }
1537 
1538 #endif // defined(NVCPU_X86_64) || defined(NVCPU_AARCH64)
1539 
1540 // The function locates the SMBIOS entry point.
1541 NV_STATUS NV_API_CALL os_get_smbios_header(NvU64 *pSmbsAddr)
1542 {
1543 
1544 #if !defined(NVCPU_X86_64) && !defined(NVCPU_AARCH64)
1545     return NV_ERR_NOT_SUPPORTED;
1546 #else
1547     NV_STATUS status = NV_OK;
1548 
1549     if (os_is_efi_enabled())
1550     {
1551         status = os_get_smbios_header_uefi(pSmbsAddr);
1552     }
1553     else
1554     {
1555         status = os_get_smbios_header_legacy(pSmbsAddr);
1556     }
1557 
1558     return status;
1559 #endif
1560 }
1561 
1562 NV_STATUS NV_API_CALL os_get_acpi_rsdp_from_uefi
1563 (
1564     NvU32  *pRsdpAddr
1565 )
1566 {
1567     NV_STATUS status = NV_ERR_NOT_SUPPORTED;
1568 
1569     if (pRsdpAddr == NULL)
1570     {
1571         return NV_ERR_INVALID_STATE;
1572     }
1573 
1574     *pRsdpAddr = 0;
1575 
1576 // Make sure that efi.h is present before using "struct efi".
1577 #if (defined(NV_LINUX_EFI_H_PRESENT) && defined(CONFIG_EFI))
1578 
1579     if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
1580     {
1581         *pRsdpAddr = efi.acpi20;
1582         status = NV_OK;
1583     }
1584     else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
1585     {
1586         *pRsdpAddr = efi.acpi;
1587         status = NV_OK;
1588     }
1589     else
1590     {
1591         nv_printf(NV_DBG_ERRORS, "NVRM: RSDP Not found!\n");
1592         status = NV_ERR_OPERATING_SYSTEM;
1593     }
1594 #endif
1595 
1596     return status;
1597 }
1598 
1599 void NV_API_CALL os_add_record_for_crashLog(void *pbuffer, NvU32 size)
1600 {
1601 }
1602 
1603 void NV_API_CALL os_delete_record_for_crashLog(void *pbuffer)
1604 {
1605 }
1606 
1607 #if !defined(NV_VGPU_KVM_BUILD)
1608 NV_STATUS NV_API_CALL os_call_vgpu_vfio(void *pvgpu_vfio_info, NvU32 cmd_type)
1609 {
1610     return NV_ERR_NOT_SUPPORTED;
1611 }
1612 #endif
1613 
1614 NV_STATUS NV_API_CALL os_alloc_pages_node
1615 (
1616     NvS32  nid,
1617     NvU32  size,
1618     NvU32  flag,
1619     NvU64 *pAddress
1620 )
1621 {
1622     NV_STATUS status = NV_ERR_NOT_SUPPORTED;
1623 
1624 #if defined(__GFP_THISNODE) && defined(GFP_HIGHUSER_MOVABLE) && \
1625     defined(__GFP_COMP) && defined(__GFP_NORETRY) && defined(__GFP_NOWARN)
1626     gfp_t gfp_mask;
1627     struct page *alloc_addr;
1628     unsigned int order = get_order(size);
1629 
1630     /*
1631      * Explanation of flags used:
1632      *
1633      * 1. __GFP_THISNODE:           This will make sure the allocation happens
1634      *                              on the node specified by nid.
1635      *
1636      * 2. GFP_HIGHUSER_MOVABLE:     This makes allocations from ZONE_MOVABLE.
1637      *
1638      * 3. __GFP_COMP:               This will make allocations with compound
1639      *                              pages, which is needed in order to use
1640      *                              vm_insert_page API.
1641      *
1642      * 4. __GFP_NORETRY:            Used to avoid the Linux kernel OOM killer.
1643      *
1644      * 5. __GFP_NOWARN:             Used to avoid a WARN_ON in the slowpath if
1645      *                              the requested order is too large (just fail
1646      *                              instead).
1647      *
1648      * 6. (Optional) __GFP_RECLAIM: Used to allow/forbid reclaim.
1649      *                              This is part of GFP_USER and consequently
1650      *                              GFP_HIGHUSER_MOVABLE.
1651      *
1652      * Some of these flags are relatively more recent, with the last of them
1653      * (GFP_HIGHUSER_MOVABLE) having been added with this Linux kernel commit:
1654      *
1655      * 2007-07-17 769848c03895b63e5662eb7e4ec8c4866f7d0183
1656      *
1657      * Assume that this feature will only be used on kernels that support all
1658      * of the needed GFP flags.
1659      */
1660 
1661     gfp_mask = __GFP_THISNODE | GFP_HIGHUSER_MOVABLE | __GFP_COMP |
1662                __GFP_NORETRY | __GFP_NOWARN;
1663 
1664 #if defined(__GFP_RECLAIM)
1665     if (flag & NV_ALLOC_PAGES_NODE_SKIP_RECLAIM)
1666     {
1667         gfp_mask &= ~(__GFP_RECLAIM);
1668     }
1669 #endif // defined(__GFP_RECLAIM)
1670 
1671     alloc_addr = alloc_pages_node(nid, gfp_mask, order);
1672     if (alloc_addr == NULL)
1673     {
1674         nv_printf(NV_DBG_INFO,
1675             "NVRM: alloc_pages_node(node = %d, order = %u) failed\n",
1676             nid, order);
1677         status = NV_ERR_NO_MEMORY;
1678     }
1679     else if (page_to_nid(alloc_addr) != nid)
1680     {
1681         //
1682         // We can hit this case when a Linux kernel bug is not patched.
1683         // The needed patch is https://patchwork.kernel.org/patch/10427387/
1684         //
1685         nv_printf(NV_DBG_ERRORS,
1686             "NVRM: alloc_pages_node(node = %d, order = %u) wrong node ID.\n",
1687             nid, order);
1688         __free_pages(alloc_addr, order);
1689         status = NV_ERR_NO_MEMORY;
1690     }
1691     else
1692     {
1693         *pAddress = (NvU64)page_to_phys(alloc_addr);
1694         status = NV_OK;
1695     }
1696 #endif // GFP flags
1697 
1698     return status;
1699 }
1700 
1701 NV_STATUS NV_API_CALL os_get_page
1702 (
1703     NvU64 address
1704 )
1705 {
1706     get_page(NV_GET_PAGE_STRUCT(address));
1707     return NV_OK;
1708 }
1709 
1710 NV_STATUS NV_API_CALL os_put_page
1711 (
1712     NvU64 address
1713 )
1714 {
1715     put_page(NV_GET_PAGE_STRUCT(address));
1716     return NV_OK;
1717 }
1718 
1719 NvU32 NV_API_CALL os_get_page_refcount
1720 (
1721     NvU64 address
1722 )
1723 {
1724     return NV_PAGE_COUNT(NV_GET_PAGE_STRUCT(address));
1725 }
1726 
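/*
 * Returns the total number of pages (head plus tails) in the compound page
 * that backs the given physical address.
 */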
1727 NvU32 NV_API_CALL os_count_tail_pages
1728 (
1729     NvU64 address
1730 )
1731 {
1732     NvU32 order = compound_order(compound_head(NV_GET_PAGE_STRUCT(address)));
1733 
1734     return 1 << order;
1735 }
1736 
1737 void NV_API_CALL os_free_pages_phys
1738 (
1739     NvU64 address,
1740     NvU32 size
1741 )
1742 {
1743     __free_pages(NV_GET_PAGE_STRUCT(address), get_order(size));
1744 }
1745 
1746 NV_STATUS NV_API_CALL os_numa_memblock_size
1747 (
1748     NvU64 *memblock_size
1749 )
1750 {
1751 #if NV_IS_EXPORT_SYMBOL_PRESENT_memory_block_size_bytes
1752     *memblock_size = memory_block_size_bytes();
1753     return NV_OK;
1754 #endif
1755     if (nv_ctl_device.numa_memblock_size == 0)
1756         return NV_ERR_INVALID_STATE;
1757     *memblock_size = nv_ctl_device.numa_memblock_size;
1758     return NV_OK;
1759 }
1760 
1761 NV_STATUS NV_API_CALL os_open_temporary_file
1762 (
1763     void **ppFile
1764 )
1765 {
1766 #if NV_FILESYSTEM_ACCESS_AVAILABLE
1767 #if defined(O_TMPFILE)
1768     struct file *file;
1769     const char *default_path = "/tmp";
1770     const int flags = O_TMPFILE | O_LARGEFILE | O_RDWR;
1771     const char *path = NVreg_TemporaryFilePath;
1772 
1773     /*
1774      * The filp_open() call below depends on the current task's fs_struct
1775      * (current->fs), which may already be NULL if this is called during
1776      * process teardown.
1777      */
1778     if (current->fs == NULL)
1779     {
1780         return NV_ERR_OPERATING_SYSTEM;
1781     }
1782 
1783     if (!path)
1784     {
1785         path = default_path;
1786     }
1787 
1788     file = filp_open(path, flags, 0);
1789     if (IS_ERR(file))
1790     {
1791         if ((path != default_path) && (PTR_ERR(file) == -ENOENT))
1792         {
1793             nv_printf(NV_DBG_ERRORS,
1794                       "NVRM: The temporary file path specified via the NVreg_TemporaryFilePath\n"
1795                       "NVRM: module parameter does not exist. Defaulting to /tmp.\n");
1796 
1797             file = filp_open(default_path, flags, 0);
1798         }
1799     }
1800 
1801     if (IS_ERR(file))
1802     {
1803         return NV_ERR_OPERATING_SYSTEM;
1804     }
1805 
1806     *ppFile = (void *)file;
1807 
1808     return NV_OK;
1809 #else
1810     return NV_ERR_NOT_SUPPORTED;
1811 #endif
1812 #else
1813     return NV_ERR_NOT_SUPPORTED;
1814 #endif
1815 }
1816 
1817 void NV_API_CALL os_close_file
1818 (
1819     void *pFile
1820 )
1821 {
1822 #if NV_FILESYSTEM_ACCESS_AVAILABLE
1823     filp_close(pFile, NULL);
1824 #endif
1825 }
1826 
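/*
 * Short reads and writes are retried (advancing the buffer and shrinking the
 * remaining size) up to this many times before giving up.
 */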
1827 #define NV_MAX_NUM_FILE_IO_RETRIES 10
1828 
1829 NV_STATUS NV_API_CALL os_write_file
1830 (
1831     void *pFile,
1832     NvU8 *pBuffer,
1833     NvU64 size,
1834     NvU64 offset
1835 )
1836 {
1837 #if NV_FILESYSTEM_ACCESS_AVAILABLE
1838     loff_t f_pos = offset;
1839     ssize_t num_written;
1840     int num_retries = NV_MAX_NUM_FILE_IO_RETRIES;
1841 
1842 retry:
1843 #if defined(NV_KERNEL_WRITE_HAS_POINTER_POS_ARG)
1844     num_written = kernel_write(pFile, pBuffer, size, &f_pos);
1845 #else
1846     num_written = kernel_write(pFile, pBuffer, size, f_pos);
1847 #endif
1848     if (num_written < 0)
1849     {
1850         return NV_ERR_OPERATING_SYSTEM;
1851     }
1852     else if (num_written < size)
1853     {
1854         if (num_written > 0)
1855         {
1856             pBuffer += num_written;
1857             size -= num_written;
1858         }
1859         if (--num_retries > 0)
1860         {
1861             cond_resched();
1862             goto retry;
1863         }
1864         return NV_ERR_OPERATING_SYSTEM;
1865     }
1866 
1867     return NV_OK;
1868 #else
1869     return NV_ERR_NOT_SUPPORTED;
1870 #endif
1871 }
1872 
1873 NV_STATUS NV_API_CALL os_read_file
1874 (
1875     void *pFile,
1876     NvU8 *pBuffer,
1877     NvU64 size,
1878     NvU64 offset
1879 )
1880 {
1881 #if NV_FILESYSTEM_ACCESS_AVAILABLE
1882     loff_t f_pos = offset;
1883     ssize_t num_read;
1884     int num_retries = NV_MAX_NUM_FILE_IO_RETRIES;
1885 
1886 retry:
1887 #if defined(NV_KERNEL_READ_HAS_POINTER_POS_ARG)
1888     num_read = kernel_read(pFile, pBuffer, size, &f_pos);
1889 #else
1890     num_read = kernel_read(pFile, f_pos, pBuffer, size);
1891 #endif
1892     if (num_read < 0)
1893     {
1894         return NV_ERR_OPERATING_SYSTEM;
1895     }
1896     else if (num_read < size)
1897     {
1898         if (num_read > 0)
1899         {
1900             pBuffer += num_read;
1901             size -= num_read;
1902         }
1903         if (--num_retries > 0)
1904         {
1905             cond_resched();
1906             goto retry;
1907         }
1908         return NV_ERR_OPERATING_SYSTEM;
1909     }
1910 
1911     return NV_OK;
1912 #else
1913     return NV_ERR_NOT_SUPPORTED;
1914 #endif
1915 }
1916 
1917 NV_STATUS NV_API_CALL os_open_readonly_file
1918 (
1919     const char  *filename,
1920     void       **ppFile
1921 )
1922 {
1923 #if NV_FILESYSTEM_ACCESS_AVAILABLE
1924     struct file *file;
1925 
1926     /*
1927      * The filp_open() call below depends on the current task's fs_struct
1928      * (current->fs), which may already be NULL if this is called during
1929      * process teardown.
1930      */
1931     if (current->fs == NULL)
1932     {
1933         return NV_ERR_OPERATING_SYSTEM;
1934     }
1935 
1936     file = filp_open(filename, O_RDONLY, 0);
1937     if (IS_ERR(file))
1938     {
1939         return NV_ERR_OPERATING_SYSTEM;
1940     }
1941 
1942     *ppFile = (void *)file;
1943 
1944     return NV_OK;
1945 #else
1946     return NV_ERR_NOT_SUPPORTED;
1947 #endif
1948 }
1949 
1950 NV_STATUS NV_API_CALL os_open_and_read_file
1951 (
1952     const char *filename,
1953     NvU8       *buf,
1954     NvU64       count
1955 )
1956 {
1957     void *fileHandle;
1958     NV_STATUS status;
1959 
1960     status = os_open_readonly_file(filename, &fileHandle);
1961     if (status != NV_OK)
1962     {
1963         return status;
1964     }
1965 
1966     status = os_read_file(fileHandle, buf, count, 0);
1967 
1968     os_close_file(fileHandle);
1969 
1970     return status;
1971 }
1972 
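/*
 * NVSwitch devices expose themselves as NVIDIA PCI functions with the
 * "other bridge" class code, so the presence of any such device is used as
 * the indication that this is an NVSwitch-based system.
 */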
1973 NvBool NV_API_CALL os_is_nvswitch_present(void)
1974 {
1975     struct pci_device_id nvswitch_pci_table[] = {
1976         {
1977             PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID),
1978             .class      = PCI_CLASS_BRIDGE_OTHER << 8,
1979             .class_mask = PCI_ANY_ID
1980         },
1981         {0}
1982     };
1983 
1984     return !!pci_dev_present(nvswitch_pci_table);
1985 }
1986 
1987 /*
1988  * This function may sleep (interruptible).
1989  */
1990 NV_STATUS NV_API_CALL os_get_random_bytes
1991 (
1992     NvU8 *bytes,
1993     NvU16 numBytes
1994 )
1995 {
1996 #if defined(NV_WAIT_FOR_RANDOM_BYTES_PRESENT)
1997     if (wait_for_random_bytes() < 0)
1998         return NV_ERR_NOT_READY;
1999 #endif
2000 
2001     get_random_bytes(bytes, numBytes);
2002     return NV_OK;
2003 }
2004 
2005 NV_STATUS NV_API_CALL os_alloc_wait_queue
2006 (
2007     os_wait_queue **wq
2008 )
2009 {
2010     NV_KMALLOC(*wq, sizeof(os_wait_queue));
2011     if (*wq == NULL)
2012         return NV_ERR_NO_MEMORY;
2013 
2014     init_completion(&(*wq)->q);
2015 
2016     return NV_OK;
2017 }
2018 
2019 void NV_API_CALL os_free_wait_queue
2020 (
2021     os_wait_queue *wq
2022 )
2023 {
2024     NV_KFREE(wq, sizeof(os_wait_queue));
2025 }
2026 
2027 void NV_API_CALL os_wait_uninterruptible
2028 (
2029     os_wait_queue *wq
2030 )
2031 {
2032     wait_for_completion(&wq->q);
2033 }
2034 
2035 void NV_API_CALL os_wait_interruptible
2036 (
2037     os_wait_queue *wq
2038 )
2039 {
2040     wait_for_completion_interruptible(&wq->q);
2041 }
2042 
2043 void NV_API_CALL os_wake_up
2044 (
2045     os_wait_queue *wq
2046 )
2047 {
2048     complete_all(&wq->q);
2049 }
2050 
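//
// os_nv_cap_* - Thin wrappers around the nv_cap_* helpers that create and
// destroy the capability directory and file entries exposed by the driver,
// and validate/duplicate the file descriptors handed in for them.
//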
2051 nv_cap_t* NV_API_CALL os_nv_cap_init
2052 (
2053     const char *path
2054 )
2055 {
2056     return nv_cap_init(path);
2057 }
2058 
2059 nv_cap_t* NV_API_CALL os_nv_cap_create_dir_entry
2060 (
2061     nv_cap_t *parent_cap,
2062     const char *name,
2063     int mode
2064 )
2065 {
2066     return nv_cap_create_dir_entry(parent_cap, name, mode);
2067 }
2068 
2069 nv_cap_t* NV_API_CALL os_nv_cap_create_file_entry
2070 (
2071     nv_cap_t *parent_cap,
2072     const char *name,
2073     int mode
2074 )
2075 {
2076     return nv_cap_create_file_entry(parent_cap, name, mode);
2077 }
2078 
2079 void NV_API_CALL os_nv_cap_destroy_entry
2080 (
2081     nv_cap_t *cap
2082 )
2083 {
2084     nv_cap_destroy_entry(cap);
2085 }
2086 
2087 int NV_API_CALL os_nv_cap_validate_and_dup_fd
2088 (
2089     const nv_cap_t *cap,
2090     int fd
2091 )
2092 {
2093     return nv_cap_validate_and_dup_fd(cap, fd);
2094 }
2095 
2096 void NV_API_CALL os_nv_cap_close_fd
2097 (
2098     int fd
2099 )
2100 {
2101     nv_cap_close_fd(fd);
2102 }
2103 
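//
// os_imex_channel_count / os_imex_channel_get - Wrappers around the
// nv-caps-imex helpers: report the number of available IMEX channels and
// resolve an IMEX channel from a caller-supplied descriptor.
//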
2104 NvS32 NV_API_CALL os_imex_channel_count
2105 (
2106     void
2107 )
2108 {
2109     return nv_caps_imex_channel_count();
2110 }
2111 
2112 NvS32 NV_API_CALL os_imex_channel_get
2113 (
2114     NvU64 descriptor
2115 )
2116 {
2117     return nv_caps_imex_channel_get((int)descriptor);
2118 }
2119 
2120 /*
2121  * Reads the total memory and free memory of a NUMA node from the kernel.
2122  */
2123 NV_STATUS NV_API_CALL os_get_numa_node_memory_usage
2124 (
2125     NvS32 node_id,
2126     NvU64 *free_memory_bytes,
2127     NvU64 *total_memory_bytes
2128 )
2129 {
2130     struct pglist_data *pgdat;
2131     struct zone *zone;
2132     NvU32 zone_id;
2133 
    if ((node_id < 0) || (node_id >= MAX_NUMNODES))
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: Invalid NUMA node ID\n");
        return NV_ERR_INVALID_ARGUMENT;
    }
2139 
2140     pgdat = NODE_DATA(node_id);
2141 
2142     *free_memory_bytes = 0;
2143     *total_memory_bytes = 0;
2144 
2145     for (zone_id = 0; zone_id < MAX_NR_ZONES; zone_id++)
2146     {
2147         zone = &(pgdat->node_zones[zone_id]);
2148         if (!populated_zone(zone))
2149             continue;
2150         *free_memory_bytes += (zone_page_state_snapshot(zone, NR_FREE_PAGES) * PAGE_SIZE);
2151         *total_memory_bytes += (zone->present_pages * PAGE_SIZE);
2152     }
2153 
2154     return NV_OK;
2155 }
2156 
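//
// Memory hotplug notifier context used while onlining coherent GPU memory:
// it records the physical range being added so that
// os_numa_verify_gpu_memory_zone() can reject (NOTIFY_BAD) any attempt to
// online part of that range into a zone other than ZONE_MOVABLE.
//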
2157 typedef struct os_numa_gpu_mem_hotplug_notifier_s
2158 {
2159     NvU64 start_pa;
2160     NvU64 size;
2161     nv_pci_info_t pci_info;
2162     struct notifier_block memory_notifier;
2163 } os_numa_gpu_mem_hotplug_notifier_t;
2164 
2165 static int os_numa_verify_gpu_memory_zone(struct notifier_block *nb,
2166                                           unsigned long action, void *data)
2167 {
2168     os_numa_gpu_mem_hotplug_notifier_t *notifier = container_of(nb,
2169         os_numa_gpu_mem_hotplug_notifier_t,
2170         memory_notifier);
2171     struct memory_notify *mhp = data;
2172     NvU64 start_pa = PFN_PHYS(mhp->start_pfn);
2173     NvU64 size = PFN_PHYS(mhp->nr_pages);
2174 
2175     if (action == MEM_GOING_ONLINE)
2176     {
2177         // Check if onlining memory falls in the GPU memory range
2178         if ((start_pa >= notifier->start_pa) &&
2179             (start_pa + size) <= (notifier->start_pa + notifier->size))
2180         {
            /*
             * Verify that the GPU memory NUMA node has memory only in
             * ZONE_MOVABLE before onlining it, so that an incorrect auto
             * online setting doesn't place the memory in a zone where kernel
             * allocations could happen; that would leave the GPU memory
             * impossible to hot-unplug without a system reboot.
             */
2188             if (page_zonenum((pfn_to_page(mhp->start_pfn))) != ZONE_MOVABLE)
2189             {
2190                 nv_printf(NV_DBG_ERRORS, "NVRM: Failing GPU memory onlining as the onlining zone "
2191                           "is not movable. pa: 0x%llx size: 0x%llx\n"
2192                           "NVRM: The NVIDIA GPU %04x:%02x:%02x.%x installed in the system\n"
2193                           "NVRM: requires auto onlining mode online_movable enabled in\n"
2194                           "NVRM: /sys/devices/system/memory/auto_online_blocks\n",
2195                           start_pa, size, notifier->pci_info.domain, notifier->pci_info.bus,
2196                           notifier->pci_info.slot, notifier->pci_info.function);
2197                 return NOTIFY_BAD;
2198             }
2199         }
2200     }
2201     return NOTIFY_OK;
2202 }
2203 
2204 #define ADD_REMOVE_GPU_MEMORY_NUM_SEGMENTS 4
2205 
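//
// os_numa_add_gpu_memory - Online coherent GPU memory as a driver-managed
// NUMA node. The memory is added in memblock-aligned segments with schedule()
// calls in between to avoid soft lockups, verified to have landed entirely in
// ZONE_MOVABLE, and the resulting node id is returned through 'nodeId'.
//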
2206 NV_STATUS NV_API_CALL os_numa_add_gpu_memory
2207 (
2208     void *handle,
2209     NvU64 offset,
2210     NvU64 size,
2211     NvU32 *nodeId
2212 )
2213 {
2214 #if defined(NV_ADD_MEMORY_DRIVER_MANAGED_PRESENT)
2215     int node = 0;
2216     nv_linux_state_t *nvl = pci_get_drvdata(handle);
2217     nv_state_t *nv = NV_STATE_PTR(nvl);
2218     NvU64 base = offset + nvl->coherent_link_info.gpu_mem_pa;
2219     int ret = 0;
2220     NvU64 memblock_size;
2221     NvU64 size_remaining;
2222     NvU64 calculated_segment_size;
2223     NvU64 segment_size;
2224     NvU64 segment_base;
2225     os_numa_gpu_mem_hotplug_notifier_t notifier =
2226     {
2227         .start_pa = base,
2228         .size = size,
2229         .pci_info = nv->pci_info,
2230         .memory_notifier.notifier_call = os_numa_verify_gpu_memory_zone,
2231     };
2232 
2233     if (nodeId == NULL)
2234     {
2235         return NV_ERR_INVALID_ARGUMENT;
2236     }
2237 
2238     if (bitmap_empty(nvl->coherent_link_info.free_node_bitmap, MAX_NUMNODES))
2239     {
2240         return NV_ERR_IN_USE;
2241     }
2242     node = find_first_bit(nvl->coherent_link_info.free_node_bitmap, MAX_NUMNODES);
2243     if (node == MAX_NUMNODES)
2244     {
2245         return NV_ERR_INVALID_STATE;
2246     }
2247 
2248     NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_ONLINE_IN_PROGRESS);
2249 
2250     ret = register_memory_notifier(&notifier.memory_notifier);
2251     if (ret)
2252     {
2253         nv_printf(NV_DBG_ERRORS, "NVRM: Memory hotplug notifier registration failed\n");
2254         goto failed;
2255     }
2256 
2257     //
2258     // Adding all memory at once can take a long time. Split up memory into segments
2259     // with schedule() in between to prevent soft lockups. Memory segments for
2260     // add_memory_driver_managed() need to be aligned to memblock size.
2261     //
2262     // If there are any issues splitting into segments, then add all memory at once.
2263     //
2264     if (os_numa_memblock_size(&memblock_size) == NV_OK)
2265     {
2266         calculated_segment_size = NV_ALIGN_UP(size / ADD_REMOVE_GPU_MEMORY_NUM_SEGMENTS, memblock_size);
2267     }
2268     else
2269     {
2270         // Don't split into segments, add all memory at once
2271         calculated_segment_size = size;
2272     }
2273 
2274     segment_size = calculated_segment_size;
2275     segment_base = base;
2276     size_remaining = size;
2277 
2278     while ((size_remaining > 0) &&
2279            (ret == 0))
2280     {
2281         if (segment_size > size_remaining)
2282         {
2283             segment_size = size_remaining;
2284         }
2285 
2286 #ifdef NV_ADD_MEMORY_DRIVER_MANAGED_HAS_MHP_FLAGS_ARG
2287         ret = add_memory_driver_managed(node, segment_base, segment_size, "System RAM (NVIDIA)", MHP_NONE);
2288 #else
2289         ret = add_memory_driver_managed(node, segment_base, segment_size, "System RAM (NVIDIA)");
2290 #endif
2291         nv_printf(NV_DBG_SETUP, "NVRM: add_memory_driver_managed() returns: %d for segment_base: 0x%llx, segment_size: 0x%llx\n",
2292                   ret, segment_base, segment_size);
2293 
2294         segment_base += segment_size;
2295         size_remaining -= segment_size;
2296 
2297         // Yield CPU to prevent soft lockups
2298         schedule();
2299     }
2300     unregister_memory_notifier(&notifier.memory_notifier);
2301 
2302     if (ret == 0)
2303     {
2304         struct zone *zone = &NODE_DATA(node)->node_zones[ZONE_MOVABLE];
2305         NvU64 start_pfn = base >> PAGE_SHIFT;
2306         NvU64 end_pfn = (base + size) >> PAGE_SHIFT;
2307 
2308         /* Verify the full GPU memory range passed on is onlined */
2309         if (zone->zone_start_pfn != start_pfn ||
2310             zone_end_pfn(zone) != end_pfn)
2311         {
2312             nv_printf(NV_DBG_ERRORS, "NVRM: GPU memory zone movable auto onlining failed!\n");
2313 
2314 #ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT
2315             // Since zone movable auto onlining failed, need to remove the added memory.
2316             segment_size = calculated_segment_size;
2317             segment_base = base;
2318             size_remaining = size;
2319 
2320             while (size_remaining > 0)
2321             {
2322                 if (segment_size > size_remaining)
2323                 {
2324                     segment_size = size_remaining;
2325                 }
2326 
2327 #ifdef NV_REMOVE_MEMORY_HAS_NID_ARG
2328                 ret = offline_and_remove_memory(node, segment_base, segment_size);
2329 #else
2330                 ret = offline_and_remove_memory(segment_base, segment_size);
2331 #endif
2332                 nv_printf(NV_DBG_SETUP, "NVRM: offline_and_remove_memory() returns: %d for segment_base: 0x%llx, segment_size: 0x%llx\n",
2333                           ret, segment_base, segment_size);
2334 
2335                 segment_base += segment_size;
2336                 size_remaining -= segment_size;
2337 
2338                 // Yield CPU to prevent soft lockups
2339                 schedule();
2340             }
2341 #endif
2342             goto failed;
2343         }
2344 
        /*
         * On systems with the cpuset cgroup controller enabled, memory
         * allocation on this just-hotplugged GPU memory node can fail if
         * cpuset_hotplug_work has not been scheduled yet. cpuset_hotplug_work
         * is where current->mems_allowed is updated, in the path
         * cpuset_hotplug_workfn->update_tasks_nodemask. While cpuset is
         * enabled and current->mems_allowed has not been updated, memory
         * allocation with __GFP_THISNODE and this node id fails. The
         * cpuset_wait_for_hotplug() kernel function could be used to wait for
         * the work to finish, but it is not exported. As a WAR, poll for a
         * bounded time until current->mems_allowed is updated, while an
         * upstream kernel fix is being explored. Bug 4385903
         */
2358         if (!node_isset(node, cpuset_current_mems_allowed))
2359         {
2360             unsigned long delay;
2361 
2362             delay = jiffies + (HZ / 10); // 100ms
            while (time_before(jiffies, delay) &&
                   !node_isset(node, cpuset_current_mems_allowed))
2365             {
2366                 os_schedule();
2367             }
2368 
2369             if (!node_isset(node, cpuset_current_mems_allowed))
2370             {
2371                 nv_printf(NV_DBG_ERRORS, "NVRM: Hotplugged GPU memory NUMA node: %d "
2372                           "not set in current->mems_allowed!\n", node);
2373             }
2374         }
2375 
2376         *nodeId = node;
2377         clear_bit(node, nvl->coherent_link_info.free_node_bitmap);
2378         NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_ONLINE);
2379         return NV_OK;
2380     }
    nv_printf(NV_DBG_ERRORS, "NVRM: Memory add failed. base: 0x%llx size: 0x%llx ret: %d\n",
              base, size, ret);
2383 failed:
2384     NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_ONLINE_FAILED);
2385     return NV_ERR_OPERATING_SYSTEM;
2386 #endif
2387     return NV_ERR_NOT_SUPPORTED;
2388 }
2389 
2390 
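//
// Work item context for offline_numa_memory_callback(): the physical range
// and NUMA node to offline, plus the OR-accumulated return value of the
// offline_and_remove_memory() calls.
//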
2391 typedef struct {
2392     NvU64 base;
2393     NvU64 size;
2394     NvU32 nodeId;
2395     int ret;
2396 } remove_numa_memory_info_t;
2397 
2398 static void offline_numa_memory_callback
2399 (
2400     void *args
2401 )
2402 {
2403 #ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT
2404     remove_numa_memory_info_t *pNumaInfo = (remove_numa_memory_info_t *)args;
2405     int ret = 0;
2406     NvU64 memblock_size;
2407     NvU64 size_remaining;
2408     NvU64 calculated_segment_size;
2409     NvU64 segment_size;
2410     NvU64 segment_base;
2411 
2412     //
2413     // Removing all memory at once can take a long time. Split up memory into segments
2414     // with schedule() in between to prevent soft lockups. Memory segments for
2415     // offline_and_remove_memory() need to be aligned to memblock size.
2416     //
2417     // If there are any issues splitting into segments, then remove all memory at once.
2418     //
2419     if (os_numa_memblock_size(&memblock_size) == NV_OK)
2420     {
2421         calculated_segment_size = NV_ALIGN_UP(pNumaInfo->size / ADD_REMOVE_GPU_MEMORY_NUM_SEGMENTS, memblock_size);
2422     }
2423     else
2424     {
2425         // Don't split into segments, remove all memory at once
2426         calculated_segment_size = pNumaInfo->size;
2427     }
2428 
2429     segment_size = calculated_segment_size;
2430     segment_base = pNumaInfo->base;
2431     size_remaining = pNumaInfo->size;
2432 
2433     while (size_remaining > 0)
2434     {
2435         if (segment_size > size_remaining)
2436         {
2437             segment_size = size_remaining;
2438         }
2439 
2440 #ifdef NV_REMOVE_MEMORY_HAS_NID_ARG
2441         ret = offline_and_remove_memory(pNumaInfo->nodeId,
2442                                         segment_base,
2443                                         segment_size);
2444 #else
2445         ret = offline_and_remove_memory(segment_base,
2446                                         segment_size);
2447 #endif
2448         nv_printf(NV_DBG_SETUP, "NVRM: offline_and_remove_memory() returns: %d for segment_base: 0x%llx, segment_size: 0x%llx\n",
2449                   ret, segment_base, segment_size);
2450         pNumaInfo->ret |= ret;
2451 
2452         segment_base += segment_size;
2453         size_remaining -= segment_size;
2454 
2455         // Yield CPU to prevent soft lockups
2456         schedule();
2457     }
2458 #endif
2459 }
2460 
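//
// os_numa_remove_gpu_memory - Offline and remove previously onlined coherent
// GPU memory. The offline_and_remove_memory() work is performed by
// offline_numa_memory_callback() on the device's remove_numa_memory_q kthread
// queue and flushed before returning; on success the node is marked free
// again in the coherent link info bitmap.
//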
2461 NV_STATUS NV_API_CALL os_numa_remove_gpu_memory
2462 (
2463     void *handle,
2464     NvU64 offset,
2465     NvU64 size,
2466     NvU32 nodeId
2467 )
2468 {
2469 #ifdef NV_ADD_MEMORY_DRIVER_MANAGED_PRESENT
2470     nv_linux_state_t *nvl = pci_get_drvdata(handle);
2471 #ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT
2472     NvU64 base = offset + nvl->coherent_link_info.gpu_mem_pa;
2473     remove_numa_memory_info_t numa_info;
2474     nv_kthread_q_item_t remove_numa_memory_q_item;
2475     int ret;
2476 #endif
2477 
2478     if (nodeId >= MAX_NUMNODES)
2479     {
2480         return NV_ERR_INVALID_ARGUMENT;
2481     }
2482     if ((nodeId == NUMA_NO_NODE) || test_bit(nodeId, nvl->coherent_link_info.free_node_bitmap))
2483     {
2484         return NV_ERR_INVALID_ARGUMENT;
2485     }
2486 
2487     NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_OFFLINE_IN_PROGRESS);
2488 
2489 #ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT
2490     numa_info.base   = base;
2491     numa_info.size   = size;
2492     numa_info.nodeId = nodeId;
2493     numa_info.ret    = 0;
2494 
2495     nv_kthread_q_item_init(&remove_numa_memory_q_item,
2496                            offline_numa_memory_callback,
2497                            &numa_info);
2498     nv_kthread_q_schedule_q_item(&nvl->remove_numa_memory_q,
2499                                  &remove_numa_memory_q_item);
2500     nv_kthread_q_flush(&nvl->remove_numa_memory_q);
2501 
2502     ret = numa_info.ret;
2503 
2504     if (ret == 0)
2505     {
2506         set_bit(nodeId, nvl->coherent_link_info.free_node_bitmap);
2507 
2508         NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_OFFLINE);
2509         return NV_OK;
2510     }
2511 
    nv_printf(NV_DBG_ERRORS, "NVRM: Memory remove failed. base: 0x%llx size: 0x%llx ret: %d\n",
              base, size, ret);
2514 #endif
2515     NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_OFFLINE_FAILED);
2516     return NV_ERR_OPERATING_SYSTEM;
2517 #endif
2518     return NV_ERR_NOT_SUPPORTED;
2519 }
2520 
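//
// os_offline_page_at_address - Offline the system memory page backing the
// given address by reporting it to the kernel's memory_failure() machinery
// (as a software-simulated failure where supported). Requires
// CONFIG_MEMORY_FAILURE.
//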
2521 NV_STATUS NV_API_CALL os_offline_page_at_address
2522 (
2523     NvU64 address
2524 )
2525 {
2526 #if defined(CONFIG_MEMORY_FAILURE)
2527     int flags = 0;
2528     int ret;
2529     NvU64 pfn;
2530     struct page *page = NV_GET_PAGE_STRUCT(address);
2531 
2532     if (page == NULL)
2533     {
2534         nv_printf(NV_DBG_ERRORS, "NVRM: Failed to get page struct for address: 0x%llx\n",
2535                   address);
2536         return NV_ERR_INVALID_ARGUMENT;
2537     }
2538 
2539     pfn = page_to_pfn(page);
2540 
2541 #ifdef NV_MEMORY_FAILURE_MF_SW_SIMULATED_DEFINED
    //
    // Set the MF_SW_SIMULATED flag so the Linux kernel can differentiate this
    // from a HW memory failure. HW memory failures cannot be unset via the
    // unpoison_memory() API.
    //
    // Currently, RM does not use unpoison_memory(), so it makes no difference
    // whether or not MF_SW_SIMULATED is set. Regardless, it is semantically
    // more correct to set MF_SW_SIMULATED.
    //
2550     flags |= MF_SW_SIMULATED;
2551 #endif
2552 
2553 #ifdef NV_MEMORY_FAILURE_HAS_TRAPNO_ARG
2554     ret = memory_failure(pfn, 0, flags);
2555 #else
2556     ret = memory_failure(pfn, flags);
2557 #endif
2558 
2559     if (ret != 0)
2560     {
2561         nv_printf(NV_DBG_ERRORS, "NVRM: page offlining failed. address: 0x%llx pfn: 0x%llx ret: %d\n",
2562                   address, pfn, ret);
2563         return NV_ERR_OPERATING_SYSTEM;
2564     }
2565 
2566     return NV_OK;
2567 #else // !defined(CONFIG_MEMORY_FAILURE)
2568     nv_printf(NV_DBG_ERRORS, "NVRM: memory_failure() not supported by kernel. page offlining failed. address: 0x%llx\n",
2569               address);
2570     return NV_ERR_NOT_SUPPORTED;
2571 #endif
2572 }
2573 
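//
// os_get_pid_info / os_put_pid_info / os_find_ns_pid - Take a reference on
// the current task's struct pid, drop such a reference, and translate a
// struct pid to the PID number visible in the caller's namespace.
//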
2574 void* NV_API_CALL os_get_pid_info(void)
2575 {
2576     return get_task_pid(current, PIDTYPE_PID);
2577 }
2578 
2579 void NV_API_CALL os_put_pid_info(void *pid_info)
2580 {
2581     if (pid_info != NULL)
2582         put_pid(pid_info);
2583 }
2584 
2585 NV_STATUS NV_API_CALL os_find_ns_pid(void *pid_info, NvU32 *ns_pid)
2586 {
2587     if ((pid_info == NULL) || (ns_pid == NULL))
2588         return NV_ERR_INVALID_ARGUMENT;
2589 
2590     *ns_pid = pid_vnr((struct pid *)pid_info);
2591 
2592     // The call returns 0 if the PID is not found in the current ns
2593     if (*ns_pid == 0)
2594         return NV_ERR_OBJECT_NOT_FOUND;
2595 
2596     return NV_OK;
2597 }
2598 
2599