1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2001-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #ifndef _NV_LINUX_H_
25 #define _NV_LINUX_H_
26 
27 #include "nvstatus.h"
28 #include "nv.h"
29 #include "nv-ioctl-numa.h"
30 #include "conftest.h"
31 
32 #include "nv-lock.h"
33 #include "nv-pgprot.h"
34 #include "nv-mm.h"
35 #include "os-interface.h"
36 #include "nv-timer.h"
37 #include "nv-time.h"
38 
39 #define NV_KERNEL_NAME "Linux"
40 
41 #ifndef AUTOCONF_INCLUDED
42 #if defined(NV_GENERATED_AUTOCONF_H_PRESENT)
43 #include <generated/autoconf.h>
44 #else
45 #include <linux/autoconf.h>
46 #endif
47 #endif
48 
49 #if defined(NV_GENERATED_UTSRELEASE_H_PRESENT)
50   #include <generated/utsrelease.h>
51 #endif
52 
53 #if defined(NV_GENERATED_COMPILE_H_PRESENT)
54   #include <generated/compile.h>
55 #endif
56 
57 #include <linux/version.h>
58 #include <linux/utsname.h>
59 
60 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32)
61 #error "This driver does not support kernels older than 2.6.32!"
62 #elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 7, 0)
63 #  define KERNEL_2_6
64 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
65 #  define KERNEL_3
66 #else
67 #error "This driver does not support development kernels!"
68 #endif
69 
70 #if defined (CONFIG_SMP) && !defined (__SMP__)
71 #define __SMP__
72 #endif
73 
74 #if defined (CONFIG_MODVERSIONS) && !defined (MODVERSIONS)
75 #  define MODVERSIONS
76 #endif
77 
78 #include <linux/kernel.h>
79 #include <linux/module.h>
80 #include <linux/kmod.h>
81 #include <asm/bug.h>
82 
83 #include <linux/mm.h>
84 
85 #if !defined(VM_RESERVED)
86 #define VM_RESERVED    0x00000000
87 #endif
88 #if !defined(VM_DONTEXPAND)
89 #define VM_DONTEXPAND  0x00000000
90 #endif
91 #if !defined(VM_DONTDUMP)
92 #define VM_DONTDUMP    0x00000000
93 #endif
94 
95 #include <linux/init.h>             /* module_init, module_exit         */
#include <linux/types.h>            /* pid_t, size_t, __u32, etc        */
97 #include <linux/errno.h>            /* error codes                      */
98 #include <linux/list.h>             /* circular linked list             */
99 #include <linux/stddef.h>           /* NULL, offsetof                   */
100 #include <linux/wait.h>             /* wait queues                      */
101 #include <linux/string.h>           /* strchr(), strpbrk()              */
102 
103 #include <linux/ctype.h>            /* isspace(), etc                   */
104 #include <linux/console.h>          /* acquire_console_sem(), etc       */
105 #include <linux/cpufreq.h>          /* cpufreq_get                      */
106 
107 #include <linux/slab.h>             /* kmalloc, kfree, etc              */
108 #include <linux/vmalloc.h>          /* vmalloc, vfree, etc              */
109 
110 #include <linux/poll.h>             /* poll_wait                        */
111 #include <linux/delay.h>            /* mdelay, udelay                   */
112 
113 #include <linux/sched.h>            /* suser(), capable() replacement   */
114 
115 #include <linux/random.h>           /* get_random_bytes()               */
116 
117 #if defined(NV_LINUX_DMA_BUF_H_PRESENT)
118 #include <linux/dma-buf.h>
119 #endif
120 
121 #if defined(NV_DRM_AVAILABLE)
122 #if defined(NV_DRM_DRM_DEVICE_H_PRESENT)
123 #include <drm/drm_device.h>
124 #endif
125 
126 #if defined(NV_DRM_DRM_DRV_H_PRESENT)
127 #include <drm/drm_drv.h>
128 #endif
129 
130 #if defined(NV_DRM_DRMP_H_PRESENT)
131 #include <drm/drmP.h>
132 #endif
133 
134 #if defined(NV_DRM_DRM_GEM_H_PRESENT)
135 #include <drm/drm_gem.h>
136 #endif
137 #endif /* NV_DRM_AVAILABLE */
138 
139 /*
140  * sched.h was refactored with this commit (as part of Linux 4.11)
141  *   2017-03-03  1827adb11ad26b2290dc9fe2aaf54976b2439865
142  */
143 #if defined(NV_LINUX_SCHED_SIGNAL_H_PRESENT)
144 #include <linux/sched/signal.h>     /* task_lock(), task_unlock()       */
145 #endif
146 
147 #if defined(NV_LINUX_SCHED_TASK_H_PRESENT)
148 #include <linux/sched/task.h>       /* task_lock(), task_unlock()       */
149 #endif
150 
151 /* task and signal-related items, for kernels < 4.11: */
152 #include <linux/sched.h>            /* task_lock(), task_unlock()       */
153 
154 #include <linux/moduleparam.h>      /* module_param()                   */
155 #include <asm/tlbflush.h>           /* flush_tlb(), flush_tlb_all()     */
156 
157 #include <linux/pci.h>              /* pci_find_class, etc              */
158 #include <linux/interrupt.h>        /* tasklets, interrupt helpers      */
159 #include <linux/timer.h>
160 #include <linux/file.h>             /* fget(), fput()                   */
161 #include <linux/rbtree.h>
162 #include <linux/cpu.h>              /* CPU hotplug support              */
163 
164 #include <linux/pm_runtime.h>       /* pm_runtime_*                     */
165 #include <linux/fdtable.h>          /* files_fdtable, etc               */
166 
167 #include <asm/div64.h>              /* do_div()                         */
168 #if defined(NV_ASM_SYSTEM_H_PRESENT)
#include <asm/system.h>             /* cli, sti, save_flags             */
170 #endif
171 #include <asm/io.h>                 /* ioremap, virt_to_phys            */
172 #include <asm/uaccess.h>            /* access_ok                        */
173 #include <asm/page.h>               /* PAGE_OFFSET                      */
174 #include <asm/pgtable.h>            /* pte bit definitions              */
175 #include <asm/bitops.h>             /* __set_bit()                      */
176 
177 #if defined(NV_LINUX_TIME_H_PRESENT)
178 #include <linux/time.h>             /* FD_SET()                         */
179 #endif
180 
181 #include "nv-list-helpers.h"
182 
183 /*
 * Use current->cred->euid instead of calling current_euid().
 * The latter can pull in the GPL-only debug_lockdep_rcu_enabled() symbol
 * when CONFIG_PROVE_RCU is enabled; that symbol is only used for debugging.
 *
188  * The Linux kernel relies on the assumption that only the current process
189  * is permitted to change its cred structure. Therefore, current_euid()
 * does not require the RCU read lock on current->cred.
191  */
192 #define NV_CURRENT_EUID() (__kuid_val(current->cred->euid))
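
/*
 * Illustrative usage (hypothetical caller, not part of this header): a
 * permission check might compare the effective UID against root:
 *
 *     if (NV_CURRENT_EUID() != 0)
 *         return -EPERM;
 */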
193 
194 #if defined(CONFIG_VGA_ARB)
195 #include <linux/vgaarb.h>
196 #endif
197 
198 #include <linux/pagemap.h>
199 #include <linux/dma-mapping.h>
200 
201 #if defined(NV_LINUX_DMA_MAP_OPS_H_PRESENT)
202 #include <linux/dma-map-ops.h>
203 #endif
204 
205 #if defined(CONFIG_SWIOTLB) && defined(NVCPU_AARCH64)
206 #include <linux/swiotlb.h>
207 #endif
208 
209 #include <linux/scatterlist.h>
210 #include <linux/completion.h>
211 #include <linux/highmem.h>
212 
213 #include <linux/nodemask.h>
214 #include <linux/memory.h>
215 
216 #include <linux/workqueue.h>        /* workqueue                        */
217 #include "nv-kthread-q.h"           /* kthread based queue              */
218 
219 #if defined(NV_LINUX_EFI_H_PRESENT)
220 #include <linux/efi.h>              /* efi_enabled                      */
221 #endif
222 
223 #include <linux/fb.h>               /* fb_info struct                   */
224 #include <linux/screen_info.h>      /* screen_info                      */
225 
226 #if !defined(CONFIG_PCI)
227 #warning "Attempting to build driver for a platform with no PCI support!"
228 #include <asm-generic/pci-dma-compat.h>
229 #endif
230 
231 #if defined(CONFIG_CRAY_XT)
232 #include <cray/cray_nvidia.h>
233 NV_STATUS nvos_forward_error_to_cray(struct pci_dev *, NvU32,
234         const char *, va_list);
235 #endif
236 
237 #if defined(NVCPU_PPC64LE) && defined(CONFIG_EEH)
238 #include <asm/eeh.h>
239 #define NV_PCI_ERROR_RECOVERY_ENABLED() eeh_enabled()
240 #define NV_PCI_ERROR_RECOVERY
241 #endif
242 
243 #if defined(NV_ASM_SET_MEMORY_H_PRESENT)
244 #include <asm/set_memory.h>
245 #endif
246 
247 #if defined(NV_SET_MEMORY_UC_PRESENT)
248 #undef NV_SET_PAGES_UC_PRESENT
249 #endif
250 
251 #if !defined(NVCPU_AARCH64) && !defined(NVCPU_PPC64LE)
252 #if !defined(NV_SET_MEMORY_UC_PRESENT) && !defined(NV_SET_PAGES_UC_PRESENT)
253 #error "This driver requires the ability to change memory types!"
254 #endif
255 #endif
256 
257 /*
258  * Traditionally, CONFIG_XEN indicated that the target kernel was
259  * built exclusively for use under a Xen hypervisor, requiring
260  * modifications to or disabling of a variety of NVIDIA graphics
261  * driver code paths. As of the introduction of CONFIG_PARAVIRT
262  * and support for Xen hypervisors within the CONFIG_PARAVIRT_GUEST
263  * architecture, CONFIG_XEN merely indicates that the target
264  * kernel can run under a Xen hypervisor, but not that it will.
265  *
266  * If CONFIG_XEN and CONFIG_PARAVIRT are defined, the old Xen
267  * specific code paths are disabled. If the target kernel executes
268  * stand-alone, the NVIDIA graphics driver will work fine. If the
 * kernel executes under a Xen (or other) hypervisor, however, the
270  * NVIDIA graphics driver has no way of knowing and is unlikely
271  * to work correctly.
272  */
273 #if defined(CONFIG_XEN) && !defined(CONFIG_PARAVIRT)
274 #include <asm/maddr.h>
275 #include <xen/interface/memory.h>
276 #define NV_XEN_SUPPORT_FULLY_VIRTUALIZED_KERNEL
277 #endif
278 
279 #ifdef CONFIG_KDB
280 #include <linux/kdb.h>
281 #include <asm/kdb.h>
282 #endif
283 
284 #if defined(CONFIG_X86_REMOTE_DEBUG)
285 #include <linux/gdb.h>
286 #endif
287 
288 #if defined(DEBUG) && defined(CONFIG_KGDB) && \
289     defined(NVCPU_AARCH64)
290 #include <asm/kgdb.h>
291 #endif
292 
293 #if defined(NVCPU_X86_64) && !defined(NV_XEN_SUPPORT_FULLY_VIRTUALIZED_KERNEL)
294 #define NV_ENABLE_PAT_SUPPORT
295 #endif
296 
297 #define NV_PAT_MODE_DISABLED    0
298 #define NV_PAT_MODE_KERNEL      1
299 #define NV_PAT_MODE_BUILTIN     2
300 
301 extern int nv_pat_mode;
302 
303 #if defined(CONFIG_HOTPLUG_CPU)
304 #define NV_ENABLE_HOTPLUG_CPU
305 #include <linux/notifier.h>         /* struct notifier_block, etc       */
306 #endif
307 
308 #if (defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE))
309 #include <linux/i2c.h>
310 #endif
311 
312 #if defined(CONFIG_ACPI)
313 #include <linux/acpi.h>
314 #define NV_LINUX_ACPI_EVENTS_SUPPORTED 1
315 #endif
316 
317 #if defined(NV_LINUX_ACPI_EVENTS_SUPPORTED)
318 #define NV_ACPI_WALK_NAMESPACE(type, start_object, max_depth, \
319         user_function, args...) \
320     acpi_walk_namespace(type, start_object, max_depth, \
321             user_function, NULL, args)
322 #endif
323 
324 #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_PREEMPT_RT_FULL)
325 #define NV_CONFIG_PREEMPT_RT 1
326 #endif
327 
328 #if defined(NV_WRITE_CR4_PRESENT)
329 #define NV_READ_CR4()       read_cr4()
330 #define NV_WRITE_CR4(cr4)   write_cr4(cr4)
331 #else
332 #define NV_READ_CR4()       __read_cr4()
333 #define NV_WRITE_CR4(cr4)   __write_cr4(cr4)
334 #endif
335 
336 #ifndef get_cpu
337 #define get_cpu() smp_processor_id()
338 #define put_cpu()
339 #endif
340 
341 #if !defined(unregister_hotcpu_notifier)
342 #define unregister_hotcpu_notifier unregister_cpu_notifier
343 #endif
344 #if !defined(register_hotcpu_notifier)
345 #define register_hotcpu_notifier register_cpu_notifier
346 #endif
347 
348 #if defined(NVCPU_X86_64)
349 #if !defined(pmd_large)
350 #define pmd_large(_pmd) \
351     ((pmd_val(_pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
352 #endif
353 #endif /* defined(NVCPU_X86_64) */
354 
355 #define NV_PAGE_COUNT(page) \
356   ((unsigned int)page_count(page))
357 #define NV_GET_PAGE_COUNT(page_ptr) \
358   (NV_PAGE_COUNT(NV_GET_PAGE_STRUCT(page_ptr->phys_addr)))
359 #define NV_GET_PAGE_FLAGS(page_ptr) \
360   (NV_GET_PAGE_STRUCT(page_ptr->phys_addr)->flags)
361 
362 /*
 * Before the introduction of VM_PFNMAP, there was a VM_UNPAGED flag.
364  * Drivers which wanted to call remap_pfn_range on normal pages had to use this
365  * VM_UNPAGED flag *and* set PageReserved. With the introduction of VM_PFNMAP,
366  * that restriction went away. This is described in commit
367  *
368  *   2005-10-28 6aab341e0a28aff100a09831c5300a2994b8b986
369  *     ("mm: re-architect the VM_UNPAGED logic")
370  *
 * , which added VM_PFNMAP and vm_normal_page(). Therefore, if VM_PFNMAP is
 * defined, then we do *not* need to mark a page as reserved in order to
 * call remap_pfn_range().
374  */
375 #if !defined(VM_PFNMAP)
#define NV_MAYBE_RESERVE_PAGE(page_ptr) \
377   SetPageReserved(NV_GET_PAGE_STRUCT(page_ptr->phys_addr))
378 #define NV_MAYBE_UNRESERVE_PAGE(page_ptr) \
379   ClearPageReserved(NV_GET_PAGE_STRUCT(page_ptr->phys_addr))
380 #else
#define NV_MAYBE_RESERVE_PAGE(page_ptr)
382 #define NV_MAYBE_UNRESERVE_PAGE(page_ptr)
383 #endif /* defined(VM_PFNMAP) */
384 
385 #if !defined(__GFP_COMP)
386 #define __GFP_COMP 0
387 #endif
388 
389 #if !defined(DEBUG) && defined(__GFP_NOWARN)
390 #define NV_GFP_KERNEL (GFP_KERNEL | __GFP_NOWARN)
391 #define NV_GFP_ATOMIC (GFP_ATOMIC | __GFP_NOWARN)
392 #else
393 #define NV_GFP_KERNEL (GFP_KERNEL)
394 #define NV_GFP_ATOMIC (GFP_ATOMIC)
395 #endif
396 
397 #if defined(GFP_DMA32)
398 /*
399  * GFP_DMA32 is similar to GFP_DMA, but instructs the Linux zone
400  * allocator to allocate memory from the first 4GB on platforms
401  * such as Linux/x86-64; the alternative is to use an IOMMU such
402  * as the one implemented with the K8 GART, if available.
403  */
404 #define NV_GFP_DMA32 (NV_GFP_KERNEL | GFP_DMA32)
405 #else
406 #define NV_GFP_DMA32 (NV_GFP_KERNEL)
407 #endif
408 
409 extern NvBool nvos_is_chipset_io_coherent(void);
410 
411 #if defined(NVCPU_X86_64)
412 #define CACHE_FLUSH()  asm volatile("wbinvd":::"memory")
413 #define WRITE_COMBINE_FLUSH() asm volatile("sfence":::"memory")
414 #elif defined(NVCPU_AARCH64)
415     static inline void nv_flush_cache_cpu(void *info)
416     {
417         if (!nvos_is_chipset_io_coherent())
418         {
419 #if defined(NV_FLUSH_CACHE_ALL_PRESENT)
420             flush_cache_all();
421 #else
422             WARN_ONCE(0, "NVRM: kernel does not support flush_cache_all()\n");
423 #endif
424         }
425     }
426 #define CACHE_FLUSH()            nv_flush_cache_cpu(NULL)
427 #define CACHE_FLUSH_ALL()        on_each_cpu(nv_flush_cache_cpu, NULL, 1)
428 #define WRITE_COMBINE_FLUSH()    mb()
429 #elif defined(NVCPU_PPC64LE)
430 #define CACHE_FLUSH()            asm volatile("sync;  \n" \
431                                               "isync; \n" ::: "memory")
432 #define WRITE_COMBINE_FLUSH()    CACHE_FLUSH()
433 #endif
434 
435 typedef enum
436 {
437     NV_MEMORY_TYPE_SYSTEM,      /* Memory mapped for ROM, SBIOS and physical RAM. */
438     NV_MEMORY_TYPE_REGISTERS,
439     NV_MEMORY_TYPE_FRAMEBUFFER,
    NV_MEMORY_TYPE_DEVICE_MMIO, /* All kinds of MMIO referenced by NVRM, e.g. BARs and MCFG of the device */
441 } nv_memory_type_t;
442 
443 #if defined(NVCPU_AARCH64) || defined(NVCPU_PPC64LE)
444 #define NV_ALLOW_WRITE_COMBINING(mt)    1
445 #elif defined(NVCPU_X86_64)
446 #if defined(NV_ENABLE_PAT_SUPPORT)
447 #define NV_ALLOW_WRITE_COMBINING(mt)    \
448     ((nv_pat_mode != NV_PAT_MODE_DISABLED) && \
449      ((mt) != NV_MEMORY_TYPE_REGISTERS))
450 #else
451 #define NV_ALLOW_WRITE_COMBINING(mt)    0
452 #endif
453 #endif
454 
455 #if !defined(IRQF_SHARED)
456 #define IRQF_SHARED SA_SHIRQ
457 #endif
458 
459 #define NV_MAX_RECURRING_WARNING_MESSAGES 10
460 
461 /* various memory tracking/debugging techniques
462  * disabled for retail builds, enabled for debug builds
463  */
464 
465 // allow an easy way to convert all debug printfs related to memory
466 // management back and forth between 'info' and 'errors'
467 #if defined(NV_DBG_MEM)
468 #define NV_DBG_MEMINFO NV_DBG_ERRORS
469 #else
470 #define NV_DBG_MEMINFO NV_DBG_INFO
471 #endif
472 
473 #define NV_MEM_TRACKING_PAD_SIZE(size) \
    (size) = NV_ALIGN_UP(((size) + sizeof(void *)), sizeof(void *))
475 
476 #define NV_MEM_TRACKING_HIDE_SIZE(ptr, size)            \
477     if ((ptr != NULL) && (*(ptr) != NULL))              \
478     {                                                   \
479         NvU8 *__ptr;                                    \
480         *(unsigned long *) *(ptr) = (size);             \
481         __ptr = *(ptr); __ptr += sizeof(void *);        \
482         *(ptr) = (void *) __ptr;                        \
483     }
484 #define NV_MEM_TRACKING_RETRIEVE_SIZE(ptr, size)        \
485     {                                                   \
486         NvU8 *__ptr = (ptr); __ptr -= sizeof(void *);   \
487         (ptr) = (void *) __ptr;                         \
488         (size) = *(unsigned long *) (ptr);              \
489     }
490 
491 /* keep track of memory usage */
492 #include "nv-memdbg.h"
493 
494 static inline void *nv_vmalloc(unsigned long size)
495 {
496 #if defined(NV_VMALLOC_HAS_PGPROT_T_ARG)
497     void *ptr = __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
498 #else
499     void *ptr = __vmalloc(size, GFP_KERNEL);
500 #endif
501     if (ptr)
502         NV_MEMDBG_ADD(ptr, size);
503     return ptr;
504 }
505 
506 static inline void nv_vfree(void *ptr, NvU64 size)
507 {
508     NV_MEMDBG_REMOVE(ptr, size);
509     vfree(ptr);
510 }
511 
512 static inline void *nv_ioremap(NvU64 phys, NvU64 size)
513 {
514     void *ptr = ioremap(phys, size);
515     if (ptr)
516         NV_MEMDBG_ADD(ptr, size);
517     return ptr;
518 }
519 
520 static inline void *nv_ioremap_nocache(NvU64 phys, NvU64 size)
521 {
522     return nv_ioremap(phys, size);
523 }
524 
525 static inline void *nv_ioremap_cache(NvU64 phys, NvU64 size)
526 {
527 #if defined(NV_IOREMAP_CACHE_PRESENT)
528     void *ptr = ioremap_cache(phys, size);
529     if (ptr)
530         NV_MEMDBG_ADD(ptr, size);
531     return ptr;
532 #elif defined(NVCPU_PPC64LE)
533     //
    // ioremap_cache() has only been implemented correctly for ppc64le with
535     // commit f855b2f544d6 in April 2017 (kernel 4.12+). Internally, the kernel
536     // does provide a default implementation of ioremap_cache() that would be
537     // incorrect for our use (creating an uncached mapping) before the
538     // referenced commit, but that implementation is not exported and the
539     // NV_IOREMAP_CACHE_PRESENT conftest doesn't pick it up, and we end up in
540     // this #elif branch.
541     //
    // At the same time, ppc64le has supported ioremap_prot() since May 2011
543     // (commit 40f1ce7fb7e8, kernel 3.0+) and that covers all kernels we
544     // support on power.
545     //
546     void *ptr = ioremap_prot(phys, size, pgprot_val(PAGE_KERNEL));
547     if (ptr)
548         NV_MEMDBG_ADD(ptr, size);
549     return ptr;
550 #else
551     return nv_ioremap(phys, size);
552 #endif
553 }
554 
555 static inline void *nv_ioremap_wc(NvU64 phys, NvU64 size)
556 {
557 #if defined(NV_IOREMAP_WC_PRESENT)
558     void *ptr = ioremap_wc(phys, size);
559     if (ptr)
560         NV_MEMDBG_ADD(ptr, size);
561     return ptr;
562 #else
563     return nv_ioremap_nocache(phys, size);
564 #endif
565 }
566 
567 static inline void nv_iounmap(void *ptr, NvU64 size)
568 {
569     NV_MEMDBG_REMOVE(ptr, size);
570     iounmap(ptr);
571 }
572 
static inline NvBool nv_numa_node_has_memory(int node_id)
574 {
575     if (node_id < 0 || node_id >= MAX_NUMNODES)
576         return NV_FALSE;
577     return node_state(node_id, N_MEMORY) ? NV_TRUE : NV_FALSE;
578 }
579 
580 #define NV_KMALLOC(ptr, size) \
581     { \
582         (ptr) = kmalloc(size, NV_GFP_KERNEL); \
583         if (ptr) \
584             NV_MEMDBG_ADD(ptr, size); \
585     }
586 
587 #define NV_KZALLOC(ptr, size) \
588     { \
589         (ptr) = kzalloc(size, NV_GFP_KERNEL); \
590         if (ptr) \
591             NV_MEMDBG_ADD(ptr, size); \
592     }
593 
594 #define NV_KMALLOC_ATOMIC(ptr, size) \
595     { \
596         (ptr) = kmalloc(size, NV_GFP_ATOMIC); \
597         if (ptr) \
598             NV_MEMDBG_ADD(ptr, size); \
599     }
600 
601 #if defined(__GFP_RETRY_MAYFAIL)
602 #define NV_GFP_NO_OOM (NV_GFP_KERNEL | __GFP_RETRY_MAYFAIL)
603 #elif defined(__GFP_NORETRY)
604 #define NV_GFP_NO_OOM (NV_GFP_KERNEL | __GFP_NORETRY)
605 #else
606 #define NV_GFP_NO_OOM (NV_GFP_KERNEL)
607 #endif
608 
609 #define NV_KMALLOC_NO_OOM(ptr, size) \
610     { \
611         (ptr) = kmalloc(size, NV_GFP_NO_OOM); \
612         if (ptr) \
613             NV_MEMDBG_ADD(ptr, size); \
614     }
615 
616 #define NV_KFREE(ptr, size) \
617     { \
618         NV_MEMDBG_REMOVE(ptr, size); \
619         kfree((void *) (ptr)); \
620     }
621 
622 #define NV_ALLOC_PAGES_NODE(ptr, nid, order, gfp_mask) \
623     { \
624         (ptr) = (unsigned long)page_address(alloc_pages_node(nid, gfp_mask, order)); \
625     }
626 
627 #define NV_GET_FREE_PAGES(ptr, order, gfp_mask)      \
628     {                                                \
629         (ptr) = __get_free_pages(gfp_mask, order);   \
630     }
631 
632 #define NV_FREE_PAGES(ptr, order)                    \
633     {                                                \
634         free_pages(ptr, order);                      \
635     }
636 
637 extern NvU64 nv_shared_gpa_boundary;
638 
639 static inline pgprot_t nv_adjust_pgprot(pgprot_t vm_prot, NvU32 extra)
640 {
641     pgprot_t prot = __pgprot(pgprot_val(vm_prot) | extra);
642 #if defined(CONFIG_AMD_MEM_ENCRYPT) && defined(NV_PGPROT_DECRYPTED_PRESENT)
643     /*
644      * When AMD memory encryption is enabled, device memory mappings with the
645      * C-bit set read as 0xFF, so ensure the bit is cleared for user mappings.
646      *
647      * If cc_mkdec() is present, then pgprot_decrypted() can't be used.
648      */
649 #if defined(NV_CC_MKDEC_PRESENT)
650     if (nv_shared_gpa_boundary != 0)
651     {
652         /*
653          * By design, a VM using vTOM doesn't see the SEV setting and
654          * for AMD with vTOM, *set* means decrypted.
655          */
        prot = __pgprot(nv_shared_gpa_boundary | (pgprot_val(vm_prot)));
657     }
658     else
659     {
        prot = __pgprot(__sme_clr(pgprot_val(vm_prot)));
661     }
662 #else
663     prot = pgprot_decrypted(prot);
664 #endif
665 #endif
666 
667     return prot;
668 }
669 
670 #if defined(PAGE_KERNEL_NOENC)
671 #if defined(__pgprot_mask)
672 #define NV_PAGE_KERNEL_NOCACHE_NOENC __pgprot_mask(__PAGE_KERNEL_NOCACHE)
673 #elif defined(default_pgprot)
674 #define NV_PAGE_KERNEL_NOCACHE_NOENC default_pgprot(__PAGE_KERNEL_NOCACHE)
#elif defined(__pgprot)
676 #define NV_PAGE_KERNEL_NOCACHE_NOENC __pgprot(__PAGE_KERNEL_NOCACHE)
677 #else
678 #error "Unsupported kernel!!!"
679 #endif
680 #endif
681 
682 static inline NvUPtr nv_vmap(struct page **pages, NvU32 page_count,
683                              NvBool cached, NvBool unencrypted)
684 {
685     void *ptr;
686     pgprot_t prot = PAGE_KERNEL;
687 #if defined(NVCPU_X86_64)
688 #if defined(PAGE_KERNEL_NOENC)
689     if (unencrypted)
690     {
691         prot = cached ? nv_adjust_pgprot(PAGE_KERNEL_NOENC, 0) :
692                         nv_adjust_pgprot(NV_PAGE_KERNEL_NOCACHE_NOENC, 0);
693     }
694     else
695 #endif
696     {
697         prot = cached ? PAGE_KERNEL : PAGE_KERNEL_NOCACHE;
698     }
699 #elif defined(NVCPU_AARCH64)
700     prot = cached ? PAGE_KERNEL : NV_PGPROT_UNCACHED(PAGE_KERNEL);
701 #endif
702     /* All memory cached in PPC64LE; can't honor 'cached' input. */
703     ptr = vmap(pages, page_count, VM_MAP, prot);
704     if (ptr)
705         NV_MEMDBG_ADD(ptr, page_count * PAGE_SIZE);
706     return (NvUPtr)ptr;
707 }
708 
709 static inline void nv_vunmap(NvUPtr vaddr, NvU32 page_count)
710 {
711     vunmap((void *)vaddr);
712     NV_MEMDBG_REMOVE((void *)vaddr, page_count * PAGE_SIZE);
713 }
714 
715 #if defined(NV_GET_NUM_PHYSPAGES_PRESENT)
716 #define NV_NUM_PHYSPAGES                get_num_physpages()
717 #else
718 #define NV_NUM_PHYSPAGES                num_physpages
719 #endif
720 #define NV_GET_CURRENT_PROCESS()        current->tgid
721 #define NV_IN_ATOMIC()                  in_atomic()
722 #define NV_LOCAL_BH_DISABLE()           local_bh_disable()
723 #define NV_LOCAL_BH_ENABLE()            local_bh_enable()
724 #define NV_COPY_TO_USER(to, from, n)    copy_to_user(to, from, n)
725 #define NV_COPY_FROM_USER(to, from, n)  copy_from_user(to, from, n)
726 
727 #define NV_IS_SUSER()                   capable(CAP_SYS_ADMIN)
728 #define NV_PCI_DEVICE_NAME(pci_dev)     ((pci_dev)->pretty_name)
729 #define NV_CLI()                        local_irq_disable()
730 #define NV_SAVE_FLAGS(eflags)           local_save_flags(eflags)
731 #define NV_RESTORE_FLAGS(eflags)        local_irq_restore(eflags)
732 #define NV_MAY_SLEEP()                  (!irqs_disabled() && !in_interrupt() && !NV_IN_ATOMIC())
733 #define NV_MODULE_PARAMETER(x)          module_param(x, int, 0)
734 #define NV_MODULE_STRING_PARAMETER(x)   module_param(x, charp, 0)
735 #undef  MODULE_PARM
736 
737 #define NV_NUM_CPUS()                   num_possible_cpus()
738 
739 static inline dma_addr_t nv_phys_to_dma(struct device *dev, NvU64 pa)
740 {
741 #if defined(NV_PHYS_TO_DMA_PRESENT)
742     return phys_to_dma(dev, pa);
743 #elif defined(NV_XEN_SUPPORT_FULLY_VIRTUALIZED_KERNEL)
744     return phys_to_machine(pa);
745 #else
746     return (dma_addr_t)pa;
747 #endif
748 }
749 
750 #define NV_GET_PAGE_STRUCT(phys_page) virt_to_page(__va(phys_page))
751 #define NV_VMA_PGOFF(vma)             ((vma)->vm_pgoff)
752 #define NV_VMA_SIZE(vma)              ((vma)->vm_end - (vma)->vm_start)
753 #define NV_VMA_OFFSET(vma)            (((NvU64)(vma)->vm_pgoff) << PAGE_SHIFT)
754 #define NV_VMA_PRIVATE(vma)           ((vma)->vm_private_data)
755 #define NV_VMA_FILE(vma)              ((vma)->vm_file)
756 
757 #define NV_DEVICE_MINOR_NUMBER(x)     minor((x)->i_rdev)
758 #define NV_CONTROL_DEVICE_MINOR       255
759 
760 #define NV_PCI_DISABLE_DEVICE(pci_dev)                           \
761     {                                                            \
762         NvU16 __cmd[2];                                          \
763         pci_read_config_word((pci_dev), PCI_COMMAND, &__cmd[0]); \
764         pci_disable_device(pci_dev);                             \
765         pci_read_config_word((pci_dev), PCI_COMMAND, &__cmd[1]); \
766         __cmd[1] |= PCI_COMMAND_MEMORY;                          \
767         pci_write_config_word((pci_dev), PCI_COMMAND,            \
768                 (__cmd[1] | (__cmd[0] & PCI_COMMAND_IO)));       \
769     }
770 
771 #define NV_PCI_RESOURCE_START(pci_dev, bar) pci_resource_start(pci_dev, (bar))
772 #define NV_PCI_RESOURCE_SIZE(pci_dev, bar)  pci_resource_len(pci_dev, (bar))
773 #define NV_PCI_RESOURCE_FLAGS(pci_dev, bar) pci_resource_flags(pci_dev, (bar))
774 
775 #define NV_PCI_RESOURCE_VALID(pci_dev, bar)                                     \
776     ((NV_PCI_RESOURCE_START(pci_dev, bar) != 0) &&                              \
777      (NV_PCI_RESOURCE_SIZE(pci_dev, bar) != 0))
778 
779 #define NV_PCI_DOMAIN_NUMBER(pci_dev) (NvU32)pci_domain_nr(pci_dev->bus)
780 #define NV_PCI_BUS_NUMBER(pci_dev)    (pci_dev)->bus->number
781 #define NV_PCI_DEVFN(pci_dev)         (pci_dev)->devfn
782 #define NV_PCI_SLOT_NUMBER(pci_dev)   PCI_SLOT(NV_PCI_DEVFN(pci_dev))
783 
784 #if defined(CONFIG_X86_UV) && defined(NV_CONFIG_X86_UV)
785 #define NV_GET_DOMAIN_BUS_AND_SLOT(domain,bus,devfn)                        \
786    ({                                                                       \
787         struct pci_dev *__dev = NULL;                                       \
788         while ((__dev = pci_get_device(PCI_VENDOR_ID_NVIDIA,                \
789                     PCI_ANY_ID, __dev)) != NULL)                            \
790         {                                                                   \
791             if ((NV_PCI_DOMAIN_NUMBER(__dev) == domain) &&                  \
792                 (NV_PCI_BUS_NUMBER(__dev) == bus) &&                        \
793                 (NV_PCI_DEVFN(__dev) == devfn))                             \
794             {                                                               \
795                 break;                                                      \
796             }                                                               \
797         }                                                                   \
798         if (__dev == NULL)                                                  \
799         {                                                                   \
800             while ((__dev = pci_get_class((PCI_CLASS_BRIDGE_HOST << 8),     \
801                         __dev)) != NULL)                                    \
802             {                                                               \
803                 if ((NV_PCI_DOMAIN_NUMBER(__dev) == domain) &&              \
804                     (NV_PCI_BUS_NUMBER(__dev) == bus) &&                    \
805                     (NV_PCI_DEVFN(__dev) == devfn))                         \
806                 {                                                           \
807                     break;                                                  \
808                 }                                                           \
809             }                                                               \
810         }                                                                   \
811         if (__dev == NULL)                                                  \
812         {                                                                   \
813             while ((__dev = pci_get_class((PCI_CLASS_BRIDGE_PCI << 8),      \
814                         __dev)) != NULL)                                    \
815             {                                                               \
816                 if ((NV_PCI_DOMAIN_NUMBER(__dev) == domain) &&              \
817                     (NV_PCI_BUS_NUMBER(__dev) == bus) &&                    \
818                     (NV_PCI_DEVFN(__dev) == devfn))                         \
819                 {                                                           \
820                     break;                                                  \
821                 }                                                           \
822             }                                                               \
823         }                                                                   \
824         if (__dev == NULL)                                                  \
825         {                                                                   \
826             while ((__dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID,          \
827                             __dev)) != NULL)                                \
828             {                                                               \
829                 if ((NV_PCI_DOMAIN_NUMBER(__dev) == domain) &&              \
830                     (NV_PCI_BUS_NUMBER(__dev) == bus) &&                    \
831                     (NV_PCI_DEVFN(__dev) == devfn))                         \
832                 {                                                           \
833                     break;                                                  \
834                 }                                                           \
835             }                                                               \
836         }                                                                   \
837         __dev;                                                              \
838     })
839 #elif defined(NV_PCI_GET_DOMAIN_BUS_AND_SLOT_PRESENT)
840 #define NV_GET_DOMAIN_BUS_AND_SLOT(domain,bus, devfn) \
841     pci_get_domain_bus_and_slot(domain, bus, devfn)
842 #else
843 #define NV_GET_DOMAIN_BUS_AND_SLOT(domain,bus,devfn)               \
844    ({                                                              \
845         struct pci_dev *__dev = NULL;                              \
846         while ((__dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID,     \
847                     __dev)) != NULL)                               \
848         {                                                          \
849             if ((NV_PCI_DOMAIN_NUMBER(__dev) == domain) &&         \
850                 (NV_PCI_BUS_NUMBER(__dev) == bus) &&               \
851                 (NV_PCI_DEVFN(__dev) == devfn))                    \
852             {                                                      \
853                 break;                                             \
854             }                                                      \
855         }                                                          \
856         __dev;                                                     \
857     })
858 #endif
859 
860 #if defined(NV_PCI_STOP_AND_REMOVE_BUS_DEVICE_PRESENT)  // introduced in 3.18-rc1 for aarch64
861 #define NV_PCI_STOP_AND_REMOVE_BUS_DEVICE(pci_dev) pci_stop_and_remove_bus_device(pci_dev)
862 #endif
863 
864 #define NV_PRINT_AT(nv_debug_level,at)                                           \
865     {                                                                            \
866         nv_printf(nv_debug_level,                                                \
867             "NVRM: VM: %s:%d: 0x%p, %d page(s), count = %d, flags = 0x%08x, "    \
868             "page_table = 0x%p\n",  __FUNCTION__, __LINE__, at,                  \
869             at->num_pages, NV_ATOMIC_READ(at->usage_count),                      \
870             at->flags, at->page_table);                                          \
871     }
872 
873 #define NV_PRINT_VMA(nv_debug_level,vma)                                                 \
874     {                                                                                    \
875         nv_printf(nv_debug_level,                                                        \
876             "NVRM: VM: %s:%d: 0x%lx - 0x%lx, 0x%08x bytes @ 0x%016llx, 0x%p, 0x%p\n",    \
877             __FUNCTION__, __LINE__, vma->vm_start, vma->vm_end, NV_VMA_SIZE(vma),        \
878             NV_VMA_OFFSET(vma), NV_VMA_PRIVATE(vma), NV_VMA_FILE(vma));                  \
879     }
880 
881 #ifndef minor
882 # define minor(x) MINOR(x)
883 #endif
884 
885 #if defined(cpu_relax)
886 #define NV_CPU_RELAX() cpu_relax()
887 #else
888 #define NV_CPU_RELAX() barrier()
889 #endif
890 
891 #ifndef IRQ_RETVAL
892 typedef void irqreturn_t;
893 #define IRQ_RETVAL(a)
894 #endif
895 
896 #if !defined(PCI_COMMAND_SERR)
897 #define PCI_COMMAND_SERR            0x100
898 #endif
899 #if !defined(PCI_COMMAND_INTX_DISABLE)
900 #define PCI_COMMAND_INTX_DISABLE    0x400
901 #endif
902 
903 #ifndef PCI_CAP_ID_EXP
904 #define PCI_CAP_ID_EXP 0x10
905 #endif
906 
907 /*
 * On PPC64LE Linux, enable basic support for Linux PCI error recovery (see
909  * Documentation/PCI/pci-error-recovery.txt). Currently RM only supports error
910  * notification and data collection, not actual recovery of the device.
911  */
912 #if defined(NVCPU_PPC64LE) && defined(CONFIG_EEH)
913 #include <asm/eeh.h>
914 #define NV_PCI_ERROR_RECOVERY
915 #endif
916 
917 /*
918  * If the host OS has page sizes larger than 4KB, we may have a security
919  * problem. Registers are typically grouped in 4KB pages, but if there are
920  * larger pages, then the smallest userspace mapping possible (e.g., a page)
921  * may give more access than intended to the user.
922  */
923 #define NV_4K_PAGE_ISOLATION_REQUIRED(addr, size)                       \
924     ((PAGE_SIZE > NV_RM_PAGE_SIZE) &&                                   \
925      ((size) <= NV_RM_PAGE_SIZE) &&                                     \
926      (((addr) >> NV_RM_PAGE_SHIFT) ==                                   \
927         (((addr) + (size) - 1) >> NV_RM_PAGE_SHIFT)))
928 
929 /*
930  * The kernel may have a workaround for this, by providing a method to isolate
931  * a single 4K page in a given mapping.
932  */
933 #if (PAGE_SIZE > NV_RM_PAGE_SIZE) && defined(NVCPU_PPC64LE) && defined(NV_PAGE_4K_PFN)
934     #define NV_4K_PAGE_ISOLATION_PRESENT
935     #define NV_4K_PAGE_ISOLATION_MMAP_ADDR(addr)                        \
936         ((NvP64)((void*)(((addr) >> NV_RM_PAGE_SHIFT) << PAGE_SHIFT)))
937     #define NV_4K_PAGE_ISOLATION_MMAP_LEN(size)     PAGE_SIZE
938     #define NV_4K_PAGE_ISOLATION_ACCESS_START(addr)                     \
939         ((NvP64)((void*)((addr) & ~NV_RM_PAGE_MASK)))
940     #define NV_4K_PAGE_ISOLATION_ACCESS_LEN(addr, size)                 \
        ((((addr) & NV_RM_PAGE_MASK) + (size) + NV_RM_PAGE_MASK) &      \
942          ~NV_RM_PAGE_MASK)
943     #define NV_PROT_4K_PAGE_ISOLATION NV_PAGE_4K_PFN
944 #endif
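
/*
 * Worked example (illustrative, assuming 64K kernel pages and 4K RM pages):
 * a request with addr = 0x1000 and size = 0x1000 satisfies
 * NV_4K_PAGE_ISOLATION_REQUIRED(), because the entire range falls within a
 * single RM page. NV_4K_PAGE_ISOLATION_ACCESS_START() then yields 0x1000 and
 * NV_4K_PAGE_ISOLATION_ACCESS_LEN() yields 0x1000, narrowing access back to
 * the single 4K RM page even though the mmap()ed window spans a full 64K OS
 * page.
 */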
945 
946 static inline int nv_remap_page_range(struct vm_area_struct *vma,
947     unsigned long virt_addr, NvU64 phys_addr, NvU64 size, pgprot_t prot)
948 {
949     int ret = -1;
950 
951 #if defined(NV_4K_PAGE_ISOLATION_PRESENT) && defined(NV_PROT_4K_PAGE_ISOLATION)
952     if ((size == PAGE_SIZE) &&
953         ((pgprot_val(prot) & NV_PROT_4K_PAGE_ISOLATION) != 0))
954     {
955         /*
956          * remap_4k_pfn() hardcodes the length to a single OS page, and checks
957          * whether applying the page isolation workaround will cause PTE
958          * corruption (in which case it will fail, and this is an unsupported
959          * configuration).
960          */
961 #if defined(NV_HASH__REMAP_4K_PFN_PRESENT)
962         ret = hash__remap_4k_pfn(vma, virt_addr, (phys_addr >> PAGE_SHIFT), prot);
963 #else
964         ret = remap_4k_pfn(vma, virt_addr, (phys_addr >> PAGE_SHIFT), prot);
965 #endif
966     }
967     else
968 #endif
969     {
970         ret = remap_pfn_range(vma, virt_addr, (phys_addr >> PAGE_SHIFT), size,
971             prot);
972     }
973 
974     return ret;
975 }
976 
977 static inline int nv_io_remap_page_range(struct vm_area_struct *vma,
978     NvU64 phys_addr, NvU64 size, NvU32 extra_prot)
979 {
980     int ret = -1;
981 #if !defined(NV_XEN_SUPPORT_FULLY_VIRTUALIZED_KERNEL)
982     ret = nv_remap_page_range(vma, vma->vm_start, phys_addr, size,
983         nv_adjust_pgprot(vma->vm_page_prot, extra_prot));
984 #else
985     ret = io_remap_pfn_range(vma, vma->vm_start, (phys_addr >> PAGE_SHIFT),
986         size, nv_adjust_pgprot(vma->vm_page_prot, extra_prot));
987 #endif
988     return ret;
989 }
990 
991 static inline vm_fault_t nv_insert_pfn(struct vm_area_struct *vma,
992     NvU64 virt_addr, NvU64 pfn, NvU32 extra_prot)
993 {
994     /*
995      * vm_insert_pfn{,_prot} replaced with vmf_insert_pfn{,_prot} in Linux 4.20
996      */
997 #if defined(NV_VMF_INSERT_PFN_PROT_PRESENT)
998     return vmf_insert_pfn_prot(vma, virt_addr, pfn,
999              __pgprot(pgprot_val(vma->vm_page_prot) | extra_prot));
1000 #else
1001     int ret = -EINVAL;
1002     /*
1003      * Only PPC64LE (NV_4K_PAGE_ISOLATION_PRESENT) requires extra_prot to be
1004      * used when remapping.
1005      *
1006      * vm_insert_pfn_prot() was added in Linux 4.4, whereas POWER9 support
1007      * was added in Linux 4.8.
1008      *
1009      * Rather than tampering with the vma to make use of extra_prot with
1010      * vm_insert_pfn() on older kernels, for now, just fail in this case, as
1011      * it's not expected to be used currently.
1012      */
1013 #if defined(NV_VM_INSERT_PFN_PROT_PRESENT)
1014     ret = vm_insert_pfn_prot(vma, virt_addr, pfn,
1015         __pgprot(pgprot_val(vma->vm_page_prot) | extra_prot));
1016 #elif !defined(NV_4K_PAGE_ISOLATION_PRESENT)
1017     ret = vm_insert_pfn(vma, virt_addr, pfn);
1018 #endif
1019     switch (ret)
1020     {
1021         case 0:
1022         case -EBUSY:
1023             /*
1024              * EBUSY indicates that another thread already handled
1025              * the faulted range.
1026              */
1027             return VM_FAULT_NOPAGE;
1028         case -ENOMEM:
1029             return VM_FAULT_OOM;
1030         default:
1031             break;
1032     }
1033 #endif /* defined(NV_VMF_INSERT_PFN_PROT_PRESENT) */
1034     return VM_FAULT_SIGBUS;
1035 }
1036 
/* Converts an NV BAR index to the corresponding Linux PCI BAR index */
1038 static inline NvU8 nv_bar_index_to_os_bar_index
1039 (
1040     struct pci_dev *dev,
1041     NvU8 nv_bar_index
1042 )
1043 {
1044     NvU8 bar_index = 0;
1045     NvU8 i;
1046 
1047     BUG_ON(nv_bar_index >= NV_GPU_NUM_BARS);
1048 
1049     for (i = 0; i < nv_bar_index; i++)
1050     {
1051         if (NV_PCI_RESOURCE_FLAGS(dev, bar_index) & PCI_BASE_ADDRESS_MEM_TYPE_64)
1052         {
1053             bar_index += 2;
1054         }
1055         else
1056         {
1057             bar_index++;
1058         }
1059     }
1060 
1061     return bar_index;
1062 }
1063 
1064 #define NV_PAGE_MASK    (NvU64)(long)PAGE_MASK
1065 
1066 extern void *nvidia_stack_t_cache;
1067 
1068 /*
 * On Linux, when a kmem cache is created, a new sysfs entry is created for
 * it unless it is merged with an existing cache. Upstream Linux kernel commit
1071  * 3b7b314053d021601940c50b07f5f1423ae67e21 (version 4.12+) made cache
1072  * destruction asynchronous which creates a race between cache destroy and
1073  * create. A new cache created with attributes as a previous cache, which is
1074  * scheduled for destruction, can try to create a sysfs entry with the same
1075  * conflicting name. Upstream Linux kernel commit
1076  * d50d82faa0c964e31f7a946ba8aba7c715ca7ab0 (4.18) fixes this issue by cleaning
1077  * up sysfs entry within slab_mutex, so the entry is deleted before a cache with
1078  * the same attributes could be created.
1079  *
1080  * To workaround this kernel issue, we take two steps:
1081  * - Create unmergeable caches: a kmem_cache with a constructor is unmergeable.
 *   So, we define an empty constructor for it. Creating an unmergeable
1083  *   cache ensures that the kernel doesn't generate an internal name and always
1084  *   uses our name instead.
1085  *
1086  * - Generate a unique cache name by appending the current timestamp (ns). We
1087  *   wait for the timestamp to increment by at least one to ensure that we do
 *   not hit a name conflict in a cache create -> destroy (async) -> create cycle.
1089  */
1090 #if defined(NV_KMEM_CACHE_HAS_KOBJ_REMOVE_WORK) && !defined(NV_SYSFS_SLAB_UNLINK_PRESENT)
1091 static inline void nv_kmem_ctor_dummy(void *arg)
1092 {
1093     (void)arg;
1094 }
1095 #else
1096 #define nv_kmem_ctor_dummy NULL
1097 #endif
1098 
1099 #define NV_KMEM_CACHE_CREATE(name, type)    \
1100     nv_kmem_cache_create(name, sizeof(type), 0)
1101 
1102 /* The NULL pointer check is required for kernels older than 4.3 */
1103 #define NV_KMEM_CACHE_DESTROY(kmem_cache)   \
1104     if (kmem_cache != NULL)                 \
1105     {                                       \
1106         kmem_cache_destroy(kmem_cache);     \
1107     }
1108 
1109 #define NV_KMEM_CACHE_ALLOC(kmem_cache)     \
1110     kmem_cache_alloc(kmem_cache, GFP_KERNEL)
1111 #define NV_KMEM_CACHE_FREE(ptr, kmem_cache) \
1112     kmem_cache_free(kmem_cache, ptr)
1113 
1114 static inline void *nv_kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
1115 {
1116 #if defined(NV_KMEM_CACHE_HAS_KOBJ_REMOVE_WORK) && !defined(NV_SYSFS_SLAB_UNLINK_PRESENT)
1117     /*
1118      * We cannot call kmem_cache_zalloc directly as it adds the __GFP_ZERO
1119      * flag. This flag together with the presence of a slab constructor is
1120      * flagged as a potential bug by the Linux kernel since it is the role
1121      * of a constructor to fill an allocated object with the desired
1122      * pattern. In our case, we specified a (dummy) constructor as a
1123      * workaround for a bug and not to zero-initialize objects. So, we take
1124      * the pain here to memset allocated object ourselves.
1125      */
1126     void *object = kmem_cache_alloc(k, flags);
1127     if (object)
1128         memset(object, 0, kmem_cache_size(k));
1129     return object;
1130 #else
1131     return kmem_cache_zalloc(k, flags);
1132 #endif
1133 }
1134 
1135 static inline int nv_kmem_cache_alloc_stack(nvidia_stack_t **stack)
1136 {
1137     nvidia_stack_t *sp = NULL;
1138 #if defined(NVCPU_X86_64)
1139     if (rm_is_altstack_in_use())
1140     {
1141         sp = NV_KMEM_CACHE_ALLOC(nvidia_stack_t_cache);
1142         if (sp == NULL)
1143             return -ENOMEM;
1144         sp->size = sizeof(sp->stack);
1145         sp->top = sp->stack + sp->size;
1146     }
1147 #endif
1148     *stack = sp;
1149     return 0;
1150 }
1151 
1152 static inline void nv_kmem_cache_free_stack(nvidia_stack_t *stack)
1153 {
1154 #if defined(NVCPU_X86_64)
1155     if (stack != NULL && rm_is_altstack_in_use())
1156     {
1157         NV_KMEM_CACHE_FREE(stack, nvidia_stack_t_cache);
1158     }
1159 #endif
1160 }
1161 
1162 #if defined(NVCPU_X86_64)
1163 /*
 * RAM is cached on Linux by default, so we can assume there's
1165  * nothing to be done here. This is not the case for the
1166  * other memory spaces: we will have made an attempt to add
1167  * a WC MTRR for the frame buffer.
1168  *
1169  * If a WC MTRR is present, we can't satisfy the WB mapping
1170  * attempt here, since the achievable effective memory
1171  * types in that case are WC and UC, if not it's typically
1172  * UC (MTRRdefType is UC); we could only satisfy WB mapping
1173  * requests with a WB MTRR.
1174  */
1175 #define NV_ALLOW_CACHING(mt)            ((mt) == NV_MEMORY_TYPE_SYSTEM)
1176 #else
1177 #define NV_ALLOW_CACHING(mt)            ((mt) != NV_MEMORY_TYPE_REGISTERS)
1178 #endif
1179 
1180 typedef struct nvidia_pte_s {
1181     NvU64           phys_addr;
1182     unsigned long   virt_addr;
1183     NvU64           dma_addr;
1184 #ifdef CONFIG_XEN
1185     unsigned int    guest_pfn;
1186 #endif
1187     unsigned int    page_count;
1188 } nvidia_pte_t;
1189 
1190 typedef struct nv_alloc_s {
1191     struct nv_alloc_s *next;
1192     struct device     *dev;
1193     atomic_t       usage_count;
1194     struct {
1195         NvBool contig      : 1;
1196         NvBool guest       : 1;
1197         NvBool zeroed      : 1;
1198         NvBool aliased     : 1;
1199         NvBool user        : 1;
1200         NvBool node        : 1;
1201         NvBool peer_io     : 1;
1202         NvBool physical    : 1;
1203         NvBool unencrypted : 1;
1204         NvBool coherent    : 1;
1205     } flags;
1206     unsigned int   cache_type;
1207     unsigned int   num_pages;
1208     unsigned int   order;
1209     unsigned int   size;
1210     nvidia_pte_t **page_table;          /* list of physical pages allocated */
1211     unsigned int   pid;
1212     struct page  **user_pages;
1213     NvU64         guest_id;             /* id of guest VM */
1214     NvS32         node_id;              /* Node id for memory allocation when node is set in flags */
1215     void          *import_priv;
1216     struct sg_table *import_sgt;
1217 } nv_alloc_t;
1218 
1219 /**
 * nv_is_dma_direct - return true if dma_direct is in use
 *
 * Starting with the 5.0 kernel, SWIOTLB is merged into
 * dma_direct, so systems without an IOMMU use dma_direct.  We
1224  * need to know if this is the case, so that we can use a
1225  * different check for SWIOTLB enablement.
1226  */
1227 static inline NvBool nv_is_dma_direct(struct device *dev)
1228 {
1229     NvBool is_direct = NV_FALSE;
1230 
1231 #if defined(NV_DMA_IS_DIRECT_PRESENT)
1232     if (dma_is_direct(get_dma_ops(dev)))
1233         is_direct = NV_TRUE;
1234 #endif
1235 
1236     return is_direct;
1237 }
1238 
1239 /**
1240  * nv_dma_maps_swiotlb - return NV_TRUE if swiotlb is enabled
1241  *
1242  * SWIOTLB creates bounce buffers for the DMA mapping layer to
1243  * use if a driver asks the kernel to map a DMA buffer that is
1244  * outside of the device's addressable range.  The driver does
1245  * not function correctly if bounce buffers are enabled for the
1246  * device.  So if SWIOTLB is enabled, we should avoid making
1247  * mapping calls.
1248  */
1249 static inline NvBool
1250 nv_dma_maps_swiotlb(struct device *dev)
1251 {
1252     NvBool swiotlb_in_use = NV_FALSE;
1253 #if defined(CONFIG_SWIOTLB)
1254   #if defined(NV_DMA_OPS_PRESENT) || defined(NV_GET_DMA_OPS_PRESENT) || \
1255       defined(NV_SWIOTLB_DMA_OPS_PRESENT)
1256     /*
1257      * We only use the 'dma_ops' symbol on older x86_64 kernels; later kernels,
1258      * including those for other architectures, have converged on the
1259      * get_dma_ops() interface.
1260      */
1261     #if defined(NV_GET_DMA_OPS_PRESENT)
1262     /*
1263      * The __attribute__ ((unused)) is necessary because in at least one
1264      * case, *none* of the preprocessor branches below are taken, and
1265      * so the ops variable ends up never being referred to at all. This can
1266      * happen with the (NV_IS_EXPORT_SYMBOL_PRESENT_swiotlb_map_sg_attrs == 1)
1267      * case.
1268      */
1269     const struct dma_map_ops *ops __attribute__ ((unused)) = get_dma_ops(dev);
1270     #else
1271     const struct dma_mapping_ops *ops __attribute__ ((unused)) = dma_ops;
1272     #endif
1273 
1274     /*
1275      * The switch from dma_mapping_ops -> dma_map_ops coincided with the
1276      * switch from swiotlb_map_sg -> swiotlb_map_sg_attrs.
1277      */
1278       #if defined(NVCPU_AARCH64) && \
1279           defined(NV_NONCOHERENT_SWIOTLB_DMA_OPS_PRESENT)
1280     /* AArch64 exports these symbols directly */
1281     swiotlb_in_use = ((ops == &noncoherent_swiotlb_dma_ops) ||
1282                       (ops == &coherent_swiotlb_dma_ops));
1283       #elif NV_IS_EXPORT_SYMBOL_PRESENT_swiotlb_map_sg_attrs != 0
1284     swiotlb_in_use = (ops->map_sg == swiotlb_map_sg_attrs);
1285       #elif NV_IS_EXPORT_SYMBOL_PRESENT_swiotlb_dma_ops != 0
1286     swiotlb_in_use = (ops == &swiotlb_dma_ops);
1287       #endif
1288       /*
1289        * The "else" case that is not shown
1290        * (for NV_IS_EXPORT_SYMBOL_PRESENT_swiotlb_map_sg_attrs == 0 ||
1291        * NV_IS_EXPORT_SYMBOL_PRESENT_swiotlb_dma_ops == 0) does
1292        * nothing, and ends up dropping us out to the last line of this function,
1293        * effectively returning false. The nearly-human-readable version of that
1294        * case is "struct swiotlb_dma_ops is present (NV_SWIOTLB_DMA_OPS_PRESENT
1295        * is defined) but neither swiotlb_map_sg_attrs nor swiotlb_dma_ops is
1296        * present".
1297        *
       * That can happen on kernels that fall within the following range:
1299        *
1300        * 2017-12-24  4bd89ed39b2ab8dc4ac4b6c59b07d420b0213bec
1301        *     ("swiotlb: remove various exports")
1302        * 2018-06-28  210d0797c97d0e8f3b1a932a0dc143f4c57008a3
1303        *     ("swiotlb: export swiotlb_dma_ops")
1304        *
1305        * Related to this: Between above two commits, this driver has no way of
1306        * detecting whether or not the SWIOTLB is in use. Furthermore, the
1307        * driver cannot support DMA remapping. That leads to the following
       * point: "swiotlb=force" is not supported for kernels falling in the
       * range above.
1310        *
1311        * The other "else" case that is not shown:
1312        * Starting with the 5.0 kernel, swiotlb is integrated into dma_direct,
1313        * which is used when there's no IOMMU.  In these kernels, ops == NULL,
1314        * swiotlb_dma_ops no longer exists, and we do not support swiotlb=force
1315        * (doing so would require detecting when swiotlb=force is enabled and
1316        * then returning NV_TRUE even when dma_direct is in use).  So for now,
1317        * we just return NV_FALSE and in nv_compute_gfp_mask() we check for
1318        * whether swiotlb could possibly be used (outside of swiotlb=force).
1319        */
1320   #endif
1321 
1322     /*
1323      * Commit 2017-11-07 d7b417fa08d ("x86/mm: Add DMA support for
1324      * SEV memory encryption") forces SWIOTLB to be enabled when AMD SEV
1325      * is active in all cases.
1326      */
1327     if (os_sev_enabled)
1328         swiotlb_in_use = NV_TRUE;
1329 #endif
1330 
1331     return swiotlb_in_use;
1332 }
1333 
1334 /*
1335  * TODO: Bug 1522381 will allow us to move these mapping relationships into
1336  *       common code.
1337  */
1338 
1339 /*
1340  * Bug 1606851: the Linux kernel scatterlist code doesn't work for regions
1341  * greater than or equal to 4GB, due to regular use of unsigned int
1342  * throughout. So we need to split our mappings into 4GB-minus-1-page-or-less
1343  * chunks and manage them separately.
1344  */
1345 typedef struct nv_dma_submap_s {
1346     NvU32 page_count;
1347     NvU32 sg_map_count;
1348     struct sg_table sgt;
1349     NvBool imported;
1350 } nv_dma_submap_t;
1351 
1352 typedef struct nv_dma_map_s {
1353     struct page **pages;
1354     NvU64 page_count;
1355     NvBool contiguous;
1356     NvU32 cache_type;
1357     struct sg_table *import_sgt;
1358 
1359     union
1360     {
1361         struct
1362         {
1363             NvU32 submap_count;
1364             nv_dma_submap_t *submaps;
1365         } discontig;
1366 
1367         struct
1368         {
1369             NvU64 dma_addr;
1370         } contig;
1371     } mapping;
1372 
1373     struct device *dev;
1374 } nv_dma_map_t;
1375 
1376 #define NV_FOR_EACH_DMA_SUBMAP(dm, sm, i)                                     \
1377     for (i = 0, sm = &dm->mapping.discontig.submaps[0];                       \
1378          i < dm->mapping.discontig.submap_count;                              \
1379          i++, sm = &dm->mapping.discontig.submaps[i])
1380 
1381 #define NV_DMA_SUBMAP_MAX_PAGES           ((NvU32)(NV_U32_MAX >> PAGE_SHIFT))
#define NV_DMA_SUBMAP_IDX_TO_PAGE_IDX(s)  ((s) * NV_DMA_SUBMAP_MAX_PAGES)
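
/*
 * Example (illustrative, assuming 4K pages): NV_DMA_SUBMAP_MAX_PAGES is
 * 0xFFFFF, i.e. one page short of 4GB of coverage, so submap 0 covers page
 * indices [0, 0xFFFFF) and submap 1 starts at page index
 * NV_DMA_SUBMAP_IDX_TO_PAGE_IDX(1) == 0xFFFFF.
 */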
1383 
1384 /*
1385  * DO NOT use sg_alloc_table_from_pages on Xen Server, even if it's available.
1386  * This will glom multiple pages into a single sg element, which
1387  * xen_swiotlb_map_sg_attrs may try to route to the SWIOTLB. We must only use
1388  * single-page sg elements on Xen Server.
1389  */
1390 #if !defined(NV_DOM0_KERNEL_PRESENT)
1391     #define NV_ALLOC_DMA_SUBMAP_SCATTERLIST(dm, sm, i)                        \
1392         ((sg_alloc_table_from_pages(&sm->sgt,                                 \
1393             &dm->pages[NV_DMA_SUBMAP_IDX_TO_PAGE_IDX(i)],                     \
1394             sm->page_count, 0,                                                \
1395             sm->page_count * PAGE_SIZE, NV_GFP_KERNEL) == 0) ? NV_OK :        \
1396                 NV_ERR_OPERATING_SYSTEM)
1397 #else
1398     #define NV_ALLOC_DMA_SUBMAP_SCATTERLIST(dm, sm, i)                \
1399         ((sg_alloc_table(&sm->sgt, sm->page_count, NV_GFP_KERNEL)) == \
1400             0 ? NV_OK : NV_ERR_OPERATING_SYSTEM)
1401 #endif
1402 
1403 typedef struct nv_ibmnpu_info nv_ibmnpu_info_t;
1404 
1405 typedef struct nv_work_s {
1406     struct work_struct task;
1407     void *data;
1408 } nv_work_t;
1409 
1410 #define NV_MAX_REGISTRY_KEYS_LENGTH   512
1411 
1412 typedef enum
1413 {
1414     NV_DEV_STACK_TIMER,
1415     NV_DEV_STACK_ISR,
1416     NV_DEV_STACK_ISR_BH,
1417     NV_DEV_STACK_ISR_BH_UNLOCKED,
1418     NV_DEV_STACK_GPU_WAKEUP,
1419     NV_DEV_STACK_COUNT
1420 } nvidia_linux_dev_stack_t;
1421 
1422 /* Linux version of the opaque type used for os_queue_work_item() */
1423 struct os_work_queue {
1424     nv_kthread_q_t nvk;
1425 };
1426 
1427 /* Linux version of the opaque type used for os_wait_*() */
1428 struct os_wait_queue {
1429     struct completion q;
1430 };
1431 
1432 /*
 * To report an error for MSI/MSI-X when the unhandled interrupt count
 * reaches a threshold
1434  */
1435 
1436 typedef struct nv_irq_count_info_s
1437 {
1438     int    irq;
1439     NvU64  unhandled;
1440     NvU64  total;
1441     NvU64  last_unhandled;
1442 } nv_irq_count_info_t;
1443 
1444 /* Linux-specific version of nv_dma_device_t */
1445 struct nv_dma_device {
1446     struct {
1447         NvU64 start;
1448         NvU64 limit;
1449     } addressable_range;
1450 
1451     struct device *dev;
1452     NvBool nvlink;
1453 };
1454 
1455 /* Properties of the coherent link */
1456 typedef struct coherent_link_info_s {
    /* Physical address of the GPU memory in the SOC AMAP. In a bare-metal OS
     * environment this is the System Physical Address (SPA); in a virtualized
     * OS environment it is the Intermediate Physical Address (IPA). */
1460     NvU64 gpu_mem_pa;
1461     /* Bitmap of NUMA node ids, corresponding to the reserved PXMs,
1462      * available for adding GPU memory to the kernel as system RAM */
1463     DECLARE_BITMAP(free_node_bitmap, MAX_NUMNODES);
1464 } coherent_link_info_t;
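
/*
 * Illustrative sketch (an assumption, for exposition only): walking the
 * reserved NUMA node ids published in coherent_link_info_t. Real onlining
 * code also consults the per-device numa_info state before using a node.
 */
static inline int nv_example_first_free_coherent_node(coherent_link_info_t *info)
{
    int node;

    for_each_set_bit(node, info->free_node_bitmap, MAX_NUMNODES)
    {
        return node;    /* first node available for adding GPU memory */
    }

    return NUMA_NO_NODE;
}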
1465 
1466 #if defined(NV_LINUX_ACPI_EVENTS_SUPPORTED)
1467 /*
1468  * acpi data storage structure
1469  *
1470  * This structure retains the pointer to the device,
1471  * and any other baggage we want to carry along
1472  *
1473  */
1474 typedef struct
1475 {
1476     nvidia_stack_t *sp;
1477     struct acpi_device *device;
1478     struct acpi_handle *handle;
1479     void *notifier_data;
1480     int notify_handler_installed;
1481 } nv_acpi_t;
1482 #endif
1483 
/* Linux-specific version of the old nv_state_t */
/* This is the general OS-specific state structure. The first element *must*
   be the general state structure, for the generic UNIX-based code. */
1487 typedef struct nv_linux_state_s {
1488     nv_state_t nv_state;
1489 
1490     atomic_t usage_count;
1491     NvU32    suspend_count;
1492 
1493     struct device  *dev;
1494     struct pci_dev *pci_dev;
1495 
1496     /* IBM-NPU info associated with this GPU */
1497     nv_ibmnpu_info_t *npu;
1498 
    /* Coherent link information */
    coherent_link_info_t coherent_link_info;
1501 
    /* Dedicated queue used for removing FB memory that has been onlined
     * to the kernel as a NUMA node. Refer to Bug 3879845. */
1504     nv_kthread_q_t remove_numa_memory_q;
1505 
1506     /* NUMA node information for the platforms where GPU memory is presented
1507      * as a NUMA node to the kernel */
1508     struct {
        /* NUMA node id is >= 0 when the platform supports GPU memory as a
         * NUMA node; otherwise it holds NUMA_NO_NODE */
1511         NvS32 node_id;
1512 
1513         /* NUMA online/offline status for platforms that support GPU memory as
1514          * NUMA node */
1515         atomic_t status;
1516         NvBool use_auto_online;
1517     } numa_info;
1518 
1519     nvidia_stack_t *sp[NV_DEV_STACK_COUNT];
1520 
1521     char registry_keys[NV_MAX_REGISTRY_KEYS_LENGTH];
1522 
1523     nv_work_t work;
1524 
1525     /* get a timer callback every second */
1526     struct nv_timer rc_timer;
1527 
1528     /* lock for linux-specific data, not used by core rm */
1529     struct semaphore ldata_lock;
1530 
1531     /* proc directory information */
1532     struct proc_dir_entry *proc_dir;
1533 
1534     NvU32 minor_num;
1535     struct nv_linux_state_s *next;
1536 
1537     /* DRM private information */
1538     struct drm_device *drm;
1539 
1540     /* kthread based bottom half servicing queue and elements */
1541     nv_kthread_q_t bottom_half_q;
1542     nv_kthread_q_item_t bottom_half_q_item;
1543 
    /* Lock protecting the common allocated stack used by the unlocked bottom half */
1545     void *isr_bh_unlocked_mutex;
1546 
1547     NvBool tce_bypass_enabled;
1548 
1549     NvU32 num_intr;
1550 
1551     /* Lock serializing ISRs for different MSI-X vectors */
1552     nv_spinlock_t msix_isr_lock;
1553 
1554     /* Lock serializing bottom halves for different MSI-X vectors */
1555     void *msix_bh_mutex;
1556 
1557     struct msix_entry *msix_entries;
1558 
1559     NvU64 numa_memblock_size;
1560 
1561     struct {
1562         struct backlight_device *dev;
1563         NvU32 displayId;
1564         const char *device_name;
1565     } backlight;
1566 
1567     /*
1568      * file handle for pci sysfs config file (/sys/bus/pci/devices/.../config)
1569      * which will be opened during device probe
1570      */
1571     struct file *sysfs_config_file;
1572 
1573     /* Per-GPU queue */
1574     struct os_work_queue queue;
1575 
1576     /* GPU user mapping revocation/remapping (only for non-CTL device) */
1577     struct semaphore mmap_lock; /* Protects all fields in this category */
1578     struct list_head open_files;
1579     NvBool all_mappings_revoked;
1580     NvBool safe_to_mmap;
1581     NvBool gpu_wakeup_callback_needed;
1582 
1583     /* Per-device notifier block for ACPI events */
1584     struct notifier_block acpi_nb;
1585 
1586 #if defined(NV_LINUX_ACPI_EVENTS_SUPPORTED)
1587     nv_acpi_t* nv_acpi_object;
1588 #endif
1589 
1590     /* Lock serializing ISRs for different SOC vectors */
1591     nv_spinlock_t soc_isr_lock;
1592     void *soc_bh_mutex;
1593 
1594     struct nv_timer snapshot_timer;
1595     nv_spinlock_t snapshot_timer_lock;
1596     void (*snapshot_callback)(void *context);
1597 
    /* Per-IRQ unhandled and total counts, and the last-unhandled timestamp */
1599     nv_irq_count_info_t *irq_count;
1600 
    /* Number of distinct IRQs that have triggered and are being tracked */
1602     NvU16 current_num_irq_tracked;
1603 
1604     NvBool is_forced_shutdown;
1605 
1606     struct nv_dma_device dma_dev;
1607     struct nv_dma_device niso_dma_dev;
1608 } nv_linux_state_t;
1609 
1610 extern nv_linux_state_t *nv_linux_devices;
1611 
1612 /*
1613  * Macros to protect operations on nv_linux_devices list
1614  * Lock acquisition order while using the nv_linux_devices list
1615  * 1. LOCK_NV_LINUX_DEVICES()
1616  * 2. Traverse the list
1617  *    If the list is traversed to search for an element say nvl,
1618  *    acquire the nvl->ldata_lock before step 3
1619  * 3. UNLOCK_NV_LINUX_DEVICES()
1620  * 4. Release nvl->ldata_lock after any read/write access to the
1621  *    nvl element is complete
1622  */
1623 extern struct semaphore nv_linux_devices_lock;
1624 #define LOCK_NV_LINUX_DEVICES()     down(&nv_linux_devices_lock)
1625 #define UNLOCK_NV_LINUX_DEVICES()   up(&nv_linux_devices_lock)
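
/*
 * Illustrative sketch (an assumption, not an existing helper): looking up a
 * device by minor number while honouring the lock order documented above.
 * When a device is returned, the caller performs step 4 by releasing
 * nvl->ldata_lock once it is done with the element.
 */
static inline nv_linux_state_t *nv_example_find_minor_locked(NvU32 minor)
{
    nv_linux_state_t *nvl;

    LOCK_NV_LINUX_DEVICES();                        /* step 1 */
    for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
    {
        if (nvl->minor_num == minor)
        {
            down(&nvl->ldata_lock);                 /* step 2: per-device lock */
            break;
        }
    }
    UNLOCK_NV_LINUX_DEVICES();                      /* step 3 */

    return nvl;
}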
1626 
1627 /*
1628  * Lock to synchronize system power management transitions,
1629  * and to protect the global system PM state.  The procfs power
1630  * management interface acquires this lock in write mode for
 * the duration of the sleep operation; any other path accessing
 * device state must acquire the lock in read mode.
1633  */
1634 extern struct rw_semaphore nv_system_pm_lock;
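
/*
 * Illustrative sketch (an assumption): a device-state access bracketed by the
 * system PM lock in read mode, per the comment above. The suspend/resume path
 * holds the same lock in write mode via down_write()/up_write().
 */
static inline void nv_example_access_state_under_pm_lock(void)
{
    down_read(&nv_system_pm_lock);
    /* ... read or program device state here ... */
    up_read(&nv_system_pm_lock);
}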
1635 
1636 extern NvBool nv_ats_supported;
1637 
1638 /*
1639  * file-private data
1640  * hide a pointer to our data structures in a file-private ptr
1641  * there are times we need to grab this data back from the file
1642  * data structure..
1643  */
1644 
1645 typedef struct nvidia_event
1646 {
1647     struct nvidia_event *next;
1648     nv_event_t event;
1649 } nvidia_event_t;
1650 
1651 typedef enum
1652 {
1653     NV_FOPS_STACK_INDEX_MMAP,
1654     NV_FOPS_STACK_INDEX_IOCTL,
1655     NV_FOPS_STACK_INDEX_COUNT
1656 } nvidia_entry_point_index_t;
1657 
1658 typedef struct
1659 {
1660     nv_file_private_t nvfp;
1661 
1662     nvidia_stack_t *sp;
1663     nvidia_stack_t *fops_sp[NV_FOPS_STACK_INDEX_COUNT];
1664     struct semaphore fops_sp_lock[NV_FOPS_STACK_INDEX_COUNT];
1665     nv_alloc_t *free_list;
1666     void *nvptr;
1667     nvidia_event_t *event_data_head, *event_data_tail;
1668     NvBool dataless_event_pending;
1669     nv_spinlock_t fp_lock;
1670     wait_queue_head_t waitqueue;
1671     nv_kthread_q_item_t deferred_close_q_item;
1672     NvU32 *attached_gpus;
1673     size_t num_attached_gpus;
1674     nv_alloc_mapping_context_t mmap_context;
1675     struct address_space mapping;
1676 
1677     struct list_head entry;
1678 } nv_linux_file_private_t;
1679 
1680 static inline nv_linux_file_private_t *nv_get_nvlfp_from_nvfp(nv_file_private_t *nvfp)
1681 {
1682     return container_of(nvfp, nv_linux_file_private_t, nvfp);
1683 }
1684 
1685 #define NV_SET_FILE_PRIVATE(filep,data) ((filep)->private_data = (data))
1686 #define NV_GET_LINUX_FILE_PRIVATE(filep) ((nv_linux_file_private_t *)(filep)->private_data)
1687 
1688 /* for the card devices */
1689 #define NV_GET_NVL_FROM_FILEP(filep)    (NV_GET_LINUX_FILE_PRIVATE(filep)->nvptr)
1690 #define NV_GET_NVL_FROM_NV_STATE(nv)    ((nv_linux_state_t *)nv->os_state)
1691 
1692 #define NV_STATE_PTR(nvl)   &(((nv_linux_state_t *)(nvl))->nv_state)
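
/*
 * Illustrative sketch (an assumption): recovering the per-device state from a
 * struct file with the accessors above, as an ioctl or mmap handler would.
 */
static inline nv_state_t *nv_example_nv_from_filep(struct file *filep)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_FILEP(filep);

    return (nvl != NULL) ? NV_STATE_PTR(nvl) : NULL;
}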
1693 
1694 static inline nvidia_stack_t *nv_nvlfp_get_sp(nv_linux_file_private_t *nvlfp, nvidia_entry_point_index_t which)
1695 {
1696 #if defined(NVCPU_X86_64)
1697     if (rm_is_altstack_in_use())
1698     {
1699         down(&nvlfp->fops_sp_lock[which]);
1700         return nvlfp->fops_sp[which];
1701     }
1702 #endif
1703     return NULL;
1704 }
1705 
1706 static inline void nv_nvlfp_put_sp(nv_linux_file_private_t *nvlfp, nvidia_entry_point_index_t which)
1707 {
1708 #if defined(NVCPU_X86_64)
1709     if (rm_is_altstack_in_use())
1710     {
1711         up(&nvlfp->fops_sp_lock[which]);
1712     }
1713 #endif
1714 }
1715 
1716 #define NV_ATOMIC_READ(data)            atomic_read(&(data))
1717 #define NV_ATOMIC_SET(data,val)         atomic_set(&(data), (val))
1718 #define NV_ATOMIC_INC(data)             atomic_inc(&(data))
1719 #define NV_ATOMIC_DEC(data)             atomic_dec(&(data))
1720 #define NV_ATOMIC_DEC_AND_TEST(data)    atomic_dec_and_test(&(data))
1721 
1722 static inline struct kmem_cache *nv_kmem_cache_create(const char *name, unsigned int size,
1723                                                       unsigned int align)
1724 {
1725     char *name_unique;
1726     struct kmem_cache *cache;
1727 
1728 #if defined(NV_KMEM_CACHE_HAS_KOBJ_REMOVE_WORK) && !defined(NV_SYSFS_SLAB_UNLINK_PRESENT)
1729     size_t len;
1730     NvU64 tm_ns = nv_ktime_get_raw_ns();
1731 
1732     /*
1733      * Wait for timer to change at least once. This ensures
1734      * that the name generated below is always unique.
1735      */
1736     while (tm_ns == nv_ktime_get_raw_ns());
1737     tm_ns = nv_ktime_get_raw_ns();
1738 
1739     /* 20 is the max length of a 64-bit integer printed in decimal */
1740     len = strlen(name) + 20 + 1;
1741     name_unique = kzalloc(len, GFP_KERNEL);
1742     if (!name_unique)
1743         return NULL;
1744 
1745     if (snprintf(name_unique, len, "%s-%llu", name, tm_ns) >= len)
1746     {
1747         WARN(1, "kmem cache name too long: %s\n", name);
1748         kfree(name_unique);
1749         return NULL;
1750     }
1751 #else
1752     name_unique = (char *)name;
1753 #endif
1754     cache = kmem_cache_create(name_unique, size, align, 0, nv_kmem_ctor_dummy);
1755     if (name_unique != name)
1756         kfree(name_unique);
1757 
1758     return cache;
1759 }
1760 
1761 #if defined(CONFIG_PCI_IOV)
1762 #define NV_PCI_SRIOV_SUPPORT
1763 #endif /* CONFIG_PCI_IOV */
1764 
1765 #define NV_PCIE_CFG_MAX_OFFSET 0x1000
1766 
1767 #include "nv-proto.h"
1768 
1769 /*
1770  * Check if GPU is present on the bus by checking flag
1771  * NV_FLAG_IN_SURPRISE_REMOVAL(set when eGPU is removed from TB3).
1772  */
1773 static inline NV_STATUS nv_check_gpu_state(nv_state_t *nv)
1774 {
1775 #if !defined(NVCPU_PPC64LE)
1776     if (NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv))
1777     {
1778         return NV_ERR_GPU_IS_LOST;
1779     }
1780 #endif
1781 
1782     return NV_OK;
1783 }
1784 
1785 extern NvU32 NVreg_EnableUserNUMAManagement;
1786 extern NvU32 NVreg_RegisterPCIDriver;
1787 extern NvU32 NVreg_EnableResizableBar;
1788 
1789 extern NvU32 num_probed_nv_devices;
1790 extern NvU32 num_nv_devices;
1791 
1792 #define NV_FILE_INODE(file) (file)->f_inode
1793 
1794 #if defined(NV_DOM0_KERNEL_PRESENT) || defined(NV_VGPU_KVM_BUILD)
1795 #define NV_VGX_HYPER
1796 #if defined(NV_XEN_IOEMU_INJECT_MSI)
1797 #include <xen/ioemu.h>
1798 #endif
1799 #endif
1800 
1801 static inline NvU64 nv_pci_bus_address(struct pci_dev *dev, NvU8 bar_index)
1802 {
1803     NvU64 bus_addr = 0;
1804 #if defined(NV_PCI_BUS_ADDRESS_PRESENT)
1805     bus_addr = pci_bus_address(dev, bar_index);
1806 #elif defined(CONFIG_PCI)
1807     struct pci_bus_region region;
1808 
1809     pcibios_resource_to_bus(dev, &region, &dev->resource[bar_index]);
1810     bus_addr = region.start;
1811 #endif
1812     return bus_addr;
1813 }
1814 
1815 /*
1816  * Decrements the usage count of the allocation, and moves the allocation to
1817  * the given nvlfp's free list if the usage count drops to zero.
1818  *
1819  * Returns NV_TRUE if the allocation is moved to the nvlfp's free list.
1820  */
1821 static inline NvBool nv_alloc_release(nv_linux_file_private_t *nvlfp, nv_alloc_t *at)
1822 {
1823     NV_PRINT_AT(NV_DBG_MEMINFO, at);
1824 
1825     if (NV_ATOMIC_DEC_AND_TEST(at->usage_count))
1826     {
1827         NV_ATOMIC_INC(at->usage_count);
1828 
1829         at->next = nvlfp->free_list;
1830         nvlfp->free_list = at;
1831         return NV_TRUE;
1832     }
1833 
1834     return NV_FALSE;
1835 }
1836 
1837 /*
1838  * RB_EMPTY_ROOT was added in 2.6.18 by this commit:
1839  *   2006-06-21  dd67d051529387f6e44d22d1d5540ef281965fdd
1840  */
1841 #if !defined(RB_EMPTY_ROOT)
1842 #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
1843 #endif
1844 
1845 /*
1846  * Starting on Power9 systems, DMA addresses for NVLink are no longer
1847  * the same as used over PCIe.
1848  *
1849  * Power9 supports a 56-bit Real Address. This address range is compressed
1850  * when accessed over NVLink to allow the GPU to access all of memory using
1851  * its 47-bit Physical address.
1852  *
1853  * If there is an NPU device present on the system, it implies that NVLink
1854  * sysmem links are present and we need to apply the required address
1855  * conversion for NVLink within the driver.
1856  *
1857  * See Bug 1920398 for further background and details.
1858  *
 * Note, a deviation from the documented compression scheme is that the
 * upper address bits (i.e. bits 56-63), instead of being set to zero, are
 * preserved during NVLink address compression so the original PCIe DMA
 * address can be reconstructed on expansion. These bits can be safely
 * ignored on NVLink since they are truncated by the GPU.
1864  *
1865  * Bug 1968345: As a performance enhancement it is the responsibility of
1866  * the caller on PowerPC platforms to check for presence of an NPU device
1867  * before the address transformation is applied.
1868  */
1869 static inline NvU64 nv_compress_nvlink_addr(NvU64 addr)
1870 {
1871     NvU64 addr47 = addr;
1872 
1873 #if defined(NVCPU_PPC64LE)
1874     addr47 = addr & ((1ULL << 43) - 1);
1875     addr47 |= (addr & (0x3ULL << 45)) >> 2;
1876     WARN_ON(addr47 & (1ULL << 44));
1877     addr47 |= (addr & (0x3ULL << 49)) >> 4;
1878     addr47 |= addr & ~((1ULL << 56) - 1);
1879 #endif
1880 
1881     return addr47;
1882 }
1883 
1884 static inline NvU64 nv_expand_nvlink_addr(NvU64 addr47)
1885 {
1886     NvU64 addr = addr47;
1887 
1888 #if defined(NVCPU_PPC64LE)
1889     addr = addr47 & ((1ULL << 43) - 1);
1890     addr |= (addr47 & (3ULL << 43)) << 2;
1891     addr |= (addr47 & (3ULL << 45)) << 4;
1892     addr |= addr47 & ~((1ULL << 56) - 1);
1893 #endif
1894 
1895     return addr;
1896 }
1897 
/* Default flags for ISRs */
1899 static inline NvU32 nv_default_irq_flags(nv_state_t *nv)
1900 {
1901     NvU32 flags = 0;
1902 
1903     /*
1904      * Request IRQs to be disabled in our ISRs to keep consistency across the
1905      * supported kernel versions.
1906      *
     * IRQF_DISABLED became the default in 2.6.35 with commit e58aa3d2d0cc
     * from March 2010, and was later removed entirely in 4.1 with commit
     * d8bf368d0631 from March 2015. Add it to our flags if it is defined to
     * get the same behaviour on pre-2.6.35 kernels as on recent ones.
1911      */
1912 #if defined(IRQF_DISABLED)
1913     flags |= IRQF_DISABLED;
1914 #endif
1915 
1916     /*
1917      * For legacy interrupts, also allow sharing. Sharing doesn't make sense
1918      * for MSI(-X) as on Linux they are never shared across different devices
1919      * and we only register one ISR today.
1920      */
1921     if ((nv->flags & (NV_FLAG_USES_MSI | NV_FLAG_USES_MSIX)) == 0)
1922         flags |= IRQF_SHARED;
1923 
1924     return flags;
1925 }
1926 
1927 /*
1928  * From v3.7-rc1 kernel have stopped exporting get_unused_fd() and started
1929  * exporting get_unused_fd_flags(), as of this commit:
1930  * 2012-09-26 1a7bd2265fc ("make get_unused_fd_flags() a function")
1931  */
1932 #if NV_IS_EXPORT_SYMBOL_PRESENT_get_unused_fd
1933     #define NV_GET_UNUSED_FD()  get_unused_fd()
1934 #else
1935     #define NV_GET_UNUSED_FD()  get_unused_fd_flags(0)
1936 #endif
1937 
1938 #if NV_IS_EXPORT_SYMBOL_PRESENT_get_unused_fd_flags
1939     #define NV_GET_UNUSED_FD_FLAGS(flags)  get_unused_fd_flags(flags)
1940 #else
1941     #define NV_GET_UNUSED_FD_FLAGS(flags)  (-1)
1942 #endif
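
/*
 * Illustrative sketch (an assumption): preferring the flags-taking variant
 * and falling back to the legacy export on kernels where only the latter is
 * available.
 */
static inline int nv_example_get_fd(void)
{
    int fd = NV_GET_UNUSED_FD_FLAGS(0);

    if (fd < 0)
        fd = NV_GET_UNUSED_FD();

    return fd;
}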
1943 
1944 #define MODULE_BASE_NAME "nvidia"
1945 #define MODULE_INSTANCE_NUMBER 0
1946 #define MODULE_INSTANCE_STRING ""
1947 #define MODULE_NAME MODULE_BASE_NAME MODULE_INSTANCE_STRING
1948 
1949 NvS32 nv_request_soc_irq(nv_linux_state_t *, NvU32, nv_soc_irq_type_t, NvU32, NvU32, const char*);
1950 
1951 static inline void nv_mutex_destroy(struct mutex *lock)
1952 {
1953     mutex_destroy(lock);
1954 }
1955 
1956 static inline NvBool nv_platform_supports_numa(nv_linux_state_t *nvl)
1957 {
1958     return nvl->numa_info.node_id != NUMA_NO_NODE;
1959 }
1960 
1961 static inline int nv_get_numa_status(nv_linux_state_t *nvl)
1962 {
1963     if (!nv_platform_supports_numa(nvl))
1964     {
1965         return NV_IOCTL_NUMA_STATUS_DISABLED;
1966     }
1967 
1968     return NV_ATOMIC_READ(nvl->numa_info.status);
1969 }
1970 
1971 static inline int nv_set_numa_status(nv_linux_state_t *nvl, int status)
1972 {
1973     if (!nv_platform_supports_numa(nvl))
1974     {
1975         return -EINVAL;
1976     }
1977 
1978     NV_ATOMIC_SET(nvl->numa_info.status, status);
1979     return 0;
1980 }
1981 
1982 static inline NvBool nv_platform_use_auto_online(nv_linux_state_t *nvl)
1983 {
1984     return nvl->numa_info.use_auto_online;
1985 }
1986 
1987 typedef struct {
1988     NvU64 base;
1989     NvU64 size;
1990     NvU32 nodeId;
1991     int ret;
1992 } remove_numa_memory_info_t;
1993 
1994 static void offline_numa_memory_callback
1995 (
1996     void *args
1997 )
1998 {
1999 #ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT
2000     remove_numa_memory_info_t *pNumaInfo = (remove_numa_memory_info_t *)args;
2001 #ifdef NV_REMOVE_MEMORY_HAS_NID_ARG
2002     pNumaInfo->ret = offline_and_remove_memory(pNumaInfo->nodeId,
2003                                                pNumaInfo->base,
2004                                                pNumaInfo->size);
2005 #else
2006     pNumaInfo->ret = offline_and_remove_memory(pNumaInfo->base,
2007                                                pNumaInfo->size);
2008 #endif
2009 #endif
2010 }
2011 
2012 typedef enum
2013 {
2014     NV_NUMA_STATUS_DISABLED             = 0,
2015     NV_NUMA_STATUS_OFFLINE              = 1,
2016     NV_NUMA_STATUS_ONLINE_IN_PROGRESS   = 2,
2017     NV_NUMA_STATUS_ONLINE               = 3,
2018     NV_NUMA_STATUS_ONLINE_FAILED        = 4,
2019     NV_NUMA_STATUS_OFFLINE_IN_PROGRESS  = 5,
2020     NV_NUMA_STATUS_OFFLINE_FAILED       = 6,
2021     NV_NUMA_STATUS_COUNT
2022 } nv_numa_status_t;
2023 
2024 #if defined(NV_LINUX_PLATFORM_DEVICE_H_PRESENT)
2025 #include <linux/platform_device.h>
2026 #endif
2027 
2028 #if defined(NV_LINUX_MUTEX_H_PRESENT)
2029 #include <linux/mutex.h>
2030 #endif
2031 
2032 #if defined(NV_LINUX_RESET_H_PRESENT)
2033 #include <linux/reset.h>
2034 #endif
2035 
2036 #if defined(NV_LINUX_DMA_BUF_H_PRESENT)
2037 #include <linux/dma-buf.h>
2038 #endif
2039 
2040 #if defined(NV_LINUX_GPIO_H_PRESENT)
2041 #include <linux/gpio.h>
2042 #endif
2043 
2044 #if defined(NV_LINUX_OF_GPIO_H_PRESENT)
2045 #include <linux/of_gpio.h>
2046 #endif
2047 
2048 #if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
2049 #include <linux/of_device.h>
2050 #endif
2051 
2052 #if defined(NV_LINUX_OF_PLATFORM_H_PRESENT)
2053 #include <linux/of_platform.h>
2054 #endif
2055 
2056 #if defined(NV_LINUX_INTERCONNECT_H_PRESENT)
2057 #include <linux/interconnect.h>
2058 #endif
2059 
2060 #if defined(NV_LINUX_PM_RUNTIME_H_PRESENT)
2061 #include <linux/pm_runtime.h>
2062 #endif
2063 
2064 #if defined(NV_LINUX_CLK_H_PRESENT)
2065 #include <linux/clk.h>
2066 #endif
2067 
2068 #if defined(NV_LINUX_CLK_PROVIDER_H_PRESENT)
2069 #include <linux/clk-provider.h>
2070 #endif
2071 
2072 #endif  /* _NV_LINUX_H_ */
2073