1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #ifndef _NV_LINUX_H_
25 #define _NV_LINUX_H_
26 
27 #include "nvstatus.h"
28 #include "nv.h"
29 #include "nv-ioctl-numa.h"
30 #include "conftest.h"
31 
32 #include "nv-lock.h"
33 #include "nv-pgprot.h"
34 #include "nv-mm.h"
35 #include "os-interface.h"
36 #include "nv-timer.h"
37 #include "nv-time.h"
38 #include "nv-chardev-numbers.h"
39 
40 #define NV_KERNEL_NAME "Linux"
41 
42 #ifndef AUTOCONF_INCLUDED
43 #if defined(NV_GENERATED_AUTOCONF_H_PRESENT)
44 #include <generated/autoconf.h>
45 #else
46 #include <linux/autoconf.h>
47 #endif
48 #endif
49 
50 #if defined(NV_GENERATED_UTSRELEASE_H_PRESENT)
51   #include <generated/utsrelease.h>
52 #endif
53 
54 #if defined(NV_GENERATED_COMPILE_H_PRESENT)
55   #include <generated/compile.h>
56 #endif
57 
58 #include <linux/version.h>
59 #include <linux/utsname.h>
60 
61 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32)
62 #error "This driver does not support kernels older than 2.6.32!"
63 #elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 7, 0)
64 #  define KERNEL_2_6
65 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
66 #  define KERNEL_3
67 #else
68 #error "This driver does not support development kernels!"
69 #endif
70 
71 #if defined (CONFIG_SMP) && !defined (__SMP__)
72 #define __SMP__
73 #endif
74 
75 #if defined (CONFIG_MODVERSIONS) && !defined (MODVERSIONS)
76 #  define MODVERSIONS
77 #endif
78 
79 #include <linux/kernel.h>
80 #include <linux/module.h>
81 #include <linux/kmod.h>
82 #include <asm/bug.h>
83 
84 #include <linux/mm.h>
85 
86 #if !defined(VM_RESERVED)
87 #define VM_RESERVED    0x00000000
88 #endif
89 #if !defined(VM_DONTEXPAND)
90 #define VM_DONTEXPAND  0x00000000
91 #endif
92 #if !defined(VM_DONTDUMP)
93 #define VM_DONTDUMP    0x00000000
94 #endif
95 
96 #include <linux/init.h>             /* module_init, module_exit         */
97 #include <linux/types.h>            /* pid_t, size_t, __u32, etc        */
98 #include <linux/errno.h>            /* error codes                      */
99 #include <linux/list.h>             /* circular linked list             */
100 #include <linux/stddef.h>           /* NULL, offsetof                   */
101 #include <linux/wait.h>             /* wait queues                      */
102 #include <linux/string.h>           /* strchr(), strpbrk()              */
103 
104 #include <linux/ctype.h>            /* isspace(), etc                   */
105 #include <linux/console.h>          /* acquire_console_sem(), etc       */
106 #include <linux/cpufreq.h>          /* cpufreq_get                      */
107 
108 #include <linux/slab.h>             /* kmalloc, kfree, etc              */
109 #include <linux/vmalloc.h>          /* vmalloc, vfree, etc              */
110 
111 #include <linux/poll.h>             /* poll_wait                        */
112 #include <linux/delay.h>            /* mdelay, udelay                   */
113 
114 #include <linux/sched.h>            /* suser(), capable() replacement   */
115 
116 #include <linux/random.h>           /* get_random_bytes()               */
117 
118 #if defined(NV_LINUX_DMA_BUF_H_PRESENT)
119 #include <linux/dma-buf.h>
120 #endif
121 
122 #if defined(NV_DRM_AVAILABLE)
123 #if defined(NV_DRM_DRM_DEVICE_H_PRESENT)
124 #include <drm/drm_device.h>
125 #endif
126 
127 #if defined(NV_DRM_DRM_DRV_H_PRESENT)
128 #include <drm/drm_drv.h>
129 #endif
130 
131 #if defined(NV_DRM_DRMP_H_PRESENT)
132 #include <drm/drmP.h>
133 #endif
134 
135 #if defined(NV_DRM_DRM_GEM_H_PRESENT)
136 #include <drm/drm_gem.h>
137 #endif
138 #endif /* NV_DRM_AVAILABLE */
139 
140 /*
141  * sched.h was refactored with this commit (as part of Linux 4.11)
142  *   2017-03-03  1827adb11ad26b2290dc9fe2aaf54976b2439865
143  */
144 #if defined(NV_LINUX_SCHED_SIGNAL_H_PRESENT)
145 #include <linux/sched/signal.h>     /* task_lock(), task_unlock()       */
146 #endif
147 
148 #if defined(NV_LINUX_SCHED_TASK_H_PRESENT)
149 #include <linux/sched/task.h>       /* task_lock(), task_unlock()       */
150 #endif
151 
152 /* task and signal-related items, for kernels < 4.11: */
153 #include <linux/sched.h>            /* task_lock(), task_unlock()       */
154 
155 #include <linux/moduleparam.h>      /* module_param()                   */
156 #include <asm/tlbflush.h>           /* flush_tlb(), flush_tlb_all()     */
157 
158 #include <linux/pci.h>              /* pci_find_class, etc              */
159 #include <linux/interrupt.h>        /* tasklets, interrupt helpers      */
160 #include <linux/timer.h>
161 #include <linux/file.h>             /* fget(), fput()                   */
162 #include <linux/rbtree.h>
163 #include <linux/cpu.h>              /* CPU hotplug support              */
164 
165 #include <linux/pm_runtime.h>       /* pm_runtime_*                     */
166 #include <linux/fdtable.h>          /* files_fdtable, etc               */
167 
168 #include <asm/div64.h>              /* do_div()                         */
169 #if defined(NV_ASM_SYSTEM_H_PRESENT)
170 #include <asm/system.h>             /* cli, sti, save_flags             */
171 #endif
172 #include <asm/io.h>                 /* ioremap, virt_to_phys            */
173 #include <asm/uaccess.h>            /* access_ok                        */
174 #include <asm/page.h>               /* PAGE_OFFSET                      */
175 #include <asm/pgtable.h>            /* pte bit definitions              */
176 #include <asm/bitops.h>             /* __set_bit()                      */
177 
178 #if defined(NV_LINUX_TIME_H_PRESENT)
179 #include <linux/time.h>             /* FD_SET()                         */
180 #endif
181 
182 #include "nv-list-helpers.h"
183 
184 /*
185  * Use current->cred->euid, instead of calling current_euid().
186  * The latter can pull in the GPL-only debug_lockdep_rcu_enabled()
187  * symbol when CONFIG_PROVE_RCU is enabled.  That symbol is only used for debugging.
188  *
189  * The Linux kernel relies on the assumption that only the current process
190  * is permitted to change its cred structure. Therefore, current_euid()
191  * does not require the RCU's read lock on current->cred.
192  */
193 #define NV_CURRENT_EUID() (__kuid_val(current->cred->euid))
194 
195 #if defined(CONFIG_VGA_ARB)
196 #include <linux/vgaarb.h>
197 #endif
198 
199 #include <linux/pagemap.h>
200 #include <linux/dma-mapping.h>
201 
202 #if defined(NV_LINUX_DMA_MAP_OPS_H_PRESENT)
203 #include <linux/dma-map-ops.h>
204 #endif
205 
206 #if defined(CONFIG_SWIOTLB) && defined(NVCPU_AARCH64)
207 #include <linux/swiotlb.h>
208 #endif
209 
210 #include <linux/scatterlist.h>
211 #include <linux/completion.h>
212 #include <linux/highmem.h>
213 
214 #include <linux/nodemask.h>
215 #include <linux/memory.h>
216 
217 #include <linux/workqueue.h>        /* workqueue                        */
218 #include "nv-kthread-q.h"           /* kthread based queue              */
219 
220 #if defined(NV_LINUX_EFI_H_PRESENT)
221 #include <linux/efi.h>              /* efi_enabled                      */
222 #endif
223 
224 #include <linux/fb.h>               /* fb_info struct                   */
225 #include <linux/screen_info.h>      /* screen_info                      */
226 
227 #if !defined(CONFIG_PCI)
228 #warning "Attempting to build driver for a platform with no PCI support!"
229 #include <asm-generic/pci-dma-compat.h>
230 #endif
231 
232 #if defined(CONFIG_CRAY_XT)
233 #include <cray/cray_nvidia.h>
234 NV_STATUS nvos_forward_error_to_cray(struct pci_dev *, NvU32,
235         const char *, va_list);
236 #endif
237 
238 #if defined(NVCPU_PPC64LE) && defined(CONFIG_EEH)
239 #include <asm/eeh.h>
240 #define NV_PCI_ERROR_RECOVERY_ENABLED() eeh_enabled()
241 #define NV_PCI_ERROR_RECOVERY
242 #endif
243 
244 #if defined(NV_ASM_SET_MEMORY_H_PRESENT)
245 #include <asm/set_memory.h>
246 #endif
247 
248 #if defined(NV_SET_MEMORY_UC_PRESENT)
249 #undef NV_SET_PAGES_UC_PRESENT
250 #endif
251 
252 #if !defined(NVCPU_AARCH64) && !defined(NVCPU_PPC64LE) && !defined(NVCPU_RISCV64)
253 #if !defined(NV_SET_MEMORY_UC_PRESENT) && !defined(NV_SET_PAGES_UC_PRESENT)
254 #error "This driver requires the ability to change memory types!"
255 #endif
256 #endif
257 
258 /*
259  * Traditionally, CONFIG_XEN indicated that the target kernel was
260  * built exclusively for use under a Xen hypervisor, requiring
261  * modifications to or disabling of a variety of NVIDIA graphics
262  * driver code paths. As of the introduction of CONFIG_PARAVIRT
263  * and support for Xen hypervisors within the CONFIG_PARAVIRT_GUEST
264  * architecture, CONFIG_XEN merely indicates that the target
265  * kernel can run under a Xen hypervisor, but not that it will.
266  *
267  * If CONFIG_XEN and CONFIG_PARAVIRT are defined, the old Xen
268  * specific code paths are disabled. If the target kernel executes
269  * stand-alone, the NVIDIA graphics driver will work fine. If the
270  * kernel executes under a Xen (or other) hypervisor, however, the
271  * NVIDIA graphics driver has no way of knowing and is unlikely
272  * to work correctly.
273  */
274 #if defined(CONFIG_XEN) && !defined(CONFIG_PARAVIRT)
275 #include <asm/maddr.h>
276 #include <xen/interface/memory.h>
277 #define NV_XEN_SUPPORT_FULLY_VIRTUALIZED_KERNEL
278 #endif
279 
280 #ifdef CONFIG_KDB
281 #include <linux/kdb.h>
282 #include <asm/kdb.h>
283 #endif
284 
285 #if defined(CONFIG_X86_REMOTE_DEBUG)
286 #include <linux/gdb.h>
287 #endif
288 
289 #if defined(DEBUG) && defined(CONFIG_KGDB) && \
290     defined(NVCPU_AARCH64)
291 #include <asm/kgdb.h>
292 #endif
293 
294 #if defined(NVCPU_X86_64) && !defined(NV_XEN_SUPPORT_FULLY_VIRTUALIZED_KERNEL)
295 #define NV_ENABLE_PAT_SUPPORT
296 #endif
297 
298 #define NV_PAT_MODE_DISABLED    0
299 #define NV_PAT_MODE_KERNEL      1
300 #define NV_PAT_MODE_BUILTIN     2
301 
302 extern int nv_pat_mode;
303 
304 #if defined(CONFIG_HOTPLUG_CPU)
305 #define NV_ENABLE_HOTPLUG_CPU
306 #include <linux/notifier.h>         /* struct notifier_block, etc       */
307 #endif
308 
309 #if (defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE))
310 #include <linux/i2c.h>
311 #endif
312 
313 #if defined(CONFIG_ACPI)
314 #include <linux/acpi.h>
315 #define NV_LINUX_ACPI_EVENTS_SUPPORTED 1
316 #endif
317 
318 #if defined(NV_LINUX_ACPI_EVENTS_SUPPORTED)
319 #define NV_ACPI_WALK_NAMESPACE(type, start_object, max_depth, \
320         user_function, args...) \
321     acpi_walk_namespace(type, start_object, max_depth, \
322             user_function, NULL, args)
323 #endif
324 
325 #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_PREEMPT_RT_FULL)
326 #define NV_CONFIG_PREEMPT_RT 1
327 #endif
328 
329 #if defined(NV_WRITE_CR4_PRESENT)
330 #define NV_READ_CR4()       read_cr4()
331 #define NV_WRITE_CR4(cr4)   write_cr4(cr4)
332 #else
333 #define NV_READ_CR4()       __read_cr4()
334 #define NV_WRITE_CR4(cr4)   __write_cr4(cr4)
335 #endif
336 
337 #ifndef get_cpu
338 #define get_cpu() smp_processor_id()
339 #define put_cpu()
340 #endif
341 
342 #if !defined(unregister_hotcpu_notifier)
343 #define unregister_hotcpu_notifier unregister_cpu_notifier
344 #endif
345 #if !defined(register_hotcpu_notifier)
346 #define register_hotcpu_notifier register_cpu_notifier
347 #endif
348 
349 #if defined(NVCPU_X86_64)
350 #if !defined(pmd_large)
351 #define pmd_large(_pmd) \
352     ((pmd_val(_pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
353 #endif
354 #endif /* defined(NVCPU_X86_64) */
355 
356 #define NV_PAGE_COUNT(page) \
357   ((unsigned int)page_count(page))
358 #define NV_GET_PAGE_COUNT(page_ptr) \
359   (NV_PAGE_COUNT(NV_GET_PAGE_STRUCT(page_ptr->phys_addr)))
360 #define NV_GET_PAGE_FLAGS(page_ptr) \
361   (NV_GET_PAGE_STRUCT(page_ptr->phys_addr)->flags)
362 
363 /*
364  * Before the introduction of VM_PFNMAP, there was a VM_UNPAGED flag.
365  * Drivers which wanted to call remap_pfn_range on normal pages had to use this
366  * VM_UNPAGED flag *and* set PageReserved. With the introduction of VM_PFNMAP,
367  * that restriction went away. This is described in commit
368  *
369  *   2005-10-28 6aab341e0a28aff100a09831c5300a2994b8b986
370  *     ("mm: re-architect the VM_UNPAGED logic")
371  *
372  * That commit added VM_PFNMAP and vm_normal_page. Therefore, if VM_PFNMAP is
373  * defined, then we do *not* need to mark a page as reserved, in order to
374  * call remap_pfn_range().
375  */
376 #if !defined(VM_PFNMAP)
377 #define NV_MAYBE_RESERVE_PAGE(page_ptr) \
378   SetPageReserved(NV_GET_PAGE_STRUCT(page_ptr->phys_addr))
379 #define NV_MAYBE_UNRESERVE_PAGE(page_ptr) \
380   ClearPageReserved(NV_GET_PAGE_STRUCT(page_ptr->phys_addr))
381 #else
382 #define NV_MAYBE_RESERVE_PAGE(page_ptr)
383 #define NV_MAYBE_UNRESERVE_PAGE(page_ptr)
384 #endif /* defined(VM_PFNMAP) */
385 
386 #if !defined(__GFP_COMP)
387 #define __GFP_COMP 0
388 #endif
389 
390 #if !defined(DEBUG) && defined(__GFP_NOWARN)
391 #define NV_GFP_KERNEL (GFP_KERNEL | __GFP_NOWARN)
392 #define NV_GFP_ATOMIC (GFP_ATOMIC | __GFP_NOWARN)
393 #else
394 #define NV_GFP_KERNEL (GFP_KERNEL)
395 #define NV_GFP_ATOMIC (GFP_ATOMIC)
396 #endif
397 
398 #if defined(GFP_DMA32)
399 /*
400  * GFP_DMA32 is similar to GFP_DMA, but instructs the Linux zone
401  * allocator to allocate memory from the first 4GB on platforms
402  * such as Linux/x86-64; the alternative is to use an IOMMU such
403  * as the one implemented with the K8 GART, if available.
404  */
405 #define NV_GFP_DMA32 (NV_GFP_KERNEL | GFP_DMA32)
406 #else
407 #define NV_GFP_DMA32 (NV_GFP_KERNEL)
408 #endif
409 
410 typedef enum
411 {
412     NV_MEMORY_TYPE_SYSTEM,      /* Memory mapped for ROM, SBIOS and physical RAM. */
413     NV_MEMORY_TYPE_REGISTERS,
414     NV_MEMORY_TYPE_FRAMEBUFFER,
415     NV_MEMORY_TYPE_DEVICE_MMIO, /* All kinds of MMIO referred to by NVRM, e.g. BARs and MCFG of the device */
416 } nv_memory_type_t;
417 
418 #if defined(NVCPU_AARCH64) || defined(NVCPU_PPC64LE) || defined(NVCPU_RISCV64)
419 #define NV_ALLOW_WRITE_COMBINING(mt)    1
420 #elif defined(NVCPU_X86_64)
421 #if defined(NV_ENABLE_PAT_SUPPORT)
422 #define NV_ALLOW_WRITE_COMBINING(mt)    \
423     ((nv_pat_mode != NV_PAT_MODE_DISABLED) && \
424      ((mt) != NV_MEMORY_TYPE_REGISTERS))
425 #else
426 #define NV_ALLOW_WRITE_COMBINING(mt)    0
427 #endif
428 #endif
429 
430 #if !defined(IRQF_SHARED)
431 #define IRQF_SHARED SA_SHIRQ
432 #endif
433 
434 #define NV_MAX_RECURRING_WARNING_MESSAGES 10
435 
436 /* various memory tracking/debugging techniques
437  * disabled for retail builds, enabled for debug builds
438  */
439 
440 // allow an easy way to convert all debug printfs related to memory
441 // management back and forth between 'info' and 'errors'
442 #if defined(NV_DBG_MEM)
443 #define NV_DBG_MEMINFO NV_DBG_ERRORS
444 #else
445 #define NV_DBG_MEMINFO NV_DBG_INFO
446 #endif
447 
448 #define NV_MEM_TRACKING_PAD_SIZE(size) \
449     (size) = NV_ALIGN_UP((size + sizeof(void *)), sizeof(void *))
450 
451 #define NV_MEM_TRACKING_HIDE_SIZE(ptr, size)            \
452     if ((ptr != NULL) && (*(ptr) != NULL))              \
453     {                                                   \
454         NvU8 *__ptr;                                    \
455         *(unsigned long *) *(ptr) = (size);             \
456         __ptr = *(ptr); __ptr += sizeof(void *);        \
457         *(ptr) = (void *) __ptr;                        \
458     }
459 #define NV_MEM_TRACKING_RETRIEVE_SIZE(ptr, size)        \
460     {                                                   \
461         NvU8 *__ptr = (ptr); __ptr -= sizeof(void *);   \
462         (ptr) = (void *) __ptr;                         \
463         (size) = *(unsigned long *) (ptr);              \
464     }
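
/*
 * Illustrative usage (a sketch, not actual driver code): the three macros
 * above are meant to be used as bookends around an allocator.  The requested
 * size is padded, the padded size is stashed in the first word of the
 * allocation and the returned pointer is advanced past it; the free path
 * rewinds the pointer and recovers the size.  'requested_bytes' is a
 * hypothetical caller-supplied value:
 *
 *     void *ptr;
 *     NvU64 size = requested_bytes;
 *
 *     NV_MEM_TRACKING_PAD_SIZE(size);
 *     ptr = kmalloc(size, NV_GFP_KERNEL);
 *     NV_MEM_TRACKING_HIDE_SIZE(&ptr, size);
 *     ...
 *     NV_MEM_TRACKING_RETRIEVE_SIZE(ptr, size);
 *     kfree(ptr);
 */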
465 
466 /* keep track of memory usage */
467 #include "nv-memdbg.h"
468 
469 static inline void *nv_vmalloc(unsigned long size)
470 {
471 #if defined(NV_VMALLOC_HAS_PGPROT_T_ARG)
472     void *ptr = __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
473 #else
474     void *ptr = __vmalloc(size, GFP_KERNEL);
475 #endif
476     if (ptr)
477         NV_MEMDBG_ADD(ptr, size);
478     return ptr;
479 }
480 
481 static inline void nv_vfree(void *ptr, NvU64 size)
482 {
483     NV_MEMDBG_REMOVE(ptr, size);
484     vfree(ptr);
485 }
486 
487 static inline void *nv_ioremap(NvU64 phys, NvU64 size)
488 {
489 #if IS_ENABLED(CONFIG_INTEL_TDX_GUEST) && defined(NV_IOREMAP_DRIVER_HARDENED_PRESENT)
490     void *ptr = ioremap_driver_hardened(phys, size);
491 #else
492     void *ptr = ioremap(phys, size);
493 #endif
494     if (ptr)
495         NV_MEMDBG_ADD(ptr, size);
496     return ptr;
497 }
498 
499 static inline void *nv_ioremap_nocache(NvU64 phys, NvU64 size)
500 {
501     return nv_ioremap(phys, size);
502 }
503 
504 static inline void *nv_ioremap_cache(NvU64 phys, NvU64 size)
505 {
506     void *ptr = NULL;
507 #if IS_ENABLED(CONFIG_INTEL_TDX_GUEST) && defined(NV_IOREMAP_CACHE_SHARED_PRESENT)
508     ptr = ioremap_cache_shared(phys, size);
509 #elif defined(NV_IOREMAP_CACHE_PRESENT)
510     ptr = ioremap_cache(phys, size);
511 #elif defined(NVCPU_PPC64LE)
512     //
513     // ioremap_cache() has been only implemented correctly for ppc64le with
514     // commit f855b2f544d6 in April 2017 (kernel 4.12+). Internally, the kernel
515     // does provide a default implementation of ioremap_cache() that would be
516     // incorrect for our use (creating an uncached mapping) before the
517     // referenced commit, but that implementation is not exported and the
518     // NV_IOREMAP_CACHE_PRESENT conftest doesn't pick it up, and we end up in
519     // this #elif branch.
520     //
521     // At the same time, ppc64le have supported ioremap_prot() since May 2011
522     // (commit 40f1ce7fb7e8, kernel 3.0+) and that covers all kernels we
523     // support on power.
524     //
525     ptr = ioremap_prot(phys, size, pgprot_val(PAGE_KERNEL));
526 #else
527     return nv_ioremap(phys, size);
528 #endif
529 
530     if (ptr)
531         NV_MEMDBG_ADD(ptr, size);
532 
533     return ptr;
534 }
535 
536 static inline void *nv_ioremap_wc(NvU64 phys, NvU64 size)
537 {
538     void *ptr = NULL;
539 #if IS_ENABLED(CONFIG_INTEL_TDX_GUEST) && defined(NV_IOREMAP_DRIVER_HARDENED_WC_PRESENT)
540     ptr = ioremap_driver_hardened_wc(phys, size);
541 #elif defined(NV_IOREMAP_WC_PRESENT)
542     ptr = ioremap_wc(phys, size);
543 #else
544     return nv_ioremap_nocache(phys, size);
545 #endif
546 
547     if (ptr)
548         NV_MEMDBG_ADD(ptr, size);
549 
550     return ptr;
551 }
552 
553 static inline void nv_iounmap(void *ptr, NvU64 size)
554 {
555     NV_MEMDBG_REMOVE(ptr, size);
556     iounmap(ptr);
557 }
558 
559 static inline NvBool nv_numa_node_has_memory(int node_id)
560 {
561     if (node_id < 0 || node_id >= MAX_NUMNODES)
562         return NV_FALSE;
563     return node_state(node_id, N_MEMORY) ? NV_TRUE : NV_FALSE;
564 }
565 
566 #define NV_KMALLOC(ptr, size) \
567     { \
568         (ptr) = kmalloc(size, NV_GFP_KERNEL); \
569         if (ptr) \
570             NV_MEMDBG_ADD(ptr, size); \
571     }
572 
573 #define NV_KZALLOC(ptr, size) \
574     { \
575         (ptr) = kzalloc(size, NV_GFP_KERNEL); \
576         if (ptr) \
577             NV_MEMDBG_ADD(ptr, size); \
578     }
579 
580 #define NV_KMALLOC_ATOMIC(ptr, size) \
581     { \
582         (ptr) = kmalloc(size, NV_GFP_ATOMIC); \
583         if (ptr) \
584             NV_MEMDBG_ADD(ptr, size); \
585     }
586 
587 #if defined(__GFP_RETRY_MAYFAIL)
588 #define NV_GFP_NO_OOM (NV_GFP_KERNEL | __GFP_RETRY_MAYFAIL)
589 #elif defined(__GFP_NORETRY)
590 #define NV_GFP_NO_OOM (NV_GFP_KERNEL | __GFP_NORETRY)
591 #else
592 #define NV_GFP_NO_OOM (NV_GFP_KERNEL)
593 #endif
594 
595 #define NV_KMALLOC_NO_OOM(ptr, size) \
596     { \
597         (ptr) = kmalloc(size, NV_GFP_NO_OOM); \
598         if (ptr) \
599             NV_MEMDBG_ADD(ptr, size); \
600     }
601 
602 #define NV_KFREE(ptr, size) \
603     { \
604         NV_MEMDBG_REMOVE(ptr, size); \
605         kfree((void *) (ptr)); \
606     }
607 
608 #define NV_ALLOC_PAGES_NODE(ptr, nid, order, gfp_mask) \
609     { \
610         (ptr) = (unsigned long)page_address(alloc_pages_node(nid, gfp_mask, order)); \
611     }
612 
613 #define NV_GET_FREE_PAGES(ptr, order, gfp_mask)      \
614     {                                                \
615         (ptr) = __get_free_pages(gfp_mask, order);   \
616     }
617 
618 #define NV_FREE_PAGES(ptr, order)                    \
619     {                                                \
620         free_pages(ptr, order);                      \
621     }
622 
623 static inline pgprot_t nv_sme_clr(pgprot_t prot)
624 {
625 #if defined(__sme_clr)
626     return __pgprot(__sme_clr(pgprot_val(prot)));
627 #else
628     return prot;
629 #endif // __sme_clr
630 }
631 
632 static inline pgprot_t nv_adjust_pgprot(pgprot_t vm_prot, NvU32 extra)
633 {
634     pgprot_t prot = __pgprot(pgprot_val(vm_prot) | extra);
635 
636 #if defined(pgprot_decrypted)
637     return pgprot_decrypted(prot);
638 #else
639     return nv_sme_clr(prot);
640 #endif // pgprot_decrypted
641 }
642 
643 #if defined(PAGE_KERNEL_NOENC)
644 #if defined(__pgprot_mask)
645 #define NV_PAGE_KERNEL_NOCACHE_NOENC __pgprot_mask(__PAGE_KERNEL_NOCACHE)
646 #elif defined(default_pgprot)
647 #define NV_PAGE_KERNEL_NOCACHE_NOENC default_pgprot(__PAGE_KERNEL_NOCACHE)
648 #elif defined(__pgprot)
649 #define NV_PAGE_KERNEL_NOCACHE_NOENC __pgprot(__PAGE_KERNEL_NOCACHE)
650 #else
651 #error "Unsupported kernel!!!"
652 #endif
653 #endif
654 
655 static inline NvUPtr nv_vmap(struct page **pages, NvU32 page_count,
656                              NvBool cached, NvBool unencrypted)
657 {
658     void *ptr;
659     pgprot_t prot = PAGE_KERNEL;
660 #if defined(NVCPU_X86_64)
661 #if defined(PAGE_KERNEL_NOENC)
662     if (unencrypted)
663     {
664         prot = cached ? nv_adjust_pgprot(PAGE_KERNEL_NOENC, 0) :
665                         nv_adjust_pgprot(NV_PAGE_KERNEL_NOCACHE_NOENC, 0);
666     }
667     else
668 #endif
669     {
670         prot = cached ? PAGE_KERNEL : PAGE_KERNEL_NOCACHE;
671     }
672 #elif defined(NVCPU_AARCH64)
673     prot = cached ? PAGE_KERNEL : NV_PGPROT_UNCACHED(PAGE_KERNEL);
674 #endif
675     /* All memory is cached on PPC64LE; the 'cached' input cannot be honored. */
676     ptr = vmap(pages, page_count, VM_MAP, prot);
677     if (ptr)
678         NV_MEMDBG_ADD(ptr, page_count * PAGE_SIZE);
679     return (NvUPtr)ptr;
680 }
681 
682 static inline void nv_vunmap(NvUPtr vaddr, NvU32 page_count)
683 {
684     vunmap((void *)vaddr);
685     NV_MEMDBG_REMOVE((void *)vaddr, page_count * PAGE_SIZE);
686 }
687 
688 #if defined(NV_GET_NUM_PHYSPAGES_PRESENT)
689 #define NV_NUM_PHYSPAGES                get_num_physpages()
690 #else
691 #define NV_NUM_PHYSPAGES                num_physpages
692 #endif
693 #define NV_GET_CURRENT_PROCESS()        current->tgid
694 #define NV_IN_ATOMIC()                  in_atomic()
695 #define NV_LOCAL_BH_DISABLE()           local_bh_disable()
696 #define NV_LOCAL_BH_ENABLE()            local_bh_enable()
697 #define NV_COPY_TO_USER(to, from, n)    copy_to_user(to, from, n)
698 #define NV_COPY_FROM_USER(to, from, n)  copy_from_user(to, from, n)
699 
700 #define NV_IS_SUSER()                   capable(CAP_SYS_ADMIN)
701 #define NV_PCI_DEVICE_NAME(pci_dev)     ((pci_dev)->pretty_name)
702 #define NV_CLI()                        local_irq_disable()
703 #define NV_SAVE_FLAGS(eflags)           local_save_flags(eflags)
704 #define NV_RESTORE_FLAGS(eflags)        local_irq_restore(eflags)
705 #define NV_MAY_SLEEP()                  (!irqs_disabled() && !in_interrupt() && !NV_IN_ATOMIC())
706 #define NV_MODULE_PARAMETER(x)          module_param(x, int, 0)
707 #define NV_MODULE_STRING_PARAMETER(x)   module_param(x, charp, 0)
708 #undef  MODULE_PARM
709 
710 #define NV_NUM_CPUS()                   num_possible_cpus()
711 
712 static inline dma_addr_t nv_phys_to_dma(struct device *dev, NvU64 pa)
713 {
714 #if defined(NV_PHYS_TO_DMA_PRESENT)
715     return phys_to_dma(dev, pa);
716 #elif defined(NV_XEN_SUPPORT_FULLY_VIRTUALIZED_KERNEL)
717     return phys_to_machine(pa);
718 #else
719     return (dma_addr_t)pa;
720 #endif
721 }
722 
723 #define NV_GET_PAGE_STRUCT(phys_page) virt_to_page(__va(phys_page))
724 #define NV_VMA_PGOFF(vma)             ((vma)->vm_pgoff)
725 #define NV_VMA_SIZE(vma)              ((vma)->vm_end - (vma)->vm_start)
726 #define NV_VMA_OFFSET(vma)            (((NvU64)(vma)->vm_pgoff) << PAGE_SHIFT)
727 #define NV_VMA_PRIVATE(vma)           ((vma)->vm_private_data)
728 #define NV_VMA_FILE(vma)              ((vma)->vm_file)
729 
730 #define NV_DEVICE_MINOR_NUMBER(x)     minor((x)->i_rdev)
731 
732 #define NV_PCI_DISABLE_DEVICE(pci_dev)                           \
733     {                                                            \
734         NvU16 __cmd[2];                                          \
735         pci_read_config_word((pci_dev), PCI_COMMAND, &__cmd[0]); \
736         pci_disable_device(pci_dev);                             \
737         pci_read_config_word((pci_dev), PCI_COMMAND, &__cmd[1]); \
738         __cmd[1] |= PCI_COMMAND_MEMORY;                          \
739         pci_write_config_word((pci_dev), PCI_COMMAND,            \
740                 (__cmd[1] | (__cmd[0] & PCI_COMMAND_IO)));       \
741     }
742 
743 #define NV_PCI_RESOURCE_START(pci_dev, bar) pci_resource_start(pci_dev, (bar))
744 #define NV_PCI_RESOURCE_SIZE(pci_dev, bar)  pci_resource_len(pci_dev, (bar))
745 #define NV_PCI_RESOURCE_FLAGS(pci_dev, bar) pci_resource_flags(pci_dev, (bar))
746 
747 #define NV_PCI_RESOURCE_VALID(pci_dev, bar)                                     \
748     ((NV_PCI_RESOURCE_START(pci_dev, bar) != 0) &&                              \
749      (NV_PCI_RESOURCE_SIZE(pci_dev, bar) != 0))
750 
751 #define NV_PCI_DOMAIN_NUMBER(pci_dev) (NvU32)pci_domain_nr(pci_dev->bus)
752 #define NV_PCI_BUS_NUMBER(pci_dev)    (pci_dev)->bus->number
753 #define NV_PCI_DEVFN(pci_dev)         (pci_dev)->devfn
754 #define NV_PCI_SLOT_NUMBER(pci_dev)   PCI_SLOT(NV_PCI_DEVFN(pci_dev))
755 
756 #if defined(CONFIG_X86_UV) && defined(NV_CONFIG_X86_UV)
757 #define NV_GET_DOMAIN_BUS_AND_SLOT(domain,bus,devfn)                        \
758    ({                                                                       \
759         struct pci_dev *__dev = NULL;                                       \
760         while ((__dev = pci_get_device(PCI_VENDOR_ID_NVIDIA,                \
761                     PCI_ANY_ID, __dev)) != NULL)                            \
762         {                                                                   \
763             if ((NV_PCI_DOMAIN_NUMBER(__dev) == domain) &&                  \
764                 (NV_PCI_BUS_NUMBER(__dev) == bus) &&                        \
765                 (NV_PCI_DEVFN(__dev) == devfn))                             \
766             {                                                               \
767                 break;                                                      \
768             }                                                               \
769         }                                                                   \
770         if (__dev == NULL)                                                  \
771         {                                                                   \
772             while ((__dev = pci_get_class((PCI_CLASS_BRIDGE_HOST << 8),     \
773                         __dev)) != NULL)                                    \
774             {                                                               \
775                 if ((NV_PCI_DOMAIN_NUMBER(__dev) == domain) &&              \
776                     (NV_PCI_BUS_NUMBER(__dev) == bus) &&                    \
777                     (NV_PCI_DEVFN(__dev) == devfn))                         \
778                 {                                                           \
779                     break;                                                  \
780                 }                                                           \
781             }                                                               \
782         }                                                                   \
783         if (__dev == NULL)                                                  \
784         {                                                                   \
785             while ((__dev = pci_get_class((PCI_CLASS_BRIDGE_PCI << 8),      \
786                         __dev)) != NULL)                                    \
787             {                                                               \
788                 if ((NV_PCI_DOMAIN_NUMBER(__dev) == domain) &&              \
789                     (NV_PCI_BUS_NUMBER(__dev) == bus) &&                    \
790                     (NV_PCI_DEVFN(__dev) == devfn))                         \
791                 {                                                           \
792                     break;                                                  \
793                 }                                                           \
794             }                                                               \
795         }                                                                   \
796         if (__dev == NULL)                                                  \
797         {                                                                   \
798             while ((__dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID,          \
799                             __dev)) != NULL)                                \
800             {                                                               \
801                 if ((NV_PCI_DOMAIN_NUMBER(__dev) == domain) &&              \
802                     (NV_PCI_BUS_NUMBER(__dev) == bus) &&                    \
803                     (NV_PCI_DEVFN(__dev) == devfn))                         \
804                 {                                                           \
805                     break;                                                  \
806                 }                                                           \
807             }                                                               \
808         }                                                                   \
809         __dev;                                                              \
810     })
811 #elif defined(NV_PCI_GET_DOMAIN_BUS_AND_SLOT_PRESENT)
812 #define NV_GET_DOMAIN_BUS_AND_SLOT(domain,bus, devfn) \
813     pci_get_domain_bus_and_slot(domain, bus, devfn)
814 #else
815 #define NV_GET_DOMAIN_BUS_AND_SLOT(domain,bus,devfn)               \
816    ({                                                              \
817         struct pci_dev *__dev = NULL;                              \
818         while ((__dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID,     \
819                     __dev)) != NULL)                               \
820         {                                                          \
821             if ((NV_PCI_DOMAIN_NUMBER(__dev) == domain) &&         \
822                 (NV_PCI_BUS_NUMBER(__dev) == bus) &&               \
823                 (NV_PCI_DEVFN(__dev) == devfn))                    \
824             {                                                      \
825                 break;                                             \
826             }                                                      \
827         }                                                          \
828         __dev;                                                     \
829     })
830 #endif
831 
832 #if defined(NV_PCI_STOP_AND_REMOVE_BUS_DEVICE_PRESENT)  // introduced in 3.18-rc1 for aarch64
833 #define NV_PCI_STOP_AND_REMOVE_BUS_DEVICE(pci_dev) pci_stop_and_remove_bus_device(pci_dev)
834 #endif
835 
836 #define NV_PRINT_AT(nv_debug_level,at)                                           \
837     {                                                                            \
838         nv_printf(nv_debug_level,                                                \
839             "NVRM: VM: %s:%d: 0x%p, %d page(s), count = %d, flags = 0x%08x, "    \
840             "page_table = 0x%p\n",  __FUNCTION__, __LINE__, at,                  \
841             at->num_pages, NV_ATOMIC_READ(at->usage_count),                      \
842             at->flags, at->page_table);                                          \
843     }
844 
845 #define NV_PRINT_VMA(nv_debug_level,vma)                                                 \
846     {                                                                                    \
847         nv_printf(nv_debug_level,                                                        \
848             "NVRM: VM: %s:%d: 0x%lx - 0x%lx, 0x%08x bytes @ 0x%016llx, 0x%p, 0x%p\n",    \
849             __FUNCTION__, __LINE__, vma->vm_start, vma->vm_end, NV_VMA_SIZE(vma),        \
850             NV_VMA_OFFSET(vma), NV_VMA_PRIVATE(vma), NV_VMA_FILE(vma));                  \
851     }
852 
853 #ifndef minor
854 # define minor(x) MINOR(x)
855 #endif
856 
857 #if defined(cpu_relax)
858 #define NV_CPU_RELAX() cpu_relax()
859 #else
860 #define NV_CPU_RELAX() barrier()
861 #endif
862 
863 #ifndef IRQ_RETVAL
864 typedef void irqreturn_t;
865 #define IRQ_RETVAL(a)
866 #endif
867 
868 #if !defined(PCI_COMMAND_SERR)
869 #define PCI_COMMAND_SERR            0x100
870 #endif
871 #if !defined(PCI_COMMAND_INTX_DISABLE)
872 #define PCI_COMMAND_INTX_DISABLE    0x400
873 #endif
874 
875 #ifndef PCI_CAP_ID_EXP
876 #define PCI_CAP_ID_EXP 0x10
877 #endif
878 
879 /*
880  * On Linux on PPC64LE enable basic support for Linux PCI error recovery (see
881  * Documentation/PCI/pci-error-recovery.txt). Currently RM only supports error
882  * notification and data collection, not actual recovery of the device.
883  */
884 #if defined(NVCPU_PPC64LE) && defined(CONFIG_EEH)
885 #include <asm/eeh.h>
886 #define NV_PCI_ERROR_RECOVERY
887 #endif
888 
889 /*
890  * If the host OS has page sizes larger than 4KB, we may have a security
891  * problem. Registers are typically grouped in 4KB pages, but if there are
892  * larger pages, then the smallest userspace mapping possible (e.g., a page)
893  * may give more access than intended to the user.
894  */
895 #define NV_4K_PAGE_ISOLATION_REQUIRED(addr, size)                       \
896     ((PAGE_SIZE > NV_RM_PAGE_SIZE) &&                                   \
897      ((size) <= NV_RM_PAGE_SIZE) &&                                     \
898      (((addr) >> NV_RM_PAGE_SHIFT) ==                                   \
899         (((addr) + (size) - 1) >> NV_RM_PAGE_SHIFT)))
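
/*
 * Worked example (illustrative, assuming 64K kernel pages and 4K RM pages):
 * for addr = 0x7000 and size = 0x1000, both the first and last byte fall in
 * RM page 7 (0x7000 >> 12 == 0x7FFF >> 12), so the macro evaluates true and
 * the mapping must be confined to that single 4K page instead of exposing
 * the whole 64K OS page to userspace.
 */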
900 
901 /*
902  * The kernel may have a workaround for this, by providing a method to isolate
903  * a single 4K page in a given mapping.
904  */
905 #if (PAGE_SIZE > NV_RM_PAGE_SIZE) && defined(NVCPU_PPC64LE) && defined(NV_PAGE_4K_PFN)
906     #define NV_4K_PAGE_ISOLATION_PRESENT
907     #define NV_4K_PAGE_ISOLATION_MMAP_ADDR(addr)                        \
908         ((NvP64)((void*)(((addr) >> NV_RM_PAGE_SHIFT) << PAGE_SHIFT)))
909     #define NV_4K_PAGE_ISOLATION_MMAP_LEN(size)     PAGE_SIZE
910     #define NV_4K_PAGE_ISOLATION_ACCESS_START(addr)                     \
911         ((NvP64)((void*)((addr) & ~NV_RM_PAGE_MASK)))
912     #define NV_4K_PAGE_ISOLATION_ACCESS_LEN(addr, size)                 \
913         ((((addr) & NV_RM_PAGE_MASK) + size + NV_RM_PAGE_MASK) &        \
914          ~NV_RM_PAGE_MASK)
915     #define NV_PROT_4K_PAGE_ISOLATION NV_PAGE_4K_PFN
916 #endif
917 
918 static inline int nv_remap_page_range(struct vm_area_struct *vma,
919     unsigned long virt_addr, NvU64 phys_addr, NvU64 size, pgprot_t prot)
920 {
921     int ret = -1;
922 
923 #if defined(NV_4K_PAGE_ISOLATION_PRESENT) && defined(NV_PROT_4K_PAGE_ISOLATION)
924     if ((size == PAGE_SIZE) &&
925         ((pgprot_val(prot) & NV_PROT_4K_PAGE_ISOLATION) != 0))
926     {
927         /*
928          * remap_4k_pfn() hardcodes the length to a single OS page, and checks
929          * whether applying the page isolation workaround will cause PTE
930          * corruption (in which case it will fail, and this is an unsupported
931          * configuration).
932          */
933 #if defined(NV_HASH__REMAP_4K_PFN_PRESENT)
934         ret = hash__remap_4k_pfn(vma, virt_addr, (phys_addr >> PAGE_SHIFT), prot);
935 #else
936         ret = remap_4k_pfn(vma, virt_addr, (phys_addr >> PAGE_SHIFT), prot);
937 #endif
938     }
939     else
940 #endif
941     {
942         ret = remap_pfn_range(vma, virt_addr, (phys_addr >> PAGE_SHIFT), size,
943             prot);
944     }
945 
946     return ret;
947 }
948 
949 static inline int nv_io_remap_page_range(struct vm_area_struct *vma,
950     NvU64 phys_addr, NvU64 size, NvU32 extra_prot)
951 {
952     int ret = -1;
953 #if !defined(NV_XEN_SUPPORT_FULLY_VIRTUALIZED_KERNEL)
954     ret = nv_remap_page_range(vma, vma->vm_start, phys_addr, size,
955         nv_adjust_pgprot(vma->vm_page_prot, extra_prot));
956 #else
957     ret = io_remap_pfn_range(vma, vma->vm_start, (phys_addr >> PAGE_SHIFT),
958         size, nv_adjust_pgprot(vma->vm_page_prot, extra_prot));
959 #endif
960     return ret;
961 }
962 
963 static inline vm_fault_t nv_insert_pfn(struct vm_area_struct *vma,
964     NvU64 virt_addr, NvU64 pfn, NvU32 extra_prot)
965 {
966     /*
967      * vm_insert_pfn{,_prot} replaced with vmf_insert_pfn{,_prot} in Linux 4.20
968      */
969 #if defined(NV_VMF_INSERT_PFN_PROT_PRESENT)
970     return vmf_insert_pfn_prot(vma, virt_addr, pfn,
971              __pgprot(pgprot_val(vma->vm_page_prot) | extra_prot));
972 #else
973     int ret = -EINVAL;
974     /*
975      * Only PPC64LE (NV_4K_PAGE_ISOLATION_PRESENT) requires extra_prot to be
976      * used when remapping.
977      *
978      * vm_insert_pfn_prot() was added in Linux 4.4, whereas POWER9 support
979      * was added in Linux 4.8.
980      *
981      * Rather than tampering with the vma to make use of extra_prot with
982      * vm_insert_pfn() on older kernels, for now, just fail in this case, as
983      * it's not expected to be used currently.
984      */
985 #if defined(NV_VM_INSERT_PFN_PROT_PRESENT)
986     ret = vm_insert_pfn_prot(vma, virt_addr, pfn,
987         __pgprot(pgprot_val(vma->vm_page_prot) | extra_prot));
988 #elif !defined(NV_4K_PAGE_ISOLATION_PRESENT)
989     ret = vm_insert_pfn(vma, virt_addr, pfn);
990 #endif
991     switch (ret)
992     {
993         case 0:
994         case -EBUSY:
995             /*
996              * EBUSY indicates that another thread already handled
997              * the faulted range.
998              */
999             return VM_FAULT_NOPAGE;
1000         case -ENOMEM:
1001             return VM_FAULT_OOM;
1002         default:
1003             break;
1004     }
1005 #endif /* defined(NV_VMF_INSERT_PFN_PROT_PRESENT) */
1006     return VM_FAULT_SIGBUS;
1007 }
1008 
1009 /* Converts BAR index to Linux specific PCI BAR index */
1010 static inline NvU8 nv_bar_index_to_os_bar_index
1011 (
1012     struct pci_dev *dev,
1013     NvU8 nv_bar_index
1014 )
1015 {
1016     NvU8 bar_index = 0;
1017     NvU8 i;
1018 
1019     BUG_ON(nv_bar_index >= NV_GPU_NUM_BARS);
1020 
1021     for (i = 0; i < nv_bar_index; i++)
1022     {
1023         if (NV_PCI_RESOURCE_FLAGS(dev, bar_index) & PCI_BASE_ADDRESS_MEM_TYPE_64)
1024         {
1025             bar_index += 2;
1026         }
1027         else
1028         {
1029             bar_index++;
1030         }
1031     }
1032 
1033     return bar_index;
1034 }
1035 
1036 #define NV_PAGE_MASK    (NvU64)(long)PAGE_MASK
1037 
1038 extern void *nvidia_stack_t_cache;
1039 
1040 /*
1041  * On Linux, when a kmem cache is created, a new sysfs entry is created for
1042  * it unless it is merged with an existing cache. Upstream Linux kernel commit
1043  * 3b7b314053d021601940c50b07f5f1423ae67e21 (version 4.12+) made cache
1044  * destruction asynchronous which creates a race between cache destroy and
1045  * create. A new cache created with attributes as a previous cache, which is
1046  * scheduled for destruction, can try to create a sysfs entry with the same
1047  * conflicting name. Upstream Linux kernel commit
1048  * d50d82faa0c964e31f7a946ba8aba7c715ca7ab0 (4.18) fixes this issue by cleaning
1049  * up sysfs entry within slab_mutex, so the entry is deleted before a cache with
1050  * the same attributes could be created.
1051  *
1052  * To workaround this kernel issue, we take two steps:
1053  * - Create unmergeable caches: a kmem_cache with a constructor is unmergeable.
1054  *   So, we define an empty constructor for it. Creating an unmergeable
1055  *   cache ensures that the kernel doesn't generate an internal name and always
1056  *   uses our name instead.
1057  *
1058  * - Generate a unique cache name by appending the current timestamp (ns). We
1059  *   wait for the timestamp to increment by at least one to ensure that we do
1060  *   not hit a name conflict in cache create -> destroy (async) -> create cycle.
1061  */
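
/*
 * A minimal sketch of the naming scheme described above (illustrative only;
 * the real nv_kmem_cache_create() is implemented elsewhere in the driver):
 * append a nanosecond timestamp to the caller's name and spin until the
 * clock advances, so a destroy (async) -> create cycle can never reuse a
 * name:
 *
 *     u64 ts = ktime_get_ns();
 *     while (ktime_get_ns() == ts)
 *         ;
 *     snprintf(unique_name, sizeof(unique_name), "%s-%llu", name, ts);
 *     cache = kmem_cache_create(unique_name, size, 0, 0, nv_kmem_ctor_dummy);
 */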
1062 #if defined(NV_KMEM_CACHE_HAS_KOBJ_REMOVE_WORK) && !defined(NV_SYSFS_SLAB_UNLINK_PRESENT)
1063 static inline void nv_kmem_ctor_dummy(void *arg)
1064 {
1065     (void)arg;
1066 }
1067 #else
1068 #define nv_kmem_ctor_dummy NULL
1069 #endif
1070 
1071 #define NV_KMEM_CACHE_CREATE(name, type)    \
1072     nv_kmem_cache_create(name, sizeof(type), 0)
1073 
1074 /* The NULL pointer check is required for kernels older than 4.3 */
1075 #define NV_KMEM_CACHE_DESTROY(kmem_cache)   \
1076     if (kmem_cache != NULL)                 \
1077     {                                       \
1078         kmem_cache_destroy(kmem_cache);     \
1079     }
1080 
1081 #define NV_KMEM_CACHE_ALLOC(kmem_cache)     \
1082     kmem_cache_alloc(kmem_cache, GFP_KERNEL)
1083 #define NV_KMEM_CACHE_FREE(ptr, kmem_cache) \
1084     kmem_cache_free(kmem_cache, ptr)
1085 
1086 static inline void *nv_kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
1087 {
1088 #if defined(NV_KMEM_CACHE_HAS_KOBJ_REMOVE_WORK) && !defined(NV_SYSFS_SLAB_UNLINK_PRESENT)
1089     /*
1090      * We cannot call kmem_cache_zalloc directly as it adds the __GFP_ZERO
1091      * flag. This flag together with the presence of a slab constructor is
1092      * flagged as a potential bug by the Linux kernel since it is the role
1093      * of a constructor to fill an allocated object with the desired
1094      * pattern. In our case, we specified a (dummy) constructor as a
1095      * workaround for a bug and not to zero-initialize objects. So, we take
1096      * the pain here to memset allocated object ourselves.
1097      */
1098     void *object = kmem_cache_alloc(k, flags);
1099     if (object)
1100         memset(object, 0, kmem_cache_size(k));
1101     return object;
1102 #else
1103     return kmem_cache_zalloc(k, flags);
1104 #endif
1105 }
1106 
1107 static inline int nv_kmem_cache_alloc_stack(nvidia_stack_t **stack)
1108 {
1109     nvidia_stack_t *sp = NULL;
1110 #if defined(NVCPU_X86_64)
1111     if (rm_is_altstack_in_use())
1112     {
1113         sp = NV_KMEM_CACHE_ALLOC(nvidia_stack_t_cache);
1114         if (sp == NULL)
1115             return -ENOMEM;
1116         sp->size = sizeof(sp->stack);
1117         sp->top = sp->stack + sp->size;
1118     }
1119 #endif
1120     *stack = sp;
1121     return 0;
1122 }
1123 
1124 static inline void nv_kmem_cache_free_stack(nvidia_stack_t *stack)
1125 {
1126 #if defined(NVCPU_X86_64)
1127     if (stack != NULL && rm_is_altstack_in_use())
1128     {
1129         NV_KMEM_CACHE_FREE(stack, nvidia_stack_t_cache);
1130     }
1131 #endif
1132 }
1133 
1134 #if defined(NVCPU_X86_64)
1135 /*
1136  * RAM is cached on Linux by default, so we can assume there's
1137  * nothing to be done here. This is not the case for the
1138  * other memory spaces: we will have made an attempt to add
1139  * a WC MTRR for the frame buffer.
1140  *
1141  * If a WC MTRR is present, we can't satisfy a WB mapping
1142  * attempt here, since the achievable effective memory
1143  * types in that case are WC and UC; if not, the effective
1144  * type is typically UC (MTRRdefType is UC). We could only
1145  * satisfy WB mapping requests with a WB MTRR.
1146  */
1147 #define NV_ALLOW_CACHING(mt)            ((mt) == NV_MEMORY_TYPE_SYSTEM)
1148 #else
1149 #define NV_ALLOW_CACHING(mt)            ((mt) != NV_MEMORY_TYPE_REGISTERS)
1150 #endif
1151 
1152 typedef struct nvidia_pte_s {
1153     NvU64           phys_addr;
1154     unsigned long   virt_addr;
1155     NvU64           dma_addr;
1156 #ifdef CONFIG_XEN
1157     unsigned int    guest_pfn;
1158 #endif
1159     unsigned int    page_count;
1160 } nvidia_pte_t;
1161 
1162 typedef struct nv_alloc_s {
1163     struct nv_alloc_s *next;
1164     struct device     *dev;
1165     atomic_t       usage_count;
1166     struct {
1167         NvBool contig      : 1;
1168         NvBool guest       : 1;
1169         NvBool zeroed      : 1;
1170         NvBool aliased     : 1;
1171         NvBool user        : 1;
1172         NvBool node        : 1;
1173         NvBool peer_io     : 1;
1174         NvBool physical    : 1;
1175         NvBool unencrypted : 1;
1176         NvBool coherent    : 1;
1177     } flags;
1178     unsigned int   cache_type;
1179     unsigned int   num_pages;
1180     unsigned int   order;
1181     unsigned int   size;
1182     nvidia_pte_t **page_table;          /* list of physical pages allocated */
1183     unsigned int   pid;
1184     struct page  **user_pages;
1185     NvU64         guest_id;             /* id of guest VM */
1186     NvS32         node_id;              /* Node id for memory allocation when node is set in flags */
1187     void          *import_priv;
1188     struct sg_table *import_sgt;
1189 } nv_alloc_t;
1190 
1191 /**
1192  * nv_is_dma_direct - return true if dma_direct is in use
1193  *
1194  * Starting with the 5.0 kernel, SWIOTLB is merged into
1195  * dma_direct, so systems without an IOMMU use dma_direct.  We
1196  * need to know if this is the case, so that we can use a
1197  * different check for SWIOTLB enablement.
1198  */
1199 static inline NvBool nv_is_dma_direct(struct device *dev)
1200 {
1201     NvBool is_direct = NV_FALSE;
1202 
1203 #if defined(NV_DMA_IS_DIRECT_PRESENT)
1204     if (dma_is_direct(get_dma_ops(dev)))
1205         is_direct = NV_TRUE;
1206 #endif
1207 
1208     return is_direct;
1209 }
1210 
1211 /**
1212  * nv_dma_maps_swiotlb - return NV_TRUE if swiotlb is enabled
1213  *
1214  * SWIOTLB creates bounce buffers for the DMA mapping layer to
1215  * use if a driver asks the kernel to map a DMA buffer that is
1216  * outside of the device's addressable range.  The driver does
1217  * not function correctly if bounce buffers are enabled for the
1218  * device.  So if SWIOTLB is enabled, we should avoid making
1219  * mapping calls.
1220  */
1221 static inline NvBool
1222 nv_dma_maps_swiotlb(struct device *dev)
1223 {
1224     NvBool swiotlb_in_use = NV_FALSE;
1225 #if defined(CONFIG_SWIOTLB)
1226   #if defined(NV_DMA_OPS_PRESENT) || defined(NV_GET_DMA_OPS_PRESENT) || \
1227       defined(NV_SWIOTLB_DMA_OPS_PRESENT)
1228     /*
1229      * We only use the 'dma_ops' symbol on older x86_64 kernels; later kernels,
1230      * including those for other architectures, have converged on the
1231      * get_dma_ops() interface.
1232      */
1233     #if defined(NV_GET_DMA_OPS_PRESENT)
1234     /*
1235      * The __attribute__ ((unused)) is necessary because in at least one
1236      * case, *none* of the preprocessor branches below are taken, and
1237      * so the ops variable ends up never being referred to at all. This can
1238      * happen with the (NV_IS_EXPORT_SYMBOL_PRESENT_swiotlb_map_sg_attrs == 1)
1239      * case.
1240      */
1241     const struct dma_map_ops *ops __attribute__ ((unused)) = get_dma_ops(dev);
1242     #else
1243     const struct dma_mapping_ops *ops __attribute__ ((unused)) = dma_ops;
1244     #endif
1245 
1246     /*
1247      * The switch from dma_mapping_ops -> dma_map_ops coincided with the
1248      * switch from swiotlb_map_sg -> swiotlb_map_sg_attrs.
1249      */
1250       #if defined(NVCPU_AARCH64) && \
1251           defined(NV_NONCOHERENT_SWIOTLB_DMA_OPS_PRESENT)
1252     /* AArch64 exports these symbols directly */
1253     swiotlb_in_use = ((ops == &noncoherent_swiotlb_dma_ops) ||
1254                       (ops == &coherent_swiotlb_dma_ops));
1255       #elif NV_IS_EXPORT_SYMBOL_PRESENT_swiotlb_map_sg_attrs != 0
1256     swiotlb_in_use = (ops->map_sg == swiotlb_map_sg_attrs);
1257       #elif NV_IS_EXPORT_SYMBOL_PRESENT_swiotlb_dma_ops != 0
1258     swiotlb_in_use = (ops == &swiotlb_dma_ops);
1259       #endif
1260       /*
1261        * The "else" case that is not shown
1262        * (for NV_IS_EXPORT_SYMBOL_PRESENT_swiotlb_map_sg_attrs == 0 ||
1263        * NV_IS_EXPORT_SYMBOL_PRESENT_swiotlb_dma_ops == 0) does
1264        * nothing, and ends up dropping us out to the last line of this function,
1265        * effectively returning false. The nearly-human-readable version of that
1266        * case is "struct swiotlb_dma_ops is present (NV_SWIOTLB_DMA_OPS_PRESENT
1267        * is defined) but neither swiotlb_map_sg_attrs nor swiotlb_dma_ops is
1268        * present".
1269        *
1270        * That can happen on kernels that fall within the range below:
1271        *
1272        * 2017-12-24  4bd89ed39b2ab8dc4ac4b6c59b07d420b0213bec
1273        *     ("swiotlb: remove various exports")
1274        * 2018-06-28  210d0797c97d0e8f3b1a932a0dc143f4c57008a3
1275        *     ("swiotlb: export swiotlb_dma_ops")
1276        *
1277        * Related to this: between the above two commits, this driver has no
1278        * way of detecting whether or not the SWIOTLB is in use. Furthermore,
1279        * the driver cannot support DMA remapping. That leads to the following
1280        * point: "swiotlb=force" is not supported for kernels falling in the
1281        * above range.
1282        *
1283        * The other "else" case that is not shown:
1284        * Starting with the 5.0 kernel, swiotlb is integrated into dma_direct,
1285        * which is used when there's no IOMMU.  In these kernels, ops == NULL,
1286        * swiotlb_dma_ops no longer exists, and we do not support swiotlb=force
1287        * (doing so would require detecting when swiotlb=force is enabled and
1288        * then returning NV_TRUE even when dma_direct is in use).  So for now,
1289        * we just return NV_FALSE and in nv_compute_gfp_mask() we check for
1290        * whether swiotlb could possibly be used (outside of swiotlb=force).
1291        */
1292   #endif
1293 
1294     /*
1295      * Commit 2017-11-07 d7b417fa08d ("x86/mm: Add DMA support for
1296      * SEV memory encryption") forces SWIOTLB to be enabled when AMD SEV
1297      * is active in all cases.
1298      */
1299     if (os_cc_enabled)
1300         swiotlb_in_use = NV_TRUE;
1301 #endif
1302 
1303     return swiotlb_in_use;
1304 }
1305 
1306 /*
1307  * TODO: Bug 1522381 will allow us to move these mapping relationships into
1308  *       common code.
1309  */
1310 
1311 /*
1312  * Bug 1606851: the Linux kernel scatterlist code doesn't work for regions
1313  * greater than or equal to 4GB, due to regular use of unsigned int
1314  * throughout. So we need to split our mappings into 4GB-minus-1-page-or-less
1315  * chunks and manage them separately.
1316  */
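
/*
 * For illustration (a sketch, not driver code): a discontiguous mapping of
 * 'page_count' pages is split into
 *
 *     submap_count = (page_count + NV_DMA_SUBMAP_MAX_PAGES - 1) /
 *                    NV_DMA_SUBMAP_MAX_PAGES;
 *
 * submaps, each with its own sg_table, so every submap stays below 4GB and
 * within the unsigned int limits of the scatterlist code.
 */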
1317 typedef struct nv_dma_submap_s {
1318     NvU32 page_count;
1319     NvU32 sg_map_count;
1320     struct sg_table sgt;
1321     NvBool imported;
1322 } nv_dma_submap_t;
1323 
1324 typedef struct nv_dma_map_s {
1325     struct page **pages;
1326     NvU64 page_count;
1327     NvBool contiguous;
1328     NvU32 cache_type;
1329     struct sg_table *import_sgt;
1330 
1331     union
1332     {
1333         struct
1334         {
1335             NvU32 submap_count;
1336             nv_dma_submap_t *submaps;
1337         } discontig;
1338 
1339         struct
1340         {
1341             NvU64 dma_addr;
1342         } contig;
1343     } mapping;
1344 
1345     struct device *dev;
1346 } nv_dma_map_t;
1347 
1348 #define NV_FOR_EACH_DMA_SUBMAP(dm, sm, i)                                     \
1349     for (i = 0, sm = &dm->mapping.discontig.submaps[0];                       \
1350          i < dm->mapping.discontig.submap_count;                              \
1351          i++, sm = &dm->mapping.discontig.submaps[i])
1352 
1353 /*
1354  * On 4K-page ARM kernels, make the maximum submap size a multiple of 64K to keep nv-p2p happy.
1355  * Despite 4K OS pages, we still use 64K P2P pages because dependent modules still use 64K.
1356  * Instead of (4G-4K), use (4G-64K) as the maximum submap size, since the mapped IOVA range
1357  * must be aligned to a 64K boundary.
1358  */
1359 #if defined(CONFIG_ARM64_4K_PAGES)
1360 #define NV_DMA_U32_MAX_4K_PAGES           ((NvU32)((NV_U32_MAX >> PAGE_SHIFT) + 1))
1361 #define NV_DMA_SUBMAP_MAX_PAGES           ((NvU32)(NV_DMA_U32_MAX_4K_PAGES - 16))
1362 #else
1363 #define NV_DMA_SUBMAP_MAX_PAGES           ((NvU32)(NV_U32_MAX >> PAGE_SHIFT))
1364 #endif
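
/*
 * Arithmetic check (illustrative): with 4K pages, NV_U32_MAX >> PAGE_SHIFT
 * is 0xFFFFF, so NV_DMA_U32_MAX_4K_PAGES is 0x100000 pages (exactly 4GB) and
 * NV_DMA_SUBMAP_MAX_PAGES is 16 pages less, i.e. 4GB - 64KB, which satisfies
 * the 64KB alignment requirement above.  On other configurations the limit
 * is simply (NV_U32_MAX >> PAGE_SHIFT) pages, i.e. 4GB minus one page.
 */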
1365 
1366 #define NV_DMA_SUBMAP_IDX_TO_PAGE_IDX(s)  (s * NV_DMA_SUBMAP_MAX_PAGES)
1367 
1368 /*
1369  * DO NOT use sg_alloc_table_from_pages on Xen Server, even if it's available.
1370  * This will glom multiple pages into a single sg element, which
1371  * xen_swiotlb_map_sg_attrs may try to route to the SWIOTLB. We must only use
1372  * single-page sg elements on Xen Server.
1373  */
1374 #if !defined(NV_DOM0_KERNEL_PRESENT)
1375     #define NV_ALLOC_DMA_SUBMAP_SCATTERLIST(dm, sm, i)                        \
1376         ((sg_alloc_table_from_pages(&sm->sgt,                                 \
1377             &dm->pages[NV_DMA_SUBMAP_IDX_TO_PAGE_IDX(i)],                     \
1378             sm->page_count, 0,                                                \
1379             sm->page_count * PAGE_SIZE, NV_GFP_KERNEL) == 0) ? NV_OK :        \
1380                 NV_ERR_OPERATING_SYSTEM)
1381 #else
1382     #define NV_ALLOC_DMA_SUBMAP_SCATTERLIST(dm, sm, i)                \
1383         ((sg_alloc_table(&sm->sgt, sm->page_count, NV_GFP_KERNEL)) == \
1384             0 ? NV_OK : NV_ERR_OPERATING_SYSTEM)
1385 #endif
1386 
1387 typedef struct nv_ibmnpu_info nv_ibmnpu_info_t;
1388 
1389 typedef struct nv_work_s {
1390     struct work_struct task;
1391     void *data;
1392 } nv_work_t;
1393 
1394 #define NV_MAX_REGISTRY_KEYS_LENGTH   512
1395 
1396 typedef enum
1397 {
1398     NV_DEV_STACK_TIMER,
1399     NV_DEV_STACK_ISR,
1400     NV_DEV_STACK_ISR_BH,
1401     NV_DEV_STACK_ISR_BH_UNLOCKED,
1402     NV_DEV_STACK_GPU_WAKEUP,
1403     NV_DEV_STACK_COUNT
1404 } nvidia_linux_dev_stack_t;
1405 
1406 /* Linux version of the opaque type used for os_queue_work_item() */
1407 struct os_work_queue {
1408     nv_kthread_q_t nvk;
1409 };
1410 
1411 /* Linux version of the opaque type used for os_wait_*() */
1412 struct os_wait_queue {
1413     struct completion q;
1414 };
1415 
1416 /*
1417  * To report error in msi/msix when unhandled count reaches a threshold
1418  */
1419 
1420 typedef struct nv_irq_count_info_s
1421 {
1422     int    irq;
1423     NvU64  unhandled;
1424     NvU64  total;
1425     NvU64  last_unhandled;
1426 } nv_irq_count_info_t;
1427 
1428 /* Linux-specific version of nv_dma_device_t */
1429 struct nv_dma_device {
1430     struct {
1431         NvU64 start;
1432         NvU64 limit;
1433     } addressable_range;
1434 
1435     struct device *dev;
1436     NvBool nvlink;
1437 };
1438 
1439 /* Properties of the coherent link */
1440 typedef struct coherent_link_info_s {
    /* Physical address of the GPU memory in the SOC AMAP. In a bare-metal OS
     * environment this is the System Physical Address (SPA); in a virtualized
     * OS environment it is the Intermediate Physical Address (IPA). */
1444     NvU64 gpu_mem_pa;
1445 
    /* Physical address of the reserved portion of the GPU memory, applicable
     * only on the Grace Hopper self-hosted passthrough virtualization platform. */
1448     NvU64 rsvd_mem_pa;
1449 
1450     /* Bitmap of NUMA node ids, corresponding to the reserved PXMs,
1451      * available for adding GPU memory to the kernel as system RAM */
1452     DECLARE_BITMAP(free_node_bitmap, MAX_NUMNODES);
1453 } coherent_link_info_t;
1454 
1455 #if defined(NV_LINUX_ACPI_EVENTS_SUPPORTED)
1456 /*
1457  * acpi data storage structure
1458  *
1459  * This structure retains the pointer to the device,
1460  * and any other baggage we want to carry along
1461  *
1462  */
1463 typedef struct
1464 {
1465     nvidia_stack_t *sp;
1466     struct acpi_device *device;
1467     struct acpi_handle *handle;
1468     void *notifier_data;
1469     int notify_handler_installed;
1470 } nv_acpi_t;
1471 #endif
1472 
/* Linux-specific version of the old nv_state_t */
/* This is a general OS-specific state structure. The first element *must* be
   the general state structure, for the generic UNIX-based code. */
1476 typedef struct nv_linux_state_s {
1477     nv_state_t nv_state;
1478 
1479     atomic_t usage_count;
1480     NvU32    suspend_count;
1481 
1482     struct device  *dev;
1483     struct pci_dev *pci_dev;
1484 
1485     /* IBM-NPU info associated with this GPU */
1486     nv_ibmnpu_info_t *npu;
1487 
1488     /* coherent link information */
    coherent_link_info_t coherent_link_info;
1490 
    /* Dedicated queue used for removing FB memory that has been onlined
     * to the kernel as a NUMA node. Refer to Bug 3879845. */
1493     nv_kthread_q_t remove_numa_memory_q;
1494 
1495     /* NUMA node information for the platforms where GPU memory is presented
1496      * as a NUMA node to the kernel */
1497     struct {
        /* NUMA node id (>= 0) when the platform supports GPU memory as a NUMA
         * node; otherwise it holds NUMA_NO_NODE */
1500         NvS32 node_id;
1501 
1502         /* NUMA online/offline status for platforms that support GPU memory as
1503          * NUMA node */
1504         atomic_t status;
1505         NvBool use_auto_online;
1506     } numa_info;
1507 
1508     nvidia_stack_t *sp[NV_DEV_STACK_COUNT];
1509 
1510     char registry_keys[NV_MAX_REGISTRY_KEYS_LENGTH];
1511 
1512     nv_work_t work;
1513 
1514     /* get a timer callback every second */
1515     struct nv_timer rc_timer;
1516 
1517     /* lock for linux-specific data, not used by core rm */
1518     struct semaphore ldata_lock;
1519 
1520     /* proc directory information */
1521     struct proc_dir_entry *proc_dir;
1522 
1523     NvU32 minor_num;
1524     struct nv_linux_state_s *next;
1525 
1526     /* DRM private information */
1527     struct drm_device *drm;
1528 
1529     /* kthread based bottom half servicing queue and elements */
1530     nv_kthread_q_t bottom_half_q;
1531     nv_kthread_q_item_t bottom_half_q_item;
1532 
    /* Lock for the unlocked bottom half, protecting its commonly allocated stack */
1534     void *isr_bh_unlocked_mutex;
1535 
1536     NvBool tce_bypass_enabled;
1537 
1538     NvU32 num_intr;
1539 
1540     /* Lock serializing ISRs for different MSI-X vectors */
1541     nv_spinlock_t msix_isr_lock;
1542 
1543     /* Lock serializing bottom halves for different MSI-X vectors */
1544     void *msix_bh_mutex;
1545 
1546     struct msix_entry *msix_entries;
1547 
1548     NvU64 numa_memblock_size;
1549 
1550     struct {
1551         struct backlight_device *dev;
1552         NvU32 displayId;
1553         const char *device_name;
1554     } backlight;
1555 
1556     /*
1557      * file handle for pci sysfs config file (/sys/bus/pci/devices/.../config)
1558      * which will be opened during device probe
1559      */
1560     struct file *sysfs_config_file;
1561 
1562     /* Per-GPU queue */
1563     struct os_work_queue queue;
1564 
1565     /* GPU user mapping revocation/remapping (only for non-CTL device) */
1566     struct semaphore mmap_lock; /* Protects all fields in this category */
1567     struct list_head open_files;
1568     NvBool all_mappings_revoked;
1569     NvBool safe_to_mmap;
1570     NvBool gpu_wakeup_callback_needed;
1571 
1572     /* Per-device notifier block for ACPI events */
1573     struct notifier_block acpi_nb;
1574 
1575 #if defined(NV_LINUX_ACPI_EVENTS_SUPPORTED)
1576     nv_acpi_t* nv_acpi_object;
1577 #endif
1578 
1579     /* Lock serializing ISRs for different SOC vectors */
1580     nv_spinlock_t soc_isr_lock;
1581     void *soc_bh_mutex;
1582 
1583     struct nv_timer snapshot_timer;
1584     nv_spinlock_t snapshot_timer_lock;
1585     void (*snapshot_callback)(void *context);
1586 
    /* Per-IRQ counts of unhandled and total interrupts, plus the timestamp of
     * the last unhandled interrupt */
1588     nv_irq_count_info_t *irq_count;
1589 
    /* Number of IRQs that have triggered and are currently being tracked */
    NvU16 current_num_irq_tracked;
1592 
1593     NvBool is_forced_shutdown;
1594 
1595     struct nv_dma_device dma_dev;
1596     struct nv_dma_device niso_dma_dev;
1597 
1598     /*
1599      * Background kthread for handling deferred open operations
1600      * (e.g. from O_NONBLOCK).
1601      *
1602      * Adding to open_q and reading/writing is_accepting_opens
1603      * are protected by nvl->open_q_lock (not nvl->ldata_lock).
1604      * This allows new deferred open operations to be enqueued without
1605      * blocking behind previous ones (which hold nvl->ldata_lock).
1606      *
1607      * Adding to open_q is only safe if is_accepting_opens is true.
1608      * This prevents open operations from racing with device removal.
1609      *
1610      * Stopping open_q is only safe after setting is_accepting_opens to false.
1611      * This ensures that the open_q (and the larger nvl structure) will
1612      * outlive any of the open operations enqueued.
1613      */
1614     nv_kthread_q_t open_q;
1615     NvBool is_accepting_opens;
1616     struct semaphore open_q_lock;
1617 } nv_linux_state_t;
1618 
1619 extern nv_linux_state_t *nv_linux_devices;
1620 
1621 /*
1622  * Macros to protect operations on nv_linux_devices list
1623  * Lock acquisition order while using the nv_linux_devices list
1624  * 1. LOCK_NV_LINUX_DEVICES()
1625  * 2. Traverse the list
1626  *    If the list is traversed to search for an element say nvl,
1627  *    acquire the nvl->ldata_lock before step 3
1628  * 3. UNLOCK_NV_LINUX_DEVICES()
1629  * 4. Release nvl->ldata_lock after any read/write access to the
1630  *    nvl element is complete
1631  */
1632 extern struct semaphore nv_linux_devices_lock;
1633 #define LOCK_NV_LINUX_DEVICES()     down(&nv_linux_devices_lock)
1634 #define UNLOCK_NV_LINUX_DEVICES()   up(&nv_linux_devices_lock)
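
/*
 * Illustrative sketch (not part of the driver) of the acquisition order
 * documented above, for a caller searching the list for a given minor
 * number ("minor" is a placeholder used only for this example):
 *
 *     nv_linux_state_t *nvl;
 *
 *     LOCK_NV_LINUX_DEVICES();                      // step 1
 *     for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
 *     {
 *         if (nvl->minor_num == minor)
 *         {
 *             down(&nvl->ldata_lock);               // step 2: per-device lock
 *             break;
 *         }
 *     }
 *     UNLOCK_NV_LINUX_DEVICES();                    // step 3
 *
 *     if (nvl != NULL)
 *     {
 *         // ... read/write nvl state ...
 *         up(&nvl->ldata_lock);                     // step 4
 *     }
 */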
1635 
1636 /*
1637  * Lock to synchronize system power management transitions,
1638  * and to protect the global system PM state.  The procfs power
1639  * management interface acquires this lock in write mode for
 * the duration of the sleep operation; any other path accessing
 * device state must acquire the lock in read mode.
1642  */
1643 extern struct rw_semaphore nv_system_pm_lock;
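
/*
 * A minimal sketch (not part of the driver) of the read-mode usage described
 * above, for any path that touches device state outside of a PM transition:
 *
 *     down_read(&nv_system_pm_lock);
 *     // ... access device state ...
 *     up_read(&nv_system_pm_lock);
 *
 * The procfs PM path instead brackets the sleep operation with
 * down_write()/up_write().
 */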
1644 
1645 extern NvBool nv_ats_supported;
1646 
1647 /*
 * File-private data
 * We hide a pointer to our data structures in the file's private pointer;
 * there are times we need to grab this data back from the file
 * data structure.
1652  */
1653 
1654 typedef struct nvidia_event
1655 {
1656     struct nvidia_event *next;
1657     nv_event_t event;
1658 } nvidia_event_t;
1659 
1660 typedef struct
1661 {
1662     nv_file_private_t nvfp;
1663 
1664     nvidia_stack_t *sp;
1665     nv_alloc_t *free_list;
1666     nv_linux_state_t *nvptr;
1667     nvidia_event_t *event_data_head, *event_data_tail;
1668     NvBool dataless_event_pending;
1669     nv_spinlock_t fp_lock;
1670     wait_queue_head_t waitqueue;
1671     nv_kthread_q_item_t deferred_close_q_item;
1672     NvU32 *attached_gpus;
1673     size_t num_attached_gpus;
1674     nv_alloc_mapping_context_t mmap_context;
1675     struct address_space mapping;
1676 
1677     nv_kthread_q_item_t open_q_item;
1678     struct completion open_complete;
1679     nv_linux_state_t *deferred_open_nvl;
1680     int open_rc;
1681     NV_STATUS adapter_status;
1682 
1683     struct list_head entry;
1684 } nv_linux_file_private_t;
1685 
1686 static inline nv_linux_file_private_t *nv_get_nvlfp_from_nvfp(nv_file_private_t *nvfp)
1687 {
1688     return container_of(nvfp, nv_linux_file_private_t, nvfp);
1689 }
1690 
1691 static inline int nv_wait_open_complete_interruptible(nv_linux_file_private_t *nvlfp)
1692 {
1693     return wait_for_completion_interruptible(&nvlfp->open_complete);
1694 }
1695 
1696 static inline void nv_wait_open_complete(nv_linux_file_private_t *nvlfp)
1697 {
1698     wait_for_completion(&nvlfp->open_complete);
1699 }
1700 
1701 static inline NvBool nv_is_open_complete(nv_linux_file_private_t *nvlfp)
1702 {
1703     return completion_done(&nvlfp->open_complete);
1704 }
1705 
1706 #define NV_SET_FILE_PRIVATE(filep,data) ((filep)->private_data = (data))
1707 #define NV_GET_LINUX_FILE_PRIVATE(filep) ((nv_linux_file_private_t *)(filep)->private_data)
1708 
1709 /* for the card devices */
1710 #define NV_GET_NVL_FROM_FILEP(filep)    (NV_GET_LINUX_FILE_PRIVATE(filep)->nvptr)
1711 #define NV_GET_NVL_FROM_NV_STATE(nv)    ((nv_linux_state_t *)nv->os_state)
1712 
#define NV_STATE_PTR(nvl)   (&(((nv_linux_state_t *)(nvl))->nv_state))
1714 
1715 #define NV_ATOMIC_READ(data)            atomic_read(&(data))
1716 #define NV_ATOMIC_SET(data,val)         atomic_set(&(data), (val))
1717 #define NV_ATOMIC_INC(data)             atomic_inc(&(data))
1718 #define NV_ATOMIC_DEC(data)             atomic_dec(&(data))
1719 #define NV_ATOMIC_DEC_AND_TEST(data)    atomic_dec_and_test(&(data))
1720 
1721 static inline struct kmem_cache *nv_kmem_cache_create(const char *name, unsigned int size,
1722                                                       unsigned int align)
1723 {
1724     char *name_unique;
1725     struct kmem_cache *cache;
1726 
1727 #if defined(NV_KMEM_CACHE_HAS_KOBJ_REMOVE_WORK) && !defined(NV_SYSFS_SLAB_UNLINK_PRESENT)
1728     size_t len;
1729     NvU64 tm_ns = nv_ktime_get_raw_ns();
1730 
1731     /*
1732      * Wait for timer to change at least once. This ensures
1733      * that the name generated below is always unique.
1734      */
1735     while (tm_ns == nv_ktime_get_raw_ns());
1736     tm_ns = nv_ktime_get_raw_ns();
1737 
1738     /* 20 is the max length of a 64-bit integer printed in decimal */
1739     len = strlen(name) + 20 + 1;
1740     name_unique = kzalloc(len, GFP_KERNEL);
1741     if (!name_unique)
1742         return NULL;
1743 
1744     if (snprintf(name_unique, len, "%s-%llu", name, tm_ns) >= len)
1745     {
1746         WARN(1, "kmem cache name too long: %s\n", name);
1747         kfree(name_unique);
1748         return NULL;
1749     }
1750 #else
1751     name_unique = (char *)name;
1752 #endif
1753     cache = kmem_cache_create(name_unique, size, align, 0, nv_kmem_ctor_dummy);
1754     if (name_unique != name)
1755         kfree(name_unique);
1756 
1757     return cache;
1758 }
1759 
1760 #if defined(CONFIG_PCI_IOV)
1761 #define NV_PCI_SRIOV_SUPPORT
1762 #endif /* CONFIG_PCI_IOV */
1763 
1764 #define NV_PCIE_CFG_MAX_OFFSET 0x1000
1765 
1766 #include "nv-proto.h"
1767 
1768 /*
 * Check whether the GPU is still present on the bus by checking the
 * NV_FLAG_IN_SURPRISE_REMOVAL flag (set when an eGPU is removed from TB3).
1771  */
1772 static inline NV_STATUS nv_check_gpu_state(nv_state_t *nv)
1773 {
1774 #if !defined(NVCPU_PPC64LE)
1775     if (NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv))
1776     {
1777         return NV_ERR_GPU_IS_LOST;
1778     }
1779 #endif
1780 
1781     return NV_OK;
1782 }
1783 
1784 extern NvU32 NVreg_EnableUserNUMAManagement;
1785 extern NvU32 NVreg_RegisterPCIDriver;
1786 extern NvU32 NVreg_EnableResizableBar;
1787 extern NvU32 NVreg_EnableNonblockingOpen;
1788 
1789 extern NvU32 num_probed_nv_devices;
1790 extern NvU32 num_nv_devices;
1791 
1792 #define NV_FILE_INODE(file) (file)->f_inode
1793 
1794 static inline int nv_is_control_device(struct inode *inode)
1795 {
1796     return (minor((inode)->i_rdev) == NV_MINOR_DEVICE_NUMBER_CONTROL_DEVICE);
1797 }
1798 
1799 #if defined(NV_DOM0_KERNEL_PRESENT) || defined(NV_VGPU_KVM_BUILD)
1800 #define NV_VGX_HYPER
1801 #if defined(NV_XEN_IOEMU_INJECT_MSI)
1802 #include <xen/ioemu.h>
1803 #endif
1804 #endif
1805 
1806 static inline NvU64 nv_pci_bus_address(struct pci_dev *dev, NvU8 bar_index)
1807 {
1808     NvU64 bus_addr = 0;
1809 #if defined(NV_PCI_BUS_ADDRESS_PRESENT)
1810     bus_addr = pci_bus_address(dev, bar_index);
1811 #elif defined(CONFIG_PCI)
1812     struct pci_bus_region region;
1813 
1814     pcibios_resource_to_bus(dev, &region, &dev->resource[bar_index]);
1815     bus_addr = region.start;
1816 #endif
1817     return bus_addr;
1818 }
1819 
1820 /*
1821  * Decrements the usage count of the allocation, and moves the allocation to
1822  * the given nvlfp's free list if the usage count drops to zero.
1823  *
1824  * Returns NV_TRUE if the allocation is moved to the nvlfp's free list.
1825  */
1826 static inline NvBool nv_alloc_release(nv_linux_file_private_t *nvlfp, nv_alloc_t *at)
1827 {
1828     NV_PRINT_AT(NV_DBG_MEMINFO, at);
1829 
1830     if (NV_ATOMIC_DEC_AND_TEST(at->usage_count))
1831     {
1832         NV_ATOMIC_INC(at->usage_count);
1833 
1834         at->next = nvlfp->free_list;
1835         nvlfp->free_list = at;
1836         return NV_TRUE;
1837     }
1838 
1839     return NV_FALSE;
1840 }
1841 
1842 /*
1843  * RB_EMPTY_ROOT was added in 2.6.18 by this commit:
1844  *   2006-06-21  dd67d051529387f6e44d22d1d5540ef281965fdd
1845  */
1846 #if !defined(RB_EMPTY_ROOT)
1847 #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
1848 #endif
1849 
1850 /*
1851  * Starting on Power9 systems, DMA addresses for NVLink are no longer
1852  * the same as used over PCIe.
1853  *
1854  * Power9 supports a 56-bit Real Address. This address range is compressed
1855  * when accessed over NVLink to allow the GPU to access all of memory using
1856  * its 47-bit Physical address.
1857  *
1858  * If there is an NPU device present on the system, it implies that NVLink
1859  * sysmem links are present and we need to apply the required address
1860  * conversion for NVLink within the driver.
1861  *
1862  * See Bug 1920398 for further background and details.
1863  *
 * Note that, as a deviation from the documented compression scheme, the
 * upper address bits (i.e. bits 56-63), instead of being set to zero, are
 * preserved during NVLink address compression so the original PCIe DMA
 * address can be reconstructed on expansion. These bits can be safely
 * ignored on NVLink since they are truncated by the GPU.
1869  *
1870  * Bug 1968345: As a performance enhancement it is the responsibility of
1871  * the caller on PowerPC platforms to check for presence of an NPU device
1872  * before the address transformation is applied.
1873  */
1874 static inline NvU64 nv_compress_nvlink_addr(NvU64 addr)
1875 {
1876     NvU64 addr47 = addr;
1877 
1878 #if defined(NVCPU_PPC64LE)
1879     addr47 = addr & ((1ULL << 43) - 1);
1880     addr47 |= (addr & (0x3ULL << 45)) >> 2;
1881     WARN_ON(addr47 & (1ULL << 44));
1882     addr47 |= (addr & (0x3ULL << 49)) >> 4;
1883     addr47 |= addr & ~((1ULL << 56) - 1);
1884 #endif
1885 
1886     return addr47;
1887 }
1888 
1889 static inline NvU64 nv_expand_nvlink_addr(NvU64 addr47)
1890 {
1891     NvU64 addr = addr47;
1892 
1893 #if defined(NVCPU_PPC64LE)
1894     addr = addr47 & ((1ULL << 43) - 1);
1895     addr |= (addr47 & (3ULL << 43)) << 2;
1896     addr |= (addr47 & (3ULL << 45)) << 4;
1897     addr |= addr47 & ~((1ULL << 56) - 1);
1898 #endif
1899 
1900     return addr;
1901 }
1902 
1903 // Default flags for ISRs
1904 static inline NvU32 nv_default_irq_flags(nv_state_t *nv)
1905 {
1906     NvU32 flags = 0;
1907 
1908     /*
1909      * Request IRQs to be disabled in our ISRs to keep consistency across the
1910      * supported kernel versions.
1911      *
     * IRQF_DISABLED was made the default in 2.6.35 by commit e58aa3d2d0cc
     * from March 2010, and was later removed entirely in 4.1 by commit
     * d8bf368d0631 from March 2015. Add it to our flags if it's defined to get
     * the same behaviour on pre-2.6.35 kernels as on recent ones.
1916      */
1917 #if defined(IRQF_DISABLED)
1918     flags |= IRQF_DISABLED;
1919 #endif
1920 
1921     /*
1922      * For legacy interrupts, also allow sharing. Sharing doesn't make sense
1923      * for MSI(-X) as on Linux they are never shared across different devices
1924      * and we only register one ISR today.
1925      */
1926     if ((nv->flags & (NV_FLAG_USES_MSI | NV_FLAG_USES_MSIX)) == 0)
1927         flags |= IRQF_SHARED;
1928 
1929     return flags;
1930 }
1931 
1932 /*
 * As of v3.7-rc1, the kernel stopped exporting get_unused_fd() and started
 * exporting get_unused_fd_flags(), with this commit:
1935  * 2012-09-26 1a7bd2265fc ("make get_unused_fd_flags() a function")
1936  */
1937 #if NV_IS_EXPORT_SYMBOL_PRESENT_get_unused_fd
1938     #define NV_GET_UNUSED_FD()  get_unused_fd()
1939 #else
1940     #define NV_GET_UNUSED_FD()  get_unused_fd_flags(0)
1941 #endif
1942 
1943 #if NV_IS_EXPORT_SYMBOL_PRESENT_get_unused_fd_flags
1944     #define NV_GET_UNUSED_FD_FLAGS(flags)  get_unused_fd_flags(flags)
1945 #else
1946     #define NV_GET_UNUSED_FD_FLAGS(flags)  (-1)
1947 #endif
1948 
1949 #define MODULE_BASE_NAME "nvidia"
1950 #define MODULE_INSTANCE_NUMBER 0
1951 #define MODULE_INSTANCE_STRING ""
1952 #define MODULE_NAME MODULE_BASE_NAME MODULE_INSTANCE_STRING
1953 
1954 NvS32 nv_request_soc_irq(nv_linux_state_t *, NvU32, nv_soc_irq_type_t, NvU32, NvU32, const char*);
1955 
1956 static inline void nv_mutex_destroy(struct mutex *lock)
1957 {
1958     mutex_destroy(lock);
1959 }
1960 
1961 static inline NvBool nv_platform_supports_numa(nv_linux_state_t *nvl)
1962 {
1963     return nvl->numa_info.node_id != NUMA_NO_NODE;
1964 }
1965 
1966 static inline int nv_get_numa_status(nv_linux_state_t *nvl)
1967 {
1968     if (!nv_platform_supports_numa(nvl))
1969     {
1970         return NV_IOCTL_NUMA_STATUS_DISABLED;
1971     }
1972 
1973     return NV_ATOMIC_READ(nvl->numa_info.status);
1974 }
1975 
1976 static inline int nv_set_numa_status(nv_linux_state_t *nvl, int status)
1977 {
1978     if (!nv_platform_supports_numa(nvl))
1979     {
1980         return -EINVAL;
1981     }
1982 
1983     NV_ATOMIC_SET(nvl->numa_info.status, status);
1984     return 0;
1985 }
1986 
1987 static inline NvBool nv_platform_use_auto_online(nv_linux_state_t *nvl)
1988 {
1989     return nvl->numa_info.use_auto_online;
1990 }
1991 
1992 typedef struct {
1993     NvU64 base;
1994     NvU64 size;
1995     NvU32 nodeId;
1996     int ret;
1997 } remove_numa_memory_info_t;
1998 
1999 static void offline_numa_memory_callback
2000 (
2001     void *args
2002 )
2003 {
2004 #ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT
2005     remove_numa_memory_info_t *pNumaInfo = (remove_numa_memory_info_t *)args;
2006 #ifdef NV_REMOVE_MEMORY_HAS_NID_ARG
2007     pNumaInfo->ret = offline_and_remove_memory(pNumaInfo->nodeId,
2008                                                pNumaInfo->base,
2009                                                pNumaInfo->size);
2010 #else
2011     pNumaInfo->ret = offline_and_remove_memory(pNumaInfo->base,
2012                                                pNumaInfo->size);
2013 #endif
2014 #endif
2015 }
2016 
2017 typedef enum
2018 {
2019     NV_NUMA_STATUS_DISABLED             = 0,
2020     NV_NUMA_STATUS_OFFLINE              = 1,
2021     NV_NUMA_STATUS_ONLINE_IN_PROGRESS   = 2,
2022     NV_NUMA_STATUS_ONLINE               = 3,
2023     NV_NUMA_STATUS_ONLINE_FAILED        = 4,
2024     NV_NUMA_STATUS_OFFLINE_IN_PROGRESS  = 5,
2025     NV_NUMA_STATUS_OFFLINE_FAILED       = 6,
2026     NV_NUMA_STATUS_COUNT
2027 } nv_numa_status_t;
2028 
2029 #if defined(NV_LINUX_PLATFORM_DEVICE_H_PRESENT)
2030 #include <linux/platform_device.h>
2031 #endif
2032 
2033 #if defined(NV_LINUX_MUTEX_H_PRESENT)
2034 #include <linux/mutex.h>
2035 #endif
2036 
2037 #if defined(NV_LINUX_RESET_H_PRESENT)
2038 #include <linux/reset.h>
2039 #endif
2040 
2041 #if defined(NV_LINUX_DMA_BUF_H_PRESENT)
2042 #include <linux/dma-buf.h>
2043 #endif
2044 
2045 #if defined(NV_LINUX_GPIO_H_PRESENT)
2046 #include <linux/gpio.h>
2047 #endif
2048 
2049 #if defined(NV_LINUX_OF_GPIO_H_PRESENT)
2050 #include <linux/of_gpio.h>
2051 #endif
2052 
2053 #if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
2054 #include <linux/of_device.h>
2055 #endif
2056 
2057 #if defined(NV_LINUX_OF_PLATFORM_H_PRESENT)
2058 #include <linux/of_platform.h>
2059 #endif
2060 
2061 #if defined(NV_LINUX_INTERCONNECT_H_PRESENT)
2062 #include <linux/interconnect.h>
2063 #endif
2064 
2065 #if defined(NV_LINUX_PM_RUNTIME_H_PRESENT)
2066 #include <linux/pm_runtime.h>
2067 #endif
2068 
2069 #if defined(NV_LINUX_CLK_H_PRESENT)
2070 #include <linux/clk.h>
2071 #endif
2072 
2073 #if defined(NV_LINUX_CLK_PROVIDER_H_PRESENT)
2074 #include <linux/clk-provider.h>
2075 #endif
2076 
2077 #define NV_EXPORT_SYMBOL(symbol)        EXPORT_SYMBOL_GPL(symbol)
2078 #define NV_CHECK_EXPORT_SYMBOL(symbol)  NV_IS_EXPORT_SYMBOL_PRESENT_##symbol
2079 
2080 #endif  /* _NV_LINUX_H_ */
2081