xref: /open-nvidia-gpu/kernel-open/nvidia/nv.c (revision 4397463e)
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 1999-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include <linux/module.h>  // for MODULE_FIRMWARE
25 
26 // must precede "nv.h" and "nv-firmware.h" includes
27 #define NV_FIRMWARE_PATH_FOR_FILENAME(filename)  "nvidia/" NV_VERSION_STRING "/" filename
28 #define NV_FIRMWARE_DECLARE_GSP_FILENAME(filename) \
29     MODULE_FIRMWARE(NV_FIRMWARE_PATH_FOR_FILENAME(filename));
30 #include "nv-firmware.h"
31 
32 #include "nvmisc.h"
33 #include "os-interface.h"
34 #include "nv-linux.h"
35 #include "nv-p2p.h"
36 #include "nv-reg.h"
37 #include "nv-msi.h"
38 #include "nv-pci-table.h"
39 
40 #if defined(NV_UVM_ENABLE)
41 #include "nv_uvm_interface.h"
42 #endif
43 
44 #if defined(NV_VGPU_KVM_BUILD)
45 #include "nv-vgpu-vfio-interface.h"
46 #endif
47 
48 #include "nvlink_proto.h"
49 #include "nvlink_caps.h"
50 
51 #include "nv-frontend.h"
52 #include "nv-hypervisor.h"
53 #include "nv-ibmnpu.h"
54 #include "nv-rsync.h"
55 #include "nv-kthread-q.h"
56 #include "nv-pat.h"
57 #include "nv-dmabuf.h"
58 
59 #if !defined(CONFIG_RETPOLINE)
60 #include "nv-retpoline.h"
61 #endif
62 
63 #include <linux/firmware.h>
64 
65 #include <sound/core.h>             /* HDA struct snd_card */
66 
67 #include <asm/cache.h>
68 
69 #if defined(NV_SOUND_HDAUDIO_H_PRESENT)
70 #include <sound/hdaudio.h>
71 #endif
72 
73 #if defined(NV_SOUND_HDA_CODEC_H_PRESENT)
74 #include <sound/core.h>
75 #include <sound/hda_codec.h>
76 #include <sound/hda_verbs.h>
77 #endif
78 
79 #if defined(NV_SEQ_READ_ITER_PRESENT)
80 #include <linux/uio.h>
81 #include <linux/seq_file.h>
82 #include <linux/kernfs.h>
83 #endif
84 
85 #include <linux/dmi.h>              /* System DMI info */
86 
87 #include <linux/ioport.h>
88 
89 #include "conftest/patches.h"
90 
91 #define RM_THRESHOLD_TOTAL_IRQ_COUNT     100000
92 #define RM_THRESHOLD_UNAHNDLED_IRQ_COUNT 99900
93 #define RM_UNHANDLED_TIMEOUT_US          100000
94 
95 const NvBool nv_is_rm_firmware_supported_os = NV_TRUE;
96 
97 // Deprecated, use NV_REG_ENABLE_GPU_FIRMWARE instead
98 char *rm_firmware_active = NULL;
99 NV_MODULE_STRING_PARAMETER(rm_firmware_active);
100 
101 /*
102  * Global NVIDIA capability state, for GPU driver
103  */
104 nv_cap_t *nvidia_caps_root = NULL;
105 
106 /*
107  * our global state; one per device
108  */
109 NvU32 num_nv_devices = 0;
110 NvU32 num_probed_nv_devices = 0;
111 
112 nv_linux_state_t *nv_linux_devices;
113 
114 /*
115  * And one for the control device
116  */
117 nv_linux_state_t nv_ctl_device = { { 0 } };
118 extern NvU32 nv_dma_remap_peer_mmio;
119 
120 nv_kthread_q_t nv_kthread_q;
121 nv_kthread_q_t nv_deferred_close_kthread_q;
122 
123 struct rw_semaphore nv_system_pm_lock;
124 
125 #if defined(CONFIG_PM)
126 static nv_power_state_t nv_system_power_state;
127 static nv_pm_action_depth_t nv_system_pm_action_depth;
128 struct semaphore nv_system_power_state_lock;
129 #endif
130 
131 void *nvidia_p2p_page_t_cache;
132 static void *nvidia_pte_t_cache;
133 void *nvidia_stack_t_cache;
134 static nvidia_stack_t *__nv_init_sp;
135 
136 static int nv_tce_bypass_mode = NV_TCE_BYPASS_MODE_DEFAULT;
137 
138 struct semaphore nv_linux_devices_lock;
139 
140 static NvTristate nv_chipset_is_io_coherent = NV_TRISTATE_INDETERMINATE;
141 
142 // True if all the successfully probed devices support ATS
143 // Assigned at device probe (module init) time
144 NvBool nv_ats_supported = NVCPU_IS_PPC64LE;
145 
146 
147 // Allow an easy way to switch all event-related debug printfs
148 // between 'info' and 'errors'.
149 #if defined(NV_DBG_EVENTS)
150 #define NV_DBG_EVENTINFO NV_DBG_ERRORS
151 #else
152 #define NV_DBG_EVENTINFO NV_DBG_INFO
153 #endif
154 
155 #if defined(HDA_MAX_CODECS)
156 #define NV_HDA_MAX_CODECS HDA_MAX_CODECS
157 #else
158 #define NV_HDA_MAX_CODECS 8
159 #endif
160 
161 /***
162  *** STATIC functions, only in this file
163  ***/
164 
165 /* nvos_ functions: these do not take a per-device state parameter */
166 static int      nvos_count_devices(void);
167 
168 static nv_alloc_t  *nvos_create_alloc(struct device *, int);
169 static int          nvos_free_alloc(nv_alloc_t *);
170 
171 /***
172  *** EXPORTS to Linux Kernel
173  ***/
174 
175 static irqreturn_t   nvidia_isr_common_bh   (void *);
176 static void          nvidia_isr_bh_unlocked (void *);
177 static int           nvidia_ctl_open        (struct inode *, struct file *);
178 static int           nvidia_ctl_close       (struct inode *, struct file *);
179 
180 const char *nv_device_name = MODULE_NAME;
181 static const char *nvidia_stack_cache_name = MODULE_NAME "_stack_cache";
182 static const char *nvidia_pte_cache_name = MODULE_NAME "_pte_cache";
183 static const char *nvidia_p2p_page_cache_name = MODULE_NAME "_p2p_page_cache";
184 
185 static int           nvidia_open           (struct inode *, struct file *);
186 static int           nvidia_close          (struct inode *, struct file *);
187 static unsigned int  nvidia_poll           (struct file *, poll_table *);
188 static int           nvidia_ioctl          (struct inode *, struct file *, unsigned int, unsigned long);
189 
190 /* character device entry points */
191 nvidia_module_t nv_fops = {
192     .owner       = THIS_MODULE,
193     .module_name = MODULE_NAME,
194     .instance    = MODULE_INSTANCE_NUMBER,
195     .open        = nvidia_open,
196     .close       = nvidia_close,
197     .ioctl       = nvidia_ioctl,
198     .mmap        = nvidia_mmap,
199     .poll        = nvidia_poll,
200 };
201 
202 #if defined(CONFIG_PM)
203 static int           nv_pmops_suspend          (struct device *dev);
204 static int           nv_pmops_resume           (struct device *dev);
205 static int           nv_pmops_freeze           (struct device *dev);
206 static int           nv_pmops_thaw             (struct device *dev);
207 static int           nv_pmops_restore          (struct device *dev);
208 static int           nv_pmops_poweroff         (struct device *dev);
209 static int           nv_pmops_runtime_suspend  (struct device *dev);
210 static int           nv_pmops_runtime_resume   (struct device *dev);
211 
212 struct dev_pm_ops nv_pm_ops = {
213     .suspend         = nv_pmops_suspend,
214     .resume          = nv_pmops_resume,
215     .freeze          = nv_pmops_freeze,
216     .thaw            = nv_pmops_thaw,
217     .poweroff        = nv_pmops_poweroff,
218     .restore         = nv_pmops_restore,
219     .runtime_suspend = nv_pmops_runtime_suspend,
220     .runtime_resume  = nv_pmops_runtime_resume,
221 };
222 #endif
223 
224 /***
225  *** see nv.h for functions exported to other parts of resman
226  ***/
227 
228 /***
229  *** STATIC functions
230  ***/
231 
232 #if defined(NVCPU_X86_64)
233 #define NV_AMD_SEV_BIT BIT(1)
234 
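/* Check via CPUID leaf 0x8000001f whether the CPU supports AMD SEV. */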
235 static
236 NvBool nv_is_sev_supported(
237     void
238 )
239 {
240     unsigned int eax, ebx, ecx, edx;
241 
242     /* Check for the SME/SEV support leaf */
243     eax = 0x80000000;
244     ecx = 0;
245     native_cpuid(&eax, &ebx, &ecx, &edx);
246     if (eax < 0x8000001f)
247         return NV_FALSE;
248 
249     eax = 0x8000001f;
250     ecx = 0;
251     native_cpuid(&eax, &ebx, &ecx, &edx);
252     /* Check whether SEV is supported */
253     if (!(eax & NV_AMD_SEV_BIT))
254         return NV_FALSE;
255 
256     return NV_TRUE;
257 }
258 #endif
259 
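/* Read MSR_AMD64_SEV (when available) and cache the SEV status in os_sev_status / os_sev_enabled. */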
260 static
261 void nv_sev_init(
262     void
263 )
264 {
265 #if defined(MSR_AMD64_SEV) && defined(NVCPU_X86_64)
266     NvU32 lo_val, hi_val;
267 
268     if (!nv_is_sev_supported())
269         return;
270 
271     rdmsr(MSR_AMD64_SEV, lo_val, hi_val);
272 
273     os_sev_status = lo_val;
274 #if defined(MSR_AMD64_SEV_ENABLED)
275     os_sev_enabled = (os_sev_status & MSR_AMD64_SEV_ENABLED);
276 #endif
277 #endif
278 }
279 
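/*
 * Allocate an nv_alloc_t descriptor and a page table with num_pages entries,
 * each drawn from the nvidia_pte_t cache.  Returns NULL on failure.
 */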
280 static
281 nv_alloc_t *nvos_create_alloc(
282     struct device *dev,
283     int            num_pages
284 )
285 {
286     nv_alloc_t  *at;
287     unsigned int pt_size;
288     unsigned int i;
289 
290     NV_KZALLOC(at, sizeof(nv_alloc_t));
291     if (at == NULL)
292     {
293         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate alloc info\n");
294         return NULL;
295     }
296 
297     at->dev = dev;
298     pt_size = num_pages * sizeof(nvidia_pte_t *);
299 
300     if (os_alloc_mem((void **)&at->page_table, pt_size) != NV_OK)
301     {
302         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate page table\n");
303         NV_KFREE(at, sizeof(nv_alloc_t));
304         return NULL;
305     }
306 
307     memset(at->page_table, 0, pt_size);
308     at->num_pages = num_pages;
309     NV_ATOMIC_SET(at->usage_count, 0);
310 
311     for (i = 0; i < at->num_pages; i++)
312     {
313         at->page_table[i] = NV_KMEM_CACHE_ALLOC(nvidia_pte_t_cache);
314         if (at->page_table[i] == NULL)
315         {
316             nv_printf(NV_DBG_ERRORS,
317                       "NVRM: failed to allocate page table entry\n");
318             nvos_free_alloc(at);
319             return NULL;
320         }
321         memset(at->page_table[i], 0, sizeof(nvidia_pte_t));
322     }
323 
324     at->pid = os_get_current_process();
325 
326     return at;
327 }
328 
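/*
 * Free an nv_alloc_t descriptor and its page table entries.  Returns -1 for a
 * NULL descriptor, 1 if it is still referenced, and 0 on success.
 */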
329 static
330 int nvos_free_alloc(
331     nv_alloc_t *at
332 )
333 {
334     unsigned int i;
335 
336     if (at == NULL)
337         return -1;
338 
339     if (NV_ATOMIC_READ(at->usage_count))
340         return 1;
341 
342     for (i = 0; i < at->num_pages; i++)
343     {
344         if (at->page_table[i] != NULL)
345             NV_KMEM_CACHE_FREE(at->page_table[i], nvidia_pte_t_cache);
346     }
347     os_free_mem(at->page_table);
348 
349     NV_KFREE(at, sizeof(nv_alloc_t));
350 
351     return 0;
352 }
353 
354 static void
355 nv_module_resources_exit(nv_stack_t *sp)
356 {
357     nv_kmem_cache_free_stack(sp);
358 
359     NV_KMEM_CACHE_DESTROY(nvidia_p2p_page_t_cache);
360     NV_KMEM_CACHE_DESTROY(nvidia_pte_t_cache);
361     NV_KMEM_CACHE_DESTROY(nvidia_stack_t_cache);
362 }
363 
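/*
 * Create the kmem caches for alt-stacks, PTEs and P2P pages and allocate the
 * initial alt-stack, destroying whatever was created if any step fails.
 */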
364 static int __init
365 nv_module_resources_init(nv_stack_t **sp)
366 {
367     int rc = -ENOMEM;
368 
369     nvidia_stack_t_cache = NV_KMEM_CACHE_CREATE(nvidia_stack_cache_name,
370                                                 nvidia_stack_t);
371     if (nvidia_stack_t_cache == NULL)
372     {
373         nv_printf(NV_DBG_ERRORS,
374                   "NVRM: nvidia_stack_t cache allocation failed.\n");
375         goto exit;
376     }
377 
378     nvidia_pte_t_cache = NV_KMEM_CACHE_CREATE(nvidia_pte_cache_name,
379                                               nvidia_pte_t);
380     if (nvidia_pte_t_cache == NULL)
381     {
382         nv_printf(NV_DBG_ERRORS,
383                   "NVRM: nvidia_pte_t cache allocation failed.\n");
384         goto exit;
385     }
386 
387     nvidia_p2p_page_t_cache = NV_KMEM_CACHE_CREATE(nvidia_p2p_page_cache_name,
388                                                    nvidia_p2p_page_t);
389     if (nvidia_p2p_page_t_cache == NULL)
390     {
391         nv_printf(NV_DBG_ERRORS,
392                   "NVRM: nvidia_p2p_page_t cache allocation failed.\n");
393         goto exit;
394     }
395 
396     rc = nv_kmem_cache_alloc_stack(sp);
397     if (rc < 0)
398     {
399         goto exit;
400     }
401 
402 exit:
403     if (rc < 0)
404     {
405         nv_kmem_cache_free_stack(*sp);
406 
407         NV_KMEM_CACHE_DESTROY(nvidia_p2p_page_t_cache);
408         NV_KMEM_CACHE_DESTROY(nvidia_pte_t_cache);
409         NV_KMEM_CACHE_DESTROY(nvidia_stack_t_cache);
410     }
411 
412     return rc;
413 }
414 
415 static void
416 nvlink_drivers_exit(void)
417 {
418 #if NVCPU_IS_64_BITS
419     nvswitch_exit();
420 #endif
421 
422 #if defined(NVCPU_PPC64LE)
423     ibmnpu_exit();
424 #endif
425 
426     nvlink_core_exit();
427 }
428 
429 static int __init
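/* Initialize the NVLink core, IBM NPU (PPC64LE) and NVSwitch (64-bit) drivers, unwinding on failure. */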
430 nvlink_drivers_init(void)
431 {
432     int rc = 0;
433 
434     rc = nvlink_core_init();
435     if (rc < 0)
436     {
437         nv_printf(NV_DBG_INFO, "NVRM: NVLink core init failed.\n");
438         return rc;
439     }
440 
441 #if defined(NVCPU_PPC64LE)
442     rc = ibmnpu_init();
443     if (rc < 0)
444     {
445         nv_printf(NV_DBG_INFO, "NVRM: IBM NPU init failed.\n");
446         nvlink_core_exit();
447         return rc;
448     }
449 #endif
450 
451 #if NVCPU_IS_64_BITS
452     rc = nvswitch_init();
453     if (rc < 0)
454     {
455         nv_printf(NV_DBG_INFO, "NVRM: NVSwitch init failed.\n");
456 #if defined(NVCPU_PPC64LE)
457         ibmnpu_exit();
458 #endif
459         nvlink_core_exit();
460     }
461 #endif
462 
463     return rc;
464 }
465 
466 static void
467 nv_module_state_exit(nv_stack_t *sp)
468 {
469     nv_state_t *nv = NV_STATE_PTR(&nv_ctl_device);
470 
471     nv_teardown_pat_support();
472 
473     nv_kthread_q_stop(&nv_deferred_close_kthread_q);
474     nv_kthread_q_stop(&nv_kthread_q);
475 
476     nv_lock_destroy_locks(sp, nv);
477 }
478 
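/*
 * Initialize control-device state: locks, the shared kthread queues, PAT
 * support, the device list and the system power management state.
 */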
479 static int
480 nv_module_state_init(nv_stack_t *sp)
481 {
482     int rc;
483     nv_state_t *nv = NV_STATE_PTR(&nv_ctl_device);
484 
485     nv->os_state = (void *)&nv_ctl_device;
486 
487     if (!nv_lock_init_locks(sp, nv))
488     {
489         return -ENOMEM;
490     }
491 
492     rc = nv_kthread_q_init(&nv_kthread_q, "nv_queue");
493     if (rc != 0)
494     {
495         goto exit;
496     }
497 
498     rc = nv_kthread_q_init(&nv_deferred_close_kthread_q, "nv_queue");
499     if (rc != 0)
500     {
501         nv_kthread_q_stop(&nv_kthread_q);
502         goto exit;
503     }
504 
505     rc = nv_init_pat_support(sp);
506     if (rc < 0)
507     {
508         nv_kthread_q_stop(&nv_deferred_close_kthread_q);
509         nv_kthread_q_stop(&nv_kthread_q);
510         goto exit;
511     }
512 
513     nv_linux_devices = NULL;
514     NV_INIT_MUTEX(&nv_linux_devices_lock);
515     init_rwsem(&nv_system_pm_lock);
516 
517 #if defined(CONFIG_PM)
518     NV_INIT_MUTEX(&nv_system_power_state_lock);
519     nv_system_power_state = NV_POWER_STATE_RUNNING;
520     nv_system_pm_action_depth = NV_PM_ACTION_DEPTH_DEFAULT;
521 #endif
522 
523     NV_SPIN_LOCK_INIT(&nv_ctl_device.snapshot_timer_lock);
524 
525 exit:
526     if (rc < 0)
527     {
528         nv_lock_destroy_locks(sp, nv);
529     }
530 
531     return rc;
532 }
533 
534 static void __init
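/* Read module-scope registry keys: TCE bypass mode and NUMA onlining (PPC64LE only), and peer MMIO remapping. */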
535 nv_registry_keys_init(nv_stack_t *sp)
536 {
537     NV_STATUS status;
538     nv_state_t *nv = NV_STATE_PTR(&nv_ctl_device);
539     NvU32 data;
540 
541     /*
542      * Determine the TCE bypass mode here so it can be used during
543      * device probe.  Also determine whether we should allow
544      * user-mode NUMA onlining of device memory.
545      */
546     if (NVCPU_IS_PPC64LE)
547     {
548         status = rm_read_registry_dword(sp, nv,
549                                         NV_REG_TCE_BYPASS_MODE,
550                                         &data);
551         if ((status == NV_OK) && ((int)data != NV_TCE_BYPASS_MODE_DEFAULT))
552         {
553             nv_tce_bypass_mode = data;
554         }
555 
556         if (NVreg_EnableUserNUMAManagement)
557         {
558             /* Force on the core RM registry key to match. */
559             status = rm_write_registry_dword(sp, nv, "RMNumaOnlining", 1);
560             WARN_ON(status != NV_OK);
561         }
562     }
563 
564     status = rm_read_registry_dword(sp, nv, NV_DMA_REMAP_PEER_MMIO, &data);
565     if (status == NV_OK)
566     {
567         nv_dma_remap_peer_mmio = data;
568     }
569 }
570 
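/* Log any conftest patches that were applied to this kernel interface layer. */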
571 static void __init
572 nv_report_applied_patches(void)
573 {
574     unsigned i;
575 
576     for (i = 0; __nv_patches[i].short_description; i++)
577     {
578         if (i == 0)
579         {
580             nv_printf(NV_DBG_ERRORS, "NVRM: Applied patches:\n");
581         }
582 
583         nv_printf(NV_DBG_ERRORS,
584             "NVRM:    Patch #%d: %s\n", i + 1, __nv_patches[i].short_description);
585     }
586 }
587 
588 static void
589 nv_drivers_exit(void)
590 {
591     nv_pci_unregister_driver();
592 
593     nvidia_unregister_module(&nv_fops);
594 }
595 
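/* Register the character device and the PCI driver; unregister the former if the latter fails. */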
596 static int __init
597 nv_drivers_init(void)
598 {
599     int rc;
600 
601     rc = nvidia_register_module(&nv_fops);
602     if (rc < 0)
603     {
604         nv_printf(NV_DBG_ERRORS,
605                   "NVRM: failed to register character device.\n");
606         return rc;
607     }
608 
609     rc = nv_pci_register_driver();
610     if (rc < 0)
611     {
612         nv_printf(NV_DBG_ERRORS, "NVRM: No NVIDIA PCI devices found.\n");
613         rc = -ENODEV;
614         goto exit;
615     }
616 
617 exit:
618     if (rc < 0)
619     {
620         nvidia_unregister_module(&nv_fops);
621     }
622 
623     return rc;
624 }
625 
626 static void
627 nv_module_exit(nv_stack_t *sp)
628 {
629     nv_module_state_exit(sp);
630 
631     rm_shutdown_rm(sp);
632 
633     nv_destroy_rsync_info();
634     nvlink_drivers_exit();
635 
636     nv_cap_drv_exit();
637 
638     nv_module_resources_exit(sp);
639 }
640 
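/*
 * Module-wide initialization: kmem caches, nv-caps, NVLink drivers, rsync and
 * SEV state, the RM core, and finally the control-device state.
 */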
641 static int __init
642 nv_module_init(nv_stack_t **sp)
643 {
644     int rc;
645 
646     rc = nv_module_resources_init(sp);
647     if (rc < 0)
648     {
649         return rc;
650     }
651 
652     rc = nv_cap_drv_init();
653     if (rc < 0)
654     {
655         nv_printf(NV_DBG_ERRORS, "NVRM: nv-cap-drv init failed.\n");
656         goto cap_drv_exit;
657     }
658 
659     rc = nvlink_drivers_init();
660     if (rc < 0)
661     {
662         goto cap_drv_exit;
663     }
664 
665     nv_init_rsync_info();
666     nv_sev_init();
667 
668     if (!rm_init_rm(*sp))
669     {
670         nv_printf(NV_DBG_ERRORS, "NVRM: rm_init_rm() failed!\n");
671         rc = -EIO;
672         goto nvlink_exit;
673     }
674 
675     rc = nv_module_state_init(*sp);
676     if (rc < 0)
677     {
678         goto init_rm_exit;
679     }
680 
681     return rc;
682 
683 init_rm_exit:
684     rm_shutdown_rm(*sp);
685 
686 nvlink_exit:
687     nv_destroy_rsync_info();
688     nvlink_drivers_exit();
689 
690 cap_drv_exit:
691     nv_cap_drv_exit();
692     nv_module_resources_exit(*sp);
693 
694     return rc;
695 }
696 
697 /*
698  * In this function we check for the cases where GPU exclusion is not
699  * honored, and issue a warning.
700  *
701  * Only GPUs that support a mechanism to query UUID prior to
702  * initializing the GPU can be excluded, so that we can detect and
703  * exclude them during device probe.  This function checks that an
704  * initialized GPU was not specified in the exclusion list, and issues a
705  * warning if so.
706  */
707 static void
708 nv_assert_not_in_gpu_exclusion_list(
709     nvidia_stack_t *sp,
710     nv_state_t *nv
711 )
712 {
713     char *uuid = rm_get_gpu_uuid(sp, nv);
714 
715     if (uuid == NULL)
716     {
717         NV_DEV_PRINTF(NV_DBG_INFO, nv, "Unable to read UUID");
718         return;
719     }
720 
721     if (nv_is_uuid_in_gpu_exclusion_list(uuid))
722     {
723         NV_DEV_PRINTF(NV_DBG_WARNINGS, nv,
724                       "Could not exclude GPU %s because PBI is not supported\n",
725                       uuid);
726         WARN_ON(1);
727     }
728 
729     os_free_mem(uuid);
730 
731     return;
732 }
733 
734 static int __init nv_caps_root_init(void)
735 {
736     nvidia_caps_root = os_nv_cap_init("driver/" MODULE_NAME);
737 
738     return (nvidia_caps_root == NULL) ? -ENOENT : 0;
739 }
740 
741 static void nv_caps_root_exit(void)
742 {
743     os_nv_cap_destroy_entry(nvidia_caps_root);
744     nvidia_caps_root = NULL;
745 }
746 
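/*
 * Top-level module initialization: procfs, capabilities, module state, device
 * counting, driver registration, registry keys and (if enabled) UVM.
 */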
747 int __init nvidia_init_module(void)
748 {
749     int rc;
750     NvU32 count;
751     nvidia_stack_t *sp = NULL;
752     const NvBool is_nvswitch_present = os_is_nvswitch_present();
753 
754     nv_memdbg_init();
755 
756     rc = nv_procfs_init();
757     if (rc < 0)
758     {
759         nv_printf(NV_DBG_ERRORS, "NVRM: failed to initialize procfs.\n");
760         return rc;
761     }
762 
763     rc = nv_caps_root_init();
764     if (rc < 0)
765     {
766         nv_printf(NV_DBG_ERRORS, "NVRM: failed to initialize capabilities.\n");
767         goto procfs_exit;
768     }
769 
770     rc = nv_module_init(&sp);
771     if (rc < 0)
772     {
773         nv_printf(NV_DBG_ERRORS, "NVRM: failed to initialize module.\n");
774         goto caps_root_exit;
775     }
776 
777     count = nvos_count_devices();
778     if ((count == 0) && (!is_nvswitch_present))
779     {
780         nv_printf(NV_DBG_ERRORS, "NVRM: No NVIDIA GPU found.\n");
781         rc = -ENODEV;
782         goto module_exit;
783     }
784 
785     rc = nv_drivers_init();
786     if (rc < 0)
787     {
788         goto module_exit;
789     }
790 
791     if (num_probed_nv_devices != count)
792     {
793         nv_printf(NV_DBG_ERRORS,
794             "NVRM: The NVIDIA probe routine was not called for %d device(s).\n",
795             count - num_probed_nv_devices);
796         nv_printf(NV_DBG_ERRORS,
797             "NVRM: This can occur when a driver such as\n"
798             "NVRM: nouveau, rivafb, nvidiafb, or rivatv\n"
799             "NVRM: was loaded and obtained ownership of the NVIDIA device(s).\n");
800         nv_printf(NV_DBG_ERRORS,
801             "NVRM: Try unloading the conflicting kernel module (and/or\n"
802             "NVRM: reconfigure your kernel without the conflicting\n"
803             "NVRM: driver(s)), then try loading the NVIDIA kernel module\n"
804             "NVRM: again.\n");
805     }
806 
807     if ((num_probed_nv_devices == 0) && (!is_nvswitch_present))
808     {
809         rc = -ENODEV;
810         nv_printf(NV_DBG_ERRORS, "NVRM: No NVIDIA devices probed.\n");
811         goto drivers_exit;
812     }
813 
814     if (num_probed_nv_devices != num_nv_devices)
815     {
816         nv_printf(NV_DBG_ERRORS,
817             "NVRM: The NVIDIA probe routine failed for %d device(s).\n",
818             num_probed_nv_devices - num_nv_devices);
819     }
820 
821     if ((num_nv_devices == 0) && (!is_nvswitch_present))
822     {
823         rc = -ENODEV;
824         nv_printf(NV_DBG_ERRORS,
825             "NVRM: None of the NVIDIA devices were initialized.\n");
826         goto drivers_exit;
827     }
828 
829     /*
830      * Initialize registry keys after PCI driver registration has
831      * completed successfully to support per-device module
832      * parameters.
833      */
834     nv_registry_keys_init(sp);
835 
836     nv_report_applied_patches();
837 
838     nv_printf(NV_DBG_ERRORS, "NVRM: loading %s\n", pNVRM_ID);
839 
840 #if defined(NV_UVM_ENABLE)
841     rc = nv_uvm_init();
842     if (rc != 0)
843     {
844         goto drivers_exit;
845     }
846 #endif
847 
848     __nv_init_sp = sp;
849 
850     return 0;
851 
852 drivers_exit:
853     nv_drivers_exit();
854 
855 module_exit:
856     nv_module_exit(sp);
857 
858 caps_root_exit:
859     nv_caps_root_exit();
860 
861 procfs_exit:
862     nv_procfs_exit();
863 
864     return rc;
865 }
866 
867 void nvidia_exit_module(void)
868 {
869     nvidia_stack_t *sp = __nv_init_sp;
870 
871 #if defined(NV_UVM_ENABLE)
872     nv_uvm_exit();
873 #endif
874 
875     nv_drivers_exit();
876 
877     nv_module_exit(sp);
878 
879     nv_caps_root_exit();
880 
881     nv_procfs_exit();
882 
883     nv_memdbg_exit();
884 }
885 
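/* Allocate per-open-file private data and initialize its locks and wait queue. */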
886 static void *nv_alloc_file_private(void)
887 {
888     nv_linux_file_private_t *nvlfp;
889     unsigned int i;
890 
891     NV_KZALLOC(nvlfp, sizeof(nv_linux_file_private_t));
892     if (!nvlfp)
893         return NULL;
894 
895     if (rm_is_altstack_in_use())
896     {
897         for (i = 0; i < NV_FOPS_STACK_INDEX_COUNT; ++i)
898         {
899             NV_INIT_MUTEX(&nvlfp->fops_sp_lock[i]);
900         }
901     }
902 
903     init_waitqueue_head(&nvlfp->waitqueue);
904     NV_SPIN_LOCK_INIT(&nvlfp->fp_lock);
905 
906     return nvlfp;
907 }
908 
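/* Free per-open-file private data, releasing any queued events and the mmap page array. */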
909 static void nv_free_file_private(nv_linux_file_private_t *nvlfp)
910 {
911     nvidia_event_t *nvet;
912 
913     if (nvlfp == NULL)
914         return;
915 
916     for (nvet = nvlfp->event_data_head; nvet != NULL; nvet = nvlfp->event_data_head)
917     {
918         nvlfp->event_data_head = nvlfp->event_data_head->next;
919         NV_KFREE(nvet, sizeof(nvidia_event_t));
920     }
921 
922     if (nvlfp->mmap_context.page_array != NULL)
923     {
924         os_free_mem(nvlfp->mmap_context.page_array);
925     }
926 
927     NV_KFREE(nvlfp, sizeof(nv_linux_file_private_t));
928 }
929 
930 
931 static int nv_is_control_device(
932     struct inode *inode
933 )
934 {
935     return (minor((inode)->i_rdev) == NV_CONTROL_DEVICE_MINOR);
936 }
937 
938 /*
939  * Search the global list of nv devices for the one with the given minor device
940  * number. If found, nvl is returned with nvl->ldata_lock taken.
941  */
942 static nv_linux_state_t *find_minor(NvU32 minor)
943 {
944     nv_linux_state_t *nvl;
945 
946     LOCK_NV_LINUX_DEVICES();
947     nvl = nv_linux_devices;
948     while (nvl != NULL)
949     {
950         if (nvl->minor_num == minor)
951         {
952             down(&nvl->ldata_lock);
953             break;
954         }
955         nvl = nvl->next;
956     }
957 
958     UNLOCK_NV_LINUX_DEVICES();
959     return nvl;
960 }
961 
962 /*
963  * Search the global list of nv devices for the one with the given gpu_id.
964  * If found, nvl is returned with nvl->ldata_lock taken.
965  */
966 static nv_linux_state_t *find_gpu_id(NvU32 gpu_id)
967 {
968     nv_linux_state_t *nvl;
969 
970     LOCK_NV_LINUX_DEVICES();
971     nvl = nv_linux_devices;
972     while (nvl != NULL)
973     {
974         nv_state_t *nv = NV_STATE_PTR(nvl);
975         if (nv->gpu_id == gpu_id)
976         {
977             down(&nvl->ldata_lock);
978             break;
979         }
980         nvl = nvl->next;
981     }
982 
983     UNLOCK_NV_LINUX_DEVICES();
984     return nvl;
985 }
986 
987 /*
988  * Search the global list of nv devices for the one with the given UUID. Devices
989  * with missing UUID information are ignored. If found, nvl is returned with
990  * nvl->ldata_lock taken.
991  */
992 nv_linux_state_t *find_uuid(const NvU8 *uuid)
993 {
994     nv_linux_state_t *nvl = NULL;
995     nv_state_t *nv;
996     const NvU8 *dev_uuid;
997 
998     LOCK_NV_LINUX_DEVICES();
999 
1000     for (nvl = nv_linux_devices; nvl; nvl = nvl->next)
1001     {
1002         nv = NV_STATE_PTR(nvl);
1003         down(&nvl->ldata_lock);
1004         dev_uuid = nv_get_cached_uuid(nv);
1005         if (dev_uuid && memcmp(dev_uuid, uuid, GPU_UUID_LEN) == 0)
1006             goto out;
1007         up(&nvl->ldata_lock);
1008     }
1009 
1010 out:
1011     UNLOCK_NV_LINUX_DEVICES();
1012     return nvl;
1013 }
1014 
1015 /*
1016  * Search the global list of nv devices. The search logic is:
1017  *
1018  * 1) If any device has the given UUID, return it
1019  *
1020  * 2) If no device has the given UUID but at least one device is missing
1021  *    its UUID (for example because rm_init_adapter has not run on it yet),
1022  *    return that device.
1023  *
1024  * 3) If no device has the given UUID and all UUIDs are present, return NULL.
1025  *
1026  * In cases 1 and 2, nvl is returned with nvl->ldata_lock taken.
1027  *
1028  * The reason for this unusual logic is that UUIDs aren't always available. See
1029  * bug 1642200.
1030  */
1031 static nv_linux_state_t *find_uuid_candidate(const NvU8 *uuid)
1032 {
1033     nv_linux_state_t *nvl = NULL;
1034     nv_state_t *nv;
1035     const NvU8 *dev_uuid;
1036     int use_missing;
1037     int has_missing = 0;
1038 
1039     LOCK_NV_LINUX_DEVICES();
1040 
1041     /*
1042      * Take two passes through the list. The first pass just looks for the UUID.
1043      * The second looks for the target or missing UUIDs. It would be nice if
1044      * this could be done in a single pass by remembering which nvls are missing
1045      * UUIDs, but we have to hold the nvl lock after we check for the UUID.
1046      */
1047     for (use_missing = 0; use_missing <= 1; use_missing++)
1048     {
1049         for (nvl = nv_linux_devices; nvl; nvl = nvl->next)
1050         {
1051             nv = NV_STATE_PTR(nvl);
1052             down(&nvl->ldata_lock);
1053             dev_uuid = nv_get_cached_uuid(nv);
1054             if (dev_uuid)
1055             {
1056                 /* Case 1: If a device has the given UUID, return it */
1057                 if (memcmp(dev_uuid, uuid, GPU_UUID_LEN) == 0)
1058                     goto out;
1059             }
1060             else
1061             {
1062                 /* Case 2: If no device has the given UUID but at least one
1063                  * device is missing its UUID, return that device. */
1064                 if (use_missing)
1065                     goto out;
1066                 has_missing = 1;
1067             }
1068             up(&nvl->ldata_lock);
1069         }
1070 
1071         /* Case 3: If no device has the given UUID and all UUIDs are present,
1072          * return NULL. */
1073         if (!has_missing)
1074             break;
1075     }
1076 
1077 out:
1078     UNLOCK_NV_LINUX_DEVICES();
1079     return nvl;
1080 }
1081 
1082 void nv_dev_free_stacks(nv_linux_state_t *nvl)
1083 {
1084     NvU32 i;
1085     for (i = 0; i < NV_DEV_STACK_COUNT; i++)
1086     {
1087         if (nvl->sp[i])
1088         {
1089             nv_kmem_cache_free_stack(nvl->sp[i]);
1090             nvl->sp[i] = NULL;
1091         }
1092     }
1093 }
1094 
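/* Allocate the NV_DEV_STACK_COUNT per-device alt-stacks, freeing them all if any allocation fails. */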
1095 static int nv_dev_alloc_stacks(nv_linux_state_t *nvl)
1096 {
1097     NvU32 i;
1098     int rc;
1099 
1100     for (i = 0; i < NV_DEV_STACK_COUNT; i++)
1101     {
1102         rc = nv_kmem_cache_alloc_stack(&nvl->sp[i]);
1103         if (rc != 0)
1104         {
1105             nv_dev_free_stacks(nvl);
1106             return rc;
1107         }
1108     }
1109 
1110     return 0;
1111 }
1112 
1113 static int validate_numa_start_state(nv_linux_state_t *nvl)
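/* When NUMA onlining is not disabled, require a nonzero memblock size and propagate it from the control device to this device. */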
1114 {
1115     int rc = 0;
1116     int numa_status = nv_get_numa_status(nvl);
1117 
1118     if (numa_status != NV_IOCTL_NUMA_STATUS_DISABLED)
1119     {
1120         if (nv_ctl_device.numa_memblock_size == 0)
1121         {
1122             nv_printf(NV_DBG_ERRORS, "NVRM: NUMA memblock size of zero "
1123                       "found during device start\n");
1124             rc = -EINVAL;
1125         }
1126         else
1127         {
1128             /* Keep the individual devices consistent with the control device */
1129             nvl->numa_memblock_size = nv_ctl_device.numa_memblock_size;
1130         }
1131     }
1132 
1133     return rc;
1134 }
1135 
1136 NV_STATUS NV_API_CALL nv_get_num_dpaux_instances(nv_state_t *nv, NvU32 *num_instances)
1137 {
1138     *num_instances = nv->num_dpaux_instance;
1139     return NV_OK;
1140 }
1141 
1142 void NV_API_CALL
1143 nv_schedule_uvm_isr(nv_state_t *nv)
1144 {
1145 #if defined(NV_UVM_ENABLE)
1146     nv_uvm_event_interrupt(nv_get_cached_uuid(nv));
1147 #endif
1148 }
1149 
1150 /*
1151  * Brings up the device on the first file open. Assumes nvl->ldata_lock is held.
1152  */
1153 static int nv_start_device(nv_state_t *nv, nvidia_stack_t *sp)
1154 {
1155     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
1156 #if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
1157     NvU32 msi_config = 0;
1158 #endif
1159     int rc = 0;
1160     NvBool kthread_init = NV_FALSE;
1161     NvBool power_ref = NV_FALSE;
1162 
1163     rc = nv_get_rsync_info();
1164     if (rc != 0)
1165     {
1166         return rc;
1167     }
1168 
1169     rc = validate_numa_start_state(nvl);
1170     if (rc != 0)
1171     {
1172         goto failed;
1173     }
1174 
1175     if (dev_is_pci(nvl->dev) && (nv->pci_info.device_id == 0))
1176     {
1177         nv_printf(NV_DBG_ERRORS, "NVRM: open of non-existent GPU with minor number %d\n", nvl->minor_num);
1178         rc = -ENXIO;
1179         goto failed;
1180     }
1181 
1182     if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
1183     {
1184         if (rm_ref_dynamic_power(sp, nv, NV_DYNAMIC_PM_COARSE) != NV_OK)
1185         {
1186             rc = -EINVAL;
1187             goto failed;
1188         }
1189         power_ref = NV_TRUE;
1190     }
1191     else
1192     {
1193         if (rm_ref_dynamic_power(sp, nv, NV_DYNAMIC_PM_FINE) != NV_OK)
1194         {
1195             rc = -EINVAL;
1196             goto failed;
1197         }
1198         power_ref = NV_TRUE;
1199     }
1200 
1201     rc = nv_init_ibmnpu_devices(nv);
1202     if (rc != 0)
1203     {
1204         nv_printf(NV_DBG_ERRORS,
1205             "NVRM: failed to initialize ibmnpu devices attached to GPU with minor number %d\n",
1206             nvl->minor_num);
1207         goto failed;
1208     }
1209 
1210     if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
1211     {
1212         rc = nv_dev_alloc_stacks(nvl);
1213         if (rc != 0)
1214             goto failed;
1215     }
1216 
1217 #if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
1218     if (dev_is_pci(nvl->dev))
1219     {
1220         if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
1221         {
1222             rm_read_registry_dword(sp, nv, NV_REG_ENABLE_MSI, &msi_config);
1223             if (msi_config == 1)
1224             {
1225                 if (pci_find_capability(nvl->pci_dev, PCI_CAP_ID_MSIX))
1226                 {
1227                     nv_init_msix(nv);
1228                 }
1229                 if (pci_find_capability(nvl->pci_dev, PCI_CAP_ID_MSI) &&
1230                     !(nv->flags & NV_FLAG_USES_MSIX))
1231                 {
1232                     nv_init_msi(nv);
1233                 }
1234             }
1235         }
1236     }
1237 #endif
1238 
1239     if (((!(nv->flags & NV_FLAG_USES_MSI)) && (!(nv->flags & NV_FLAG_USES_MSIX)))
1240         && (nv->interrupt_line == 0) && !(nv->flags & NV_FLAG_SOC_DISPLAY)
1241         && !(nv->flags & NV_FLAG_SOC_IGPU))
1242     {
1243         NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
1244                       "No interrupts of any type are available. Cannot use this GPU.\n");
1245         rc = -EIO;
1246         goto failed;
1247     }
1248 
1249     rc = 0;
1250     if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
1251     {
1252         if (nv->flags & NV_FLAG_SOC_DISPLAY)
1253         {
1254         }
1255         else if (!(nv->flags & NV_FLAG_USES_MSIX))
1256         {
1257             rc = request_threaded_irq(nv->interrupt_line, nvidia_isr,
1258                                   nvidia_isr_kthread_bh, nv_default_irq_flags(nv),
1259                                   nv_device_name, (void *)nvl);
1260         }
1261 #if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
1262         else
1263         {
1264             rc = nv_request_msix_irq(nvl);
1265         }
1266 #endif
1267     }
1268     if (rc != 0)
1269     {
1270         if ((nv->interrupt_line != 0) && (rc == -EBUSY))
1271         {
1272             NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
1273                 "Tried to get IRQ %d, but another driver\n",
1274                 (unsigned int) nv->interrupt_line);
1275             nv_printf(NV_DBG_ERRORS, "NVRM: has it and is not sharing it.\n");
1276             nv_printf(NV_DBG_ERRORS, "NVRM: You may want to verify that no audio driver");
1277             nv_printf(NV_DBG_ERRORS, " is using the IRQ.\n");
1278         }
1279         NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "request_irq() failed (%d)\n", rc);
1280         goto failed;
1281     }
1282 
1283     if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
1284     {
1285         rc = os_alloc_mutex(&nvl->isr_bh_unlocked_mutex);
1286         if (rc != 0)
1287             goto failed;
1288         nv_kthread_q_item_init(&nvl->bottom_half_q_item, nvidia_isr_bh_unlocked, (void *)nv);
1289         rc = nv_kthread_q_init(&nvl->bottom_half_q, nv_device_name);
1290         if (rc != 0)
1291             goto failed;
1292         kthread_init = NV_TRUE;
1293 
1294         rc = nv_kthread_q_init(&nvl->queue.nvk, "nv_queue");
1295         if (rc)
1296             goto failed;
1297         nv->queue = &nvl->queue;
1298     }
1299 
1300     if (!rm_init_adapter(sp, nv))
1301     {
1302         if (!(nv->flags & NV_FLAG_USES_MSIX) &&
1303             !(nv->flags & NV_FLAG_SOC_DISPLAY) &&
1304             !(nv->flags & NV_FLAG_SOC_IGPU))
1305         {
1306             free_irq(nv->interrupt_line, (void *) nvl);
1307         }
1308         else if (nv->flags & NV_FLAG_SOC_DISPLAY)
1309         {
1310         }
1311 #if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
1312         else
1313         {
1314             nv_free_msix_irq(nvl);
1315         }
1316 #endif
1317         NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
1318                       "rm_init_adapter failed, device minor number %d\n",
1319                       nvl->minor_num);
1320         rc = -EIO;
1321         goto failed;
1322     }
1323 
1324     {
1325         const NvU8 *uuid = rm_get_gpu_uuid_raw(sp, nv);
1326 
1327         if (uuid != NULL)
1328         {
1329 #if defined(NV_UVM_ENABLE)
1330             nv_uvm_notify_start_device(uuid);
1331 #endif
1332         }
1333     }
1334 
1335     if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
1336     {
1337         nv_acpi_register_notifier(nvl);
1338     }
1339 
1340     nv->flags |= NV_FLAG_OPEN;
1341 
1342     /*
1343      * Now that RM init is done, allow dynamic power to control the GPU in FINE
1344      * mode, if enabled.  (If the mode is COARSE, this unref will do nothing
1345      * which will cause the GPU to remain powered up.)
1346      * This is balanced by a FINE ref increment at the beginning of
1347      * nv_stop_device().
1348      */
1349     rm_unref_dynamic_power(sp, nv, NV_DYNAMIC_PM_FINE);
1350 
1351     return 0;
1352 
1353 failed:
1354 #if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
1355     if (nv->flags & NV_FLAG_USES_MSI)
1356     {
1357         nv->flags &= ~NV_FLAG_USES_MSI;
1358         NV_PCI_DISABLE_MSI(nvl->pci_dev);
1359         if (nvl->irq_count)
1360             NV_KFREE(nvl->irq_count, nvl->num_intr * sizeof(nv_irq_count_info_t));
1361     }
1362     else if (nv->flags & NV_FLAG_USES_MSIX)
1363     {
1364         nv->flags &= ~NV_FLAG_USES_MSIX;
1365         pci_disable_msix(nvl->pci_dev);
1366         NV_KFREE(nvl->irq_count, nvl->num_intr*sizeof(nv_irq_count_info_t));
1367         NV_KFREE(nvl->msix_entries, nvl->num_intr*sizeof(struct msix_entry));
1368     }
1369 
1370     if (nvl->msix_bh_mutex)
1371     {
1372         os_free_mutex(nvl->msix_bh_mutex);
1373         nvl->msix_bh_mutex = NULL;
1374     }
1375 #endif
1376 
1377     if (nv->queue && !(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
1378     {
1379         nv->queue = NULL;
1380         nv_kthread_q_stop(&nvl->queue.nvk);
1381     }
1382 
1383     if (kthread_init && !(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
1384         nv_kthread_q_stop(&nvl->bottom_half_q);
1385 
1386     if (nvl->isr_bh_unlocked_mutex)
1387     {
1388         os_free_mutex(nvl->isr_bh_unlocked_mutex);
1389         nvl->isr_bh_unlocked_mutex = NULL;
1390     }
1391 
1392     nv_dev_free_stacks(nvl);
1393 
1394     nv_unregister_ibmnpu_devices(nv);
1395 
1396     if (power_ref)
1397     {
1398         rm_unref_dynamic_power(sp, nv, NV_DYNAMIC_PM_COARSE);
1399     }
1400 
1401     nv_put_rsync_info();
1402 
1403     return rc;
1404 }
1405 
1406 /*
1407  * Makes sure the device is ready for operations and increases nvl->usage_count.
1408  * Assumes nvl->ldata_lock is held.
1409  */
1410 static int nv_open_device(nv_state_t *nv, nvidia_stack_t *sp)
1411 {
1412     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
1413     int rc;
1414     NV_STATUS status;
1415 
1416     if (os_is_vgx_hyper())
1417     {
1418         /* fail open if GPU is being unbound */
1419         if (nv->flags & NV_FLAG_UNBIND_LOCK)
1420         {
1421             NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
1422                           "Open failed as GPU is locked for unbind operation\n");
1423             return -ENODEV;
1424         }
1425     }
1426 
1427     NV_DEV_PRINTF(NV_DBG_INFO, nv, "Opening GPU with minor number %d\n",
1428                   nvl->minor_num);
1429 
1430     status = nv_check_gpu_state(nv);
1431     if (status == NV_ERR_GPU_IS_LOST)
1432     {
1433         NV_DEV_PRINTF(NV_DBG_INFO, nv, "Device in removal process\n");
1434         return -ENODEV;
1435     }
1436 
1437     if (unlikely(NV_ATOMIC_READ(nvl->usage_count) >= NV_S32_MAX))
1438         return -EMFILE;
1439 
1440     if ( ! (nv->flags & NV_FLAG_OPEN))
1441     {
1442         /* Sanity check: !NV_FLAG_OPEN requires usage_count == 0 */
1443         if (NV_ATOMIC_READ(nvl->usage_count) != 0)
1444         {
1445             NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
1446                           "Minor device %u is referenced without being open!\n",
1447                           nvl->minor_num);
1448             WARN_ON(1);
1449             return -EBUSY;
1450         }
1451 
1452         rc = nv_start_device(nv, sp);
1453         if (rc != 0)
1454             return rc;
1455     }
1456     else if (rm_is_device_sequestered(sp, nv))
1457     {
1458         /* Do not increment the usage count of sequestered devices. */
1459         NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "Device is currently unavailable\n");
1460         return -EBUSY;
1461     }
1462 
1463     NV_ATOMIC_INC(nvl->usage_count);
1464     return 0;
1465 }
1466 
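/*
 * Give the file its own struct address_space and add it to nvl->open_files so
 * that its mappings can later be revoked with unmap_mapping_range().
 */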
1467 static void nv_init_mapping_revocation(nv_linux_state_t *nvl,
1468                                        struct file *file,
1469                                        nv_linux_file_private_t *nvlfp,
1470                                        struct inode *inode)
1471 {
1472     down(&nvl->mmap_lock);
1473 
1474     /* Set up struct address_space for use with unmap_mapping_range() */
1475     address_space_init_once(&nvlfp->mapping);
1476     nvlfp->mapping.host = inode;
1477     nvlfp->mapping.a_ops = inode->i_mapping->a_ops;
1478 #if defined(NV_ADDRESS_SPACE_HAS_BACKING_DEV_INFO)
1479     nvlfp->mapping.backing_dev_info = inode->i_mapping->backing_dev_info;
1480 #endif
1481     file->f_mapping = &nvlfp->mapping;
1482 
1483     /* Add nvlfp to list of open files in nvl for mapping revocation */
1484     list_add(&nvlfp->entry, &nvl->open_files);
1485 
1486     up(&nvl->mmap_lock);
1487 }
1488 
1489 /*
1490 ** nvidia_open
1491 **
1492 ** nv driver open entry point.  Sessions are created here.
1493 */
1494 int
1495 nvidia_open(
1496     struct inode *inode,
1497     struct file *file
1498 )
1499 {
1500     nv_state_t *nv = NULL;
1501     nv_linux_state_t *nvl = NULL;
1502     int rc = 0;
1503     nv_linux_file_private_t *nvlfp = NULL;
1504     nvidia_stack_t *sp = NULL;
1505     unsigned int i;
1506     unsigned int k;
1507 
1508     nv_printf(NV_DBG_INFO, "NVRM: nvidia_open...\n");
1509 
1510     nvlfp = nv_alloc_file_private();
1511     if (nvlfp == NULL)
1512     {
1513         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate file private!\n");
1514         return -ENOMEM;
1515     }
1516 
1517     rc = nv_kmem_cache_alloc_stack(&sp);
1518     if (rc != 0)
1519     {
1520         nv_free_file_private(nvlfp);
1521         return rc;
1522     }
1523 
1524     for (i = 0; i < NV_FOPS_STACK_INDEX_COUNT; ++i)
1525     {
1526         rc = nv_kmem_cache_alloc_stack(&nvlfp->fops_sp[i]);
1527         if (rc != 0)
1528         {
1529             nv_kmem_cache_free_stack(sp);
1530             for (k = 0; k < i; ++k)
1531             {
1532                 nv_kmem_cache_free_stack(nvlfp->fops_sp[k]);
1533             }
1534             nv_free_file_private(nvlfp);
1535             return rc;
1536         }
1537     }
1538 
1539     NV_SET_FILE_PRIVATE(file, nvlfp);
1540     nvlfp->sp = sp;
1541 
1542     /* for control device, just jump to its open routine */
1543     /* after setting up the private data */
1544     if (nv_is_control_device(inode))
1545     {
1546         rc = nvidia_ctl_open(inode, file);
1547         if (rc != 0)
1548             goto failed;
1549         return rc;
1550     }
1551 
1552     rc = nv_down_read_interruptible(&nv_system_pm_lock);
1553     if (rc < 0)
1554         goto failed;
1555 
1556     /* Takes nvl->ldata_lock */
1557     nvl = find_minor(NV_DEVICE_MINOR_NUMBER(inode));
1558     if (!nvl)
1559     {
1560         rc = -ENODEV;
1561         up_read(&nv_system_pm_lock);
1562         goto failed;
1563     }
1564 
1565     nvlfp->nvptr = nvl;
1566     nv = NV_STATE_PTR(nvl);
1567 
1568     if ((nv->flags & NV_FLAG_EXCLUDE) != 0)
1569     {
1570         char *uuid = rm_get_gpu_uuid(sp, nv);
1571         NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
1572                       "open() not permitted for excluded %s\n",
1573                       (uuid != NULL) ? uuid : "GPU");
1574         if (uuid != NULL)
1575             os_free_mem(uuid);
1576         rc = -EPERM;
1577         goto failed1;
1578     }
1579 
1580     rc = nv_open_device(nv, sp);
1581     /* Fall-through on error */
1582 
1583     nv_assert_not_in_gpu_exclusion_list(sp, nv);
1584 
1585 failed1:
1586     up(&nvl->ldata_lock);
1587 
1588     up_read(&nv_system_pm_lock);
1589 failed:
1590     if (rc != 0)
1591     {
1592         if (nvlfp != NULL)
1593         {
1594             nv_kmem_cache_free_stack(sp);
1595             for (i = 0; i < NV_FOPS_STACK_INDEX_COUNT; ++i)
1596             {
1597                 nv_kmem_cache_free_stack(nvlfp->fops_sp[i]);
1598             }
1599             nv_free_file_private(nvlfp);
1600             NV_SET_FILE_PRIVATE(file, NULL);
1601         }
1602     }
1603     else
1604     {
1605         nv_init_mapping_revocation(nvl, file, nvlfp, inode);
1606     }
1607 
1608     return rc;
1609 }
1610 
1611 static void validate_numa_shutdown_state(nv_linux_state_t *nvl)
1612 {
1613     int numa_status = nv_get_numa_status(nvl);
1614     WARN_ON((numa_status != NV_IOCTL_NUMA_STATUS_OFFLINE) &&
1615             (numa_status != NV_IOCTL_NUMA_STATUS_DISABLED));
1616 }
1617 
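/*
 * Disable the adapter, stop its bottom-half queues, release its IRQ/MSI/MSI-X
 * resources and shut down the RM adapter state.
 */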
1618 void nv_shutdown_adapter(nvidia_stack_t *sp,
1619                          nv_state_t *nv,
1620                          nv_linux_state_t *nvl)
1621 {
1622     validate_numa_shutdown_state(nvl);
1623 
1624     rm_disable_adapter(sp, nv);
1625 
1626     // It's safe to call nv_kthread_q_stop even if queue is not initialized
1627     nv_kthread_q_stop(&nvl->bottom_half_q);
1628 
1629     if (nv->queue != NULL)
1630     {
1631         nv->queue = NULL;
1632         nv_kthread_q_stop(&nvl->queue.nvk);
1633     }
1634 
1635     if (nvl->isr_bh_unlocked_mutex)
1636     {
1637         os_free_mutex(nvl->isr_bh_unlocked_mutex);
1638         nvl->isr_bh_unlocked_mutex = NULL;
1639     }
1640 
1641     if (!(nv->flags & NV_FLAG_USES_MSIX) &&
1642         !(nv->flags & NV_FLAG_SOC_DISPLAY) &&
1643         !(nv->flags & NV_FLAG_SOC_IGPU))
1644     {
1645         free_irq(nv->interrupt_line, (void *)nvl);
1646         if (nv->flags & NV_FLAG_USES_MSI)
1647         {
1648             NV_PCI_DISABLE_MSI(nvl->pci_dev);
1649             if (nvl->irq_count)
1650                 NV_KFREE(nvl->irq_count, nvl->num_intr * sizeof(nv_irq_count_info_t));
1651         }
1652     }
1653     else if (nv->flags & NV_FLAG_SOC_DISPLAY)
1654     {
1655     }
1656 #if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
1657     else
1658     {
1659         nv_free_msix_irq(nvl);
1660         pci_disable_msix(nvl->pci_dev);
1661         nv->flags &= ~NV_FLAG_USES_MSIX;
1662         NV_KFREE(nvl->msix_entries, nvl->num_intr*sizeof(struct msix_entry));
1663         NV_KFREE(nvl->irq_count, nvl->num_intr*sizeof(nv_irq_count_info_t));
1664     }
1665 #endif
1666 
1667     if (nvl->msix_bh_mutex)
1668     {
1669         os_free_mutex(nvl->msix_bh_mutex);
1670         nvl->msix_bh_mutex = NULL;
1671     }
1672 
1673     rm_shutdown_adapter(sp, nv);
1674 }
1675 
1676 /*
1677  * Tears down the device on the last file close. Assumes nvl->ldata_lock is
1678  * held.
1679  */
1680 static void nv_stop_device(nv_state_t *nv, nvidia_stack_t *sp)
1681 {
1682     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
1683     static int persistence_mode_notice_logged;
1684 
1685     /*
1686      * The GPU needs to be powered on to go through the teardown sequence.
1687      * This balances the FINE unref at the end of nv_start_device().
1688      */
1689     rm_ref_dynamic_power(sp, nv, NV_DYNAMIC_PM_FINE);
1690 
1691 #if defined(NV_UVM_ENABLE)
1692     {
1693         const NvU8* uuid;
1694         // Inform UVM before disabling adapter. Use cached copy
1695         uuid = nv_get_cached_uuid(nv);
1696         if (uuid != NULL)
1697         {
1698             // this function cannot fail
1699             nv_uvm_notify_stop_device(uuid);
1700         }
1701     }
1702 #endif
1703     /* The adapter has already been shut down as part of nvidia_pci_remove */
1704     if (!nv->removed)
1705     {
1706         if (nv->flags & NV_FLAG_PERSISTENT_SW_STATE)
1707         {
1708             rm_disable_adapter(sp, nv);
1709         }
1710         else
1711         {
1712             nv_acpi_unregister_notifier(nvl);
1713             nv_shutdown_adapter(sp, nv, nvl);
1714         }
1715     }
1716 
1717     if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
1718     {
1719         nv_dev_free_stacks(nvl);
1720     }
1721 
1722     if ((nv->flags & NV_FLAG_PERSISTENT_SW_STATE) &&
1723         (!persistence_mode_notice_logged) && (!os_is_vgx_hyper()))
1724     {
1725         nv_printf(NV_DBG_ERRORS, "NVRM: Persistence mode is deprecated and"
1726                   " will be removed in a future release. Please use"
1727                   " nvidia-persistenced instead.\n");
1728         persistence_mode_notice_logged = 1;
1729     }
1730 
1731     /* leave INIT flag alone so we don't reinit every time */
1732     nv->flags &= ~NV_FLAG_OPEN;
1733 
1734     nv_unregister_ibmnpu_devices(nv);
1735 
1736     if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
1737     {
1738         rm_unref_dynamic_power(sp, nv, NV_DYNAMIC_PM_COARSE);
1739     }
1740     else
1741     {
1742         /* If in legacy persistence mode, only unref FINE refcount. */
1743         rm_unref_dynamic_power(sp, nv, NV_DYNAMIC_PM_FINE);
1744     }
1745 
1746     nv_put_rsync_info();
1747 }
1748 
1749 /*
1750  * Decreases nvl->usage_count, stopping the device when it reaches 0. Assumes
1751  * nvl->ldata_lock is held.
1752  */
1753 static void nv_close_device(nv_state_t *nv, nvidia_stack_t *sp)
1754 {
1755     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
1756 
1757     if (NV_ATOMIC_READ(nvl->usage_count) == 0)
1758     {
1759         nv_printf(NV_DBG_ERRORS,
1760                   "NVRM: Attempting to close unopened minor device %u!\n",
1761                   nvl->minor_num);
1762         WARN_ON(1);
1763         return;
1764     }
1765 
1766     if (NV_ATOMIC_DEC_AND_TEST(nvl->usage_count))
1767         nv_stop_device(nv, sp);
1768 }
1769 
1770 /*
1771 ** nvidia_close
1772 **
1773 ** Primary driver close entry point.
1774 */
1775 
1776 static void
1777 nvidia_close_callback(
1778    nv_linux_file_private_t *nvlfp
1779 )
1780 {
1781     nv_linux_state_t *nvl = nvlfp->nvptr;
1782     nv_state_t *nv = NV_STATE_PTR(nvl);
1783     nvidia_stack_t *sp = nvlfp->sp;
1784     unsigned int i;
1785     NvBool bRemove = NV_FALSE;
1786 
1787     rm_cleanup_file_private(sp, nv, &nvlfp->nvfp);
1788 
1789     down(&nvl->mmap_lock);
1790     list_del(&nvlfp->entry);
1791     up(&nvl->mmap_lock);
1792 
1793     down(&nvl->ldata_lock);
1794     nv_close_device(nv, sp);
1795 
1796     bRemove = (!NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv)) &&
1797               (NV_ATOMIC_READ(nvl->usage_count) == 0) &&
1798               rm_get_device_remove_flag(sp, nv->gpu_id);
1799 
1800     for (i = 0; i < NV_FOPS_STACK_INDEX_COUNT; ++i)
1801     {
1802         nv_kmem_cache_free_stack(nvlfp->fops_sp[i]);
1803     }
1804 
1805     nv_free_file_private(nvlfp);
1806 
1807     /*
1808      * On surprise removal of the device there are two cases:
1809      *
1810      * 1) nvidia_pci_remove runs before nvidia_close.
1811      * nvidia_pci_remove does not destroy the Linux-layer locks or the
1812      * nv_linux_state_t struct, but sets nv->removed for nvidia_close.
1813      * Once all clients have closed, the last nvidia_close cleans up the
1814      * Linux-layer locks and the nv_linux_state_t struct.
1815      *
1816      * 2) nvidia_close runs before nvidia_pci_remove.
1817      * This is treated as the normal case: nvidia_close does not clean up
1818      * the Linux-layer locks or the nv_linux_state_t struct;
1819      * nvidia_pci_remove performs the necessary cleanup when it runs.
1820      */
1821     if ((NV_ATOMIC_READ(nvl->usage_count) == 0) && nv->removed)
1822     {
1823         nvidia_frontend_remove_device((void *)&nv_fops, nvl);
1824         nv_lock_destroy_locks(sp, nv);
1825         NV_KFREE(nvl, sizeof(nv_linux_state_t));
1826     }
1827     else
1828     {
1829         up(&nvl->ldata_lock);
1830 
1831 #if defined(NV_PCI_STOP_AND_REMOVE_BUS_DEVICE)
1832         if (bRemove)
1833         {
1834             NV_PCI_STOP_AND_REMOVE_BUS_DEVICE(nvl->pci_dev);
1835         }
1836 #endif
1837     }
1838 
1839     nv_kmem_cache_free_stack(sp);
1840 }
1841 
1842 static void nvidia_close_deferred(void *data)
1843 {
1844     nv_linux_file_private_t *nvlfp = data;
1845 
1846     down_read(&nv_system_pm_lock);
1847 
1848     nvidia_close_callback(nvlfp);
1849 
1850     up_read(&nv_system_pm_lock);
1851 }
1852 
1853 int
1854 nvidia_close(
1855     struct inode *inode,
1856     struct file *file
1857 )
1858 {
1859     int rc;
1860     nv_linux_file_private_t *nvlfp = NV_GET_LINUX_FILE_PRIVATE(file);
1861     nv_linux_state_t *nvl = nvlfp->nvptr;
1862     nv_state_t *nv = NV_STATE_PTR(nvl);
1863 
1864     NV_DEV_PRINTF(NV_DBG_INFO, nv, "nvidia_close on GPU with minor number %d\n", NV_DEVICE_MINOR_NUMBER(inode));
1865 
1866     if (nv_is_control_device(inode))
1867     {
1868         return nvidia_ctl_close(inode, file);
1869     }
1870 
1871     NV_SET_FILE_PRIVATE(file, NULL);
1872 
1873     rc = nv_down_read_interruptible(&nv_system_pm_lock);
1874     if (rc == 0)
1875     {
1876         nvidia_close_callback(nvlfp);
1877         up_read(&nv_system_pm_lock);
1878     }
1879     else
1880     {
1881         nv_kthread_q_item_init(&nvlfp->deferred_close_q_item,
1882                                nvidia_close_deferred,
1883                                nvlfp);
1884         rc = nv_kthread_q_schedule_q_item(&nv_deferred_close_kthread_q,
1885                                           &nvlfp->deferred_close_q_item);
1886         WARN_ON(rc == 0);
1887     }
1888 
1889     return 0;
1890 }
1891 
1892 unsigned int
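/* poll() entry point: reports POLLPRI | POLLIN when an event is pending for this file. */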
1893 nvidia_poll(
1894     struct file *file,
1895     poll_table  *wait
1896 )
1897 {
1898     unsigned int mask = 0;
1899     nv_linux_file_private_t *nvlfp = NV_GET_LINUX_FILE_PRIVATE(file);
1900     unsigned long eflags;
1901     nv_linux_state_t *nvl = NV_GET_NVL_FROM_FILEP(file);
1902     nv_state_t *nv = NV_STATE_PTR(nvl);
1903     NV_STATUS status;
1904 
1905     status = nv_check_gpu_state(nv);
1906     if (status == NV_ERR_GPU_IS_LOST)
1907     {
1908         NV_DEV_PRINTF(NV_DBG_INFO, nv, "GPU is lost, skipping nvidia_poll\n");
1909         return POLLHUP;
1910     }
1911 
1912     if ((file->f_flags & O_NONBLOCK) == 0)
1913         poll_wait(file, &nvlfp->waitqueue, wait);
1914 
1915     NV_SPIN_LOCK_IRQSAVE(&nvlfp->fp_lock, eflags);
1916 
1917     if ((nvlfp->event_data_head != NULL) || nvlfp->dataless_event_pending)
1918     {
1919         mask = (POLLPRI | POLLIN);
1920         nvlfp->dataless_event_pending = NV_FALSE;
1921     }
1922 
1923     NV_SPIN_UNLOCK_IRQRESTORE(&nvlfp->fp_lock, eflags);
1924 
1925     return mask;
1926 }
1927 
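/*
 * These helpers make the enclosing ioctl handler fail with -EINVAL (via its
 * local 'status' and 'done' label) unless the target is, respectively, the
 * control device or an actual GPU device.
 */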
1928 #define NV_CTL_DEVICE_ONLY(nv)                 \
1929 {                                              \
1930     if (((nv)->flags & NV_FLAG_CONTROL) == 0)  \
1931     {                                          \
1932         status = -EINVAL;                      \
1933         goto done;                             \
1934     }                                          \
1935 }
1936 
1937 #define NV_ACTUAL_DEVICE_ONLY(nv)              \
1938 {                                              \
1939     if (((nv)->flags & NV_FLAG_CONTROL) != 0)  \
1940     {                                          \
1941         status = -EINVAL;                      \
1942         goto done;                             \
1943     }                                          \
1944 }
1945 
1946 /*
1947  * Fills the ci array with the state of num_entries devices. Returns -EINVAL if
1948  * num_entries isn't big enough to hold all available devices.
1949  */
1950 static int nvidia_read_card_info(nv_ioctl_card_info_t *ci, size_t num_entries)
1951 {
1952     nv_state_t *nv;
1953     nv_linux_state_t *nvl;
1954     size_t i = 0;
1955     int rc = 0;
1956 
1957     /* Clear each card's flags field the lazy way */
1958     memset(ci, 0, num_entries * sizeof(ci[0]));
1959 
1960     LOCK_NV_LINUX_DEVICES();
1961 
1962     if (num_entries < num_nv_devices)
1963     {
1964         rc = -EINVAL;
1965         goto out;
1966     }
1967 
1968     for (nvl = nv_linux_devices; nvl && i < num_entries; nvl = nvl->next)
1969     {
1970         nv = NV_STATE_PTR(nvl);
1971 
1972         /* We do not include excluded GPUs in the list... */
1973         if ((nv->flags & NV_FLAG_EXCLUDE) != 0)
1974             continue;
1975 
1976         ci[i].valid              = NV_TRUE;
1977         ci[i].pci_info.domain    = nv->pci_info.domain;
1978         ci[i].pci_info.bus       = nv->pci_info.bus;
1979         ci[i].pci_info.slot      = nv->pci_info.slot;
1980         ci[i].pci_info.vendor_id = nv->pci_info.vendor_id;
1981         ci[i].pci_info.device_id = nv->pci_info.device_id;
1982         ci[i].gpu_id             = nv->gpu_id;
1983         ci[i].interrupt_line     = nv->interrupt_line;
1984         ci[i].reg_address        = nv->regs->cpu_address;
1985         ci[i].reg_size           = nv->regs->size;
1986         ci[i].minor_number       = nvl->minor_num;
1987         if (dev_is_pci(nvl->dev))
1988         {
1989             ci[i].fb_address         = nv->fb->cpu_address;
1990             ci[i].fb_size            = nv->fb->size;
1991         }
1992         i++;
1993     }
1994 
1995 out:
1996     UNLOCK_NV_LINUX_DEVICES();
1997     return rc;
1998 }
1999 
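/*
 * Main ioctl dispatch: decodes the command (including the NV_ESC_IOCTL_XFER_CMD
 * indirection used for large arguments), copies the argument buffer in, handles
 * the escape locally or forwards it to rm_ioctl(), and copies the buffer back
 * out unless a copy fault occurred.
 */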
2000 int
2001 nvidia_ioctl(
2002     struct inode *inode,
2003     struct file *file,
2004     unsigned int cmd,
2005     unsigned long i_arg)
2006 {
2007     NV_STATUS rmStatus;
2008     int status = 0;
2009     nv_linux_state_t *nvl = NV_GET_NVL_FROM_FILEP(file);
2010     nv_state_t *nv = NV_STATE_PTR(nvl);
2011     nv_linux_file_private_t *nvlfp = NV_GET_LINUX_FILE_PRIVATE(file);
2012     nvidia_stack_t *sp = NULL;
2013     nv_ioctl_xfer_t ioc_xfer;
2014     void *arg_ptr = (void *) i_arg;
2015     void *arg_copy = NULL;
2016     size_t arg_size = 0;
2017     int arg_cmd;
2018 
2019     nv_printf(NV_DBG_INFO, "NVRM: ioctl(0x%x, 0x%x, 0x%x)\n",
2020         _IOC_NR(cmd), (unsigned int) i_arg, _IOC_SIZE(cmd));
2021 
2022     status = nv_down_read_interruptible(&nv_system_pm_lock);
2023     if (status < 0)
2024         return status;
2025 
2026     sp = nv_nvlfp_get_sp(nvlfp, NV_FOPS_STACK_INDEX_IOCTL);
2027 
2028     rmStatus = nv_check_gpu_state(nv);
2029     if (rmStatus == NV_ERR_GPU_IS_LOST)
2030     {
2031         nv_printf(NV_DBG_INFO, "NVRM: GPU is lost, skipping nvidia_ioctl\n");
2032         status = -EINVAL;
2033         goto done;
2034     }
2035 
2036     arg_size = _IOC_SIZE(cmd);
2037     arg_cmd  = _IOC_NR(cmd);
2038 
2039     if (arg_cmd == NV_ESC_IOCTL_XFER_CMD)
2040     {
2041         if (arg_size != sizeof(nv_ioctl_xfer_t))
2042         {
2043             nv_printf(NV_DBG_ERRORS,
2044                     "NVRM: invalid ioctl XFER structure size!\n");
2045             status = -EINVAL;
2046             goto done;
2047         }
2048 
2049         if (NV_COPY_FROM_USER(&ioc_xfer, arg_ptr, sizeof(ioc_xfer)))
2050         {
2051             nv_printf(NV_DBG_ERRORS,
2052                     "NVRM: failed to copy in ioctl XFER data!\n");
2053             status = -EFAULT;
2054             goto done;
2055         }
2056 
2057         arg_cmd  = ioc_xfer.cmd;
2058         arg_size = ioc_xfer.size;
2059         arg_ptr  = NvP64_VALUE(ioc_xfer.ptr);
2060 
2061         if (arg_size > NV_ABSOLUTE_MAX_IOCTL_SIZE)
2062         {
2063             nv_printf(NV_DBG_ERRORS, "NVRM: invalid ioctl XFER size!\n");
2064             status = -EINVAL;
2065             goto done;
2066         }
2067     }
2068 
2069     NV_KMALLOC(arg_copy, arg_size);
2070     if (arg_copy == NULL)
2071     {
2072         nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate ioctl memory\n");
2073         status = -ENOMEM;
2074         goto done;
2075     }
2076 
2077     if (NV_COPY_FROM_USER(arg_copy, arg_ptr, arg_size))
2078     {
2079         nv_printf(NV_DBG_ERRORS, "NVRM: failed to copy in ioctl data!\n");
2080         status = -EFAULT;
2081         goto done;
2082     }
2083 
2084     switch (arg_cmd)
2085     {
2086         case NV_ESC_QUERY_DEVICE_INTR:
2087         {
2088             nv_ioctl_query_device_intr *query_intr = arg_copy;
2089 
2090             NV_ACTUAL_DEVICE_ONLY(nv);
2091 
2092             if ((arg_size < sizeof(*query_intr)) ||
2093                 (!nv->regs->map))
2094             {
2095                 status = -EINVAL;
2096                 goto done;
2097             }
2098 
2099             query_intr->intrStatus =
2100                 *(nv->regs->map + (NV_RM_DEVICE_INTR_ADDRESS >> 2));
2101             query_intr->status = NV_OK;
2102             break;
2103         }
2104 
2105         /* pass out info about the card */
2106         case NV_ESC_CARD_INFO:
2107         {
2108             size_t num_arg_devices = arg_size / sizeof(nv_ioctl_card_info_t);
2109 
2110             NV_CTL_DEVICE_ONLY(nv);
2111 
2112             status = nvidia_read_card_info(arg_copy, num_arg_devices);
2113             break;
2114         }
2115 
2116         case NV_ESC_ATTACH_GPUS_TO_FD:
2117         {
2118             size_t num_arg_gpus = arg_size / sizeof(NvU32);
2119             size_t i;
2120 
2121             NV_CTL_DEVICE_ONLY(nv);
2122 
2123             if (num_arg_gpus == 0 || nvlfp->num_attached_gpus != 0 ||
2124                 arg_size % sizeof(NvU32) != 0)
2125             {
2126                 status = -EINVAL;
2127                 goto done;
2128             }
2129 
2130             NV_KMALLOC(nvlfp->attached_gpus, arg_size);
2131             if (nvlfp->attached_gpus == NULL)
2132             {
2133                 status = -ENOMEM;
2134                 goto done;
2135             }
2136             memcpy(nvlfp->attached_gpus, arg_copy, arg_size);
2137             nvlfp->num_attached_gpus = num_arg_gpus;
2138 
2139             for (i = 0; i < nvlfp->num_attached_gpus; i++)
2140             {
2141                 if (nvlfp->attached_gpus[i] == 0)
2142                 {
2143                     continue;
2144                 }
2145 
2146                 if (nvidia_dev_get(nvlfp->attached_gpus[i], sp))
2147                 {
2148                     while (i--)
2149                     {
2150                         if (nvlfp->attached_gpus[i] != 0)
2151                             nvidia_dev_put(nvlfp->attached_gpus[i], sp);
2152                     }
2153                     NV_KFREE(nvlfp->attached_gpus, arg_size);
2154                     nvlfp->num_attached_gpus = 0;
2155 
2156                     status = -EINVAL;
2157                     break;
2158                 }
2159             }
2160 
2161             break;
2162         }
2163 
2164         case NV_ESC_CHECK_VERSION_STR:
2165         {
2166             NV_CTL_DEVICE_ONLY(nv);
2167 
2168             rmStatus = rm_perform_version_check(sp, arg_copy, arg_size);
2169             status = ((rmStatus == NV_OK) ? 0 : -EINVAL);
2170             break;
2171         }
2172 
2173         case NV_ESC_SYS_PARAMS:
2174         {
2175             nv_ioctl_sys_params_t *api = arg_copy;
2176 
2177             NV_CTL_DEVICE_ONLY(nv);
2178 
2179             if (arg_size != sizeof(nv_ioctl_sys_params_t))
2180             {
2181                 status = -EINVAL;
2182                 goto done;
2183             }
2184 
2185             /* numa_memblock_size should only be set once */
2186             if (nvl->numa_memblock_size == 0)
2187             {
2188                 nvl->numa_memblock_size = api->memblock_size;
2189             }
2190             else
2191             {
2192                 status = (nvl->numa_memblock_size == api->memblock_size) ?
2193                     0 : -EBUSY;
2194                 goto done;
2195             }
2196             break;
2197         }
2198 
2199         case NV_ESC_NUMA_INFO:
2200         {
2201             nv_ioctl_numa_info_t *api = arg_copy;
2202             rmStatus = NV_OK;
2203 
2204             NV_ACTUAL_DEVICE_ONLY(nv);
2205 
2206             if (arg_size != sizeof(nv_ioctl_numa_info_t))
2207             {
2208                 status = -EINVAL;
2209                 goto done;
2210             }
2211 
2212             api->offline_addresses.numEntries =
2213                 ARRAY_SIZE(api->offline_addresses.addresses);
2214 
2215             rmStatus = rm_get_gpu_numa_info(sp, nv,
2216                 &(api->nid),
2217                 &(api->numa_mem_addr),
2218                 &(api->numa_mem_size),
2219                  (api->offline_addresses.addresses),
2220                 &(api->offline_addresses.numEntries));
2221             if (rmStatus != NV_OK)
2222             {
2223                 status = -EBUSY;
2224                 goto done;
2225             }
2226 
2227             api->status = nv_get_numa_status(nvl);
2228             api->memblock_size = nv_ctl_device.numa_memblock_size;
2229             break;
2230         }
2231 
2232         case NV_ESC_SET_NUMA_STATUS:
2233         {
2234             nv_ioctl_set_numa_status_t *api = arg_copy;
2235             rmStatus = NV_OK;
2236 
2237             if (!NV_IS_SUSER())
2238             {
2239                 status = -EACCES;
2240                 goto done;
2241             }
2242 
2243             NV_ACTUAL_DEVICE_ONLY(nv);
2244 
2245             if (arg_size != sizeof(nv_ioctl_set_numa_status_t))
2246             {
2247                 status = -EINVAL;
2248                 goto done;
2249             }
2250 
2251             /*
2252              * The nv_linux_state_t for the device needs to be locked
2253              * in order to prevent additional open()/close() calls from
2254              * manipulating the usage count for the device while we
2255              * determine if NUMA state can be changed.
2256              */
2257             down(&nvl->ldata_lock);
2258 
2259             if (nv_get_numa_status(nvl) != api->status)
2260             {
2261                 if (api->status == NV_IOCTL_NUMA_STATUS_OFFLINE_IN_PROGRESS)
2262                 {
2263                     /*
2264                      * Only the current client should have an open file
2265                      * descriptor for the device, to allow safe offlining.
2266                      */
2267                     if (NV_ATOMIC_READ(nvl->usage_count) > 1)
2268                     {
2269                         status = -EBUSY;
2270                         goto unlock;
2271                     }
2272                     else
2273                     {
2274                         /*
2275                          * If this call fails, it indicates that RM
2276                          * is not ready to offline memory, and we should keep
2277                          * the current NUMA status of ONLINE.
2278                          */
2279                         rmStatus = rm_gpu_numa_offline(sp, nv);
2280                         if (rmStatus != NV_OK)
2281                         {
2282                             status = -EBUSY;
2283                             goto unlock;
2284                         }
2285                     }
2286                 }
2287 
2288                 status = nv_set_numa_status(nvl, api->status);
2289                 if (status < 0)
2290                 {
2291                     if (api->status == NV_IOCTL_NUMA_STATUS_OFFLINE_IN_PROGRESS)
2292                         (void) rm_gpu_numa_online(sp, nv);
2293                     goto unlock;
2294                 }
2295 
2296                 if (api->status == NV_IOCTL_NUMA_STATUS_ONLINE)
2297                 {
2298                     rmStatus = rm_gpu_numa_online(sp, nv);
2299                     if (rmStatus != NV_OK)
2300                     {
2301                         status = -EBUSY;
2302                         goto unlock;
2303                     }
2304                 }
2305             }
2306 
2307 unlock:
2308             up(&nvl->ldata_lock);
2309 
2310             break;
2311         }
2312 
2313         case NV_ESC_EXPORT_TO_DMABUF_FD:
2314         {
2315             nv_ioctl_export_to_dma_buf_fd_t *params = arg_copy;
2316 
2317             if (arg_size != sizeof(nv_ioctl_export_to_dma_buf_fd_t))
2318             {
2319                 status = -EINVAL;
2320                 goto done;
2321             }
2322 
2323             NV_ACTUAL_DEVICE_ONLY(nv);
2324 
2325             params->status = nv_dma_buf_export(nv, params);
2326 
2327             break;
2328         }
2329 
2330         default:
2331             rmStatus = rm_ioctl(sp, nv, &nvlfp->nvfp, arg_cmd, arg_copy, arg_size);
2332             status = ((rmStatus == NV_OK) ? 0 : -EINVAL);
2333             break;
2334     }
2335 
2336 done:
2337     nv_nvlfp_put_sp(nvlfp, NV_FOPS_STACK_INDEX_IOCTL);
2338 
2339     up_read(&nv_system_pm_lock);
2340 
2341     if (arg_copy != NULL)
2342     {
2343         if (status != -EFAULT)
2344         {
2345             if (NV_COPY_TO_USER(arg_ptr, arg_copy, arg_size))
2346             {
2347                 nv_printf(NV_DBG_ERRORS, "NVRM: failed to copy out ioctl data\n");
2348                 status = -EFAULT;
2349             }
2350         }
2351         NV_KFREE(arg_copy, arg_size);
2352     }
2353 
2354     return status;
2355 }
2356 
2357 irqreturn_t
2358 nvidia_isr_msix(
2359     int   irq,
2360     void *arg
2361 )
2362 {
2363     irqreturn_t ret;
2364     nv_linux_state_t *nvl = (void *) arg;
2365 
2366     // nvidia_isr_msix() is called for each of the MSI-X vectors and they can
2367     // run in parallel on different CPUs (cores), but this is not currently
2368     // supported by nvidia_isr() and its children. As a big hammer fix just
2369     // spinlock around the nvidia_isr() call to serialize them.
2370     //
2371     // At this point interrupts are disabled on the CPU running our ISR (see
2372     // comments for nv_default_irq_flags()) so a plain spinlock is enough.
2373     NV_SPIN_LOCK(&nvl->msix_isr_lock);
2374 
2375     ret = nvidia_isr(irq, arg);
2376 
2377     NV_SPIN_UNLOCK(&nvl->msix_isr_lock);
2378 
2379     return ret;
2380 }
2381 
2382 /*
2383  * driver receives an interrupt
2384  *    if someone waiting, then hand it off.
2385  */
2386 irqreturn_t
2387 nvidia_isr(
2388     int   irq,
2389     void *arg
2390 )
2391 {
2392     nv_linux_state_t *nvl = (void *) arg;
2393     nv_state_t *nv = NV_STATE_PTR(nvl);
2394     NvU32 need_to_run_bottom_half_gpu_lock_held = 0;
2395     NvBool rm_handled = NV_FALSE, uvm_handled = NV_FALSE, rm_fault_handling_needed = NV_FALSE;
2396     NvU32 rm_serviceable_fault_cnt = 0;
2397     NvU32 sec, usec;
2398     NvU16 index = 0;
2399     NvU64 currentTime = 0;
2400     NvBool found_irq = NV_FALSE;
2401 
2402     rm_gpu_handle_mmu_faults(nvl->sp[NV_DEV_STACK_ISR], nv, &rm_serviceable_fault_cnt);
2403     rm_fault_handling_needed = (rm_serviceable_fault_cnt != 0);
2404 
2405 #if defined (NV_UVM_ENABLE)
2406     //
2407     // Returns NV_OK if the UVM driver handled the interrupt
2408     //
2409     // Returns NV_ERR_NO_INTR_PENDING if the interrupt is not for
2410     // the UVM driver.
2411     //
2412     // Returns NV_WARN_MORE_PROCESSING_REQUIRED if the UVM top-half ISR was
2413     // unable to get its lock(s), due to other (UVM) threads holding them.
2414     //
2415     // RM can normally treat NV_WARN_MORE_PROCESSING_REQUIRED the same as
2416     // NV_ERR_NO_INTR_PENDING, but in some cases the extra information may
2417     // be helpful.
2418     //
2419     if (nv_uvm_event_interrupt(nv_get_cached_uuid(nv)) == NV_OK)
2420         uvm_handled = NV_TRUE;
2421 #endif
2422 
2423     rm_handled = rm_isr(nvl->sp[NV_DEV_STACK_ISR], nv,
2424                         &need_to_run_bottom_half_gpu_lock_held);
2425 
2426     /* Replicate the Linux kernel's logic for tracking unhandled interrupts crossing a threshold */
2427     if ((nv->flags & NV_FLAG_USES_MSI) || (nv->flags & NV_FLAG_USES_MSIX))
2428     {
2429         if (nvl->irq_count != NULL)
2430         {
2431             for (index = 0; index < nvl->current_num_irq_tracked; index++)
2432             {
2433                 if (nvl->irq_count[index].irq == irq)
2434                 {
2435                     found_irq = NV_TRUE;
2436                     break;
2437                 }
2438 
2439                 found_irq = NV_FALSE;
2440             }
2441 
2442             if (!found_irq && nvl->current_num_irq_tracked < nvl->num_intr)
2443             {
2444                 index = nvl->current_num_irq_tracked;
2445                 nvl->irq_count[index].irq = irq;
2446                 nvl->current_num_irq_tracked++;
2447                 found_irq = NV_TRUE;
2448             }
2449 
2450             if (found_irq)
2451             {
2452                 nvl->irq_count[index].total++;
2453 
2454                 if (rm_handled == NV_FALSE)
2455                 {
2456                     os_get_current_time(&sec, &usec);
2457                     currentTime = ((NvU64)sec) * 1000000 + (NvU64)usec;
2458 
2459                     /* Reset unhandled count if it's been more than 0.1 seconds since the last unhandled IRQ */
2460                     if ((currentTime - nvl->irq_count[index].last_unhandled) > RM_UNHANDLED_TIMEOUT_US)
2461                         nvl->irq_count[index].unhandled = 1;
2462                     else
2463                         nvl->irq_count[index].unhandled++;
2464 
2465                     nvl->irq_count[index].last_unhandled = currentTime;
2466                     rm_handled = NV_TRUE;
2467                 }
2468 
2469                 if (nvl->irq_count[index].total >= RM_THRESHOLD_TOTAL_IRQ_COUNT)
2470                 {
2471                     if (nvl->irq_count[index].unhandled > RM_THRESHOLD_UNAHNDLED_IRQ_COUNT)
2472                         nv_printf(NV_DBG_ERRORS,"NVRM: Going over RM unhandled interrupt threshold for irq %d\n", irq);
2473 
2474                     nvl->irq_count[index].total = 0;
2475                     nvl->irq_count[index].unhandled = 0;
2476                     nvl->irq_count[index].last_unhandled = 0;
2477                 }
2478             }
2479             else
2480                 nv_printf(NV_DBG_ERRORS,"NVRM: IRQ number out of valid range\n");
2481         }
2482     }
2483 
2484     if (need_to_run_bottom_half_gpu_lock_held)
2485     {
2486         return IRQ_WAKE_THREAD;
2487     }
2488     else
2489     {
2490         //
2491         // If rm_isr does not need to run a bottom half with the GPU lock held, but
2492         // serviceable MMU faults indicate that a bottom half is needed, enqueue a
2493         // kthread-based bottom half, as that bottom half will acquire the GPU lock.
2494         //
2495         if (rm_fault_handling_needed)
2496             nv_kthread_q_schedule_q_item(&nvl->bottom_half_q, &nvl->bottom_half_q_item);
2497     }
2498 
2499     return IRQ_RETVAL(rm_handled || uvm_handled || rm_fault_handling_needed);
2500 }
2501 
2502 irqreturn_t
2503 nvidia_isr_kthread_bh(
2504     int irq,
2505     void *data
2506 )
2507 {
2508     return nvidia_isr_common_bh(data);
2509 }
2510 
2511 irqreturn_t
2512 nvidia_isr_msix_kthread_bh(
2513     int irq,
2514     void *data
2515 )
2516 {
2517     NV_STATUS status;
2518     irqreturn_t ret;
2519     nv_state_t *nv = (nv_state_t *) data;
2520     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
2521 
2522     //
2523     // Synchronize kthreads servicing bottom halves for different MSI-X vectors
2524     // as they share the same pre-allocated alt-stack.
2525     //
2526     status = os_acquire_mutex(nvl->msix_bh_mutex);
2527     // os_acquire_mutex() can only fail in a context that cannot sleep; this kthread can.
2528     WARN_ON(status != NV_OK);
2529 
2530     ret = nvidia_isr_common_bh(data);
2531 
2532     os_release_mutex(nvl->msix_bh_mutex);
2533 
2534     return ret;
2535 }
2536 
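/*
 * Bottom-half work shared by the threaded-IRQ and MSI-X kthread paths:
 * check the GPU state and run rm_isr_bh() on the ISR bottom-half stack.
 */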
2537 static irqreturn_t
2538 nvidia_isr_common_bh(
2539     void *data
2540 )
2541 {
2542     nv_state_t *nv = (nv_state_t *) data;
2543     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
2544     nvidia_stack_t *sp = nvl->sp[NV_DEV_STACK_ISR_BH];
2545     NV_STATUS status;
2546 
2547     status = nv_check_gpu_state(nv);
2548     if (status == NV_ERR_GPU_IS_LOST)
2549     {
2550         nv_printf(NV_DBG_INFO, "NVRM: GPU is lost, skipping ISR bottom half\n");
2551     }
2552     else
2553     {
2554         rm_isr_bh(sp, nv);
2555     }
2556 
2557     return IRQ_HANDLED;
2558 }
2559 
2560 static void
2561 nvidia_isr_bh_unlocked(
2562     void * args
2563 )
2564 {
2565     nv_state_t *nv = (nv_state_t *) args;
2566     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
2567     nvidia_stack_t *sp;
2568     NV_STATUS status;
2569 
2570     //
2571     // Synchronize the kthreads servicing the unlocked bottom half, as they
2572     // share the same pre-allocated alt-stack.
2573     //
2574     status = os_acquire_mutex(nvl->isr_bh_unlocked_mutex);
2575     if (status != NV_OK)
2576     {
2577         nv_printf(NV_DBG_ERRORS, "NVRM: %s: Unable to take bottom_half mutex!\n",
2578                   __FUNCTION__);
2579         WARN_ON(1);
2580     }
2581 
2582     sp = nvl->sp[NV_DEV_STACK_ISR_BH_UNLOCKED];
2583 
2584     status = nv_check_gpu_state(nv);
2585     if (status == NV_ERR_GPU_IS_LOST)
2586     {
2587         nv_printf(NV_DBG_INFO,
2588             "NVRM: GPU is lost, skipping unlocked ISR bottom half\n");
2589     }
2590     else
2591     {
2592         rm_isr_bh_unlocked(sp, nv);
2593     }
2594 
2595     os_release_mutex(nvl->isr_bh_unlocked_mutex);
2596 }
2597 
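/*
 * RC timer callback: runs rm_run_rc_callback() and, while RM reports NV_OK,
 * re-arms the timer to fire again one second later.
 */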
2598 static void
2599 nvidia_rc_timer_callback(
2600     struct nv_timer *nv_timer
2601 )
2602 {
2603     nv_linux_state_t *nvl = container_of(nv_timer, nv_linux_state_t, rc_timer);
2604     nv_state_t *nv = NV_STATE_PTR(nvl);
2605     nvidia_stack_t *sp = nvl->sp[NV_DEV_STACK_TIMER];
2606     NV_STATUS status;
2607 
2608     status = nv_check_gpu_state(nv);
2609     if (status == NV_ERR_GPU_IS_LOST)
2610     {
2611         nv_printf(NV_DBG_INFO,
2612             "NVRM: GPU is lost, skipping device timer callbacks\n");
2613         return;
2614     }
2615 
2616     if (rm_run_rc_callback(sp, nv) == NV_OK)
2617     {
2618         // set another timeout 1 sec in the future:
2619         mod_timer(&nvl->rc_timer.kernel_timer, jiffies + HZ);
2620     }
2621 }
2622 
2623 /*
2624 ** nvidia_ctl_open
2625 **
2626 ** nv control driver open entry point.  Sessions are created here.
2627 */
2628 static int
2629 nvidia_ctl_open(
2630     struct inode *inode,
2631     struct file *file
2632 )
2633 {
2634     nv_linux_state_t *nvl = &nv_ctl_device;
2635     nv_state_t *nv = NV_STATE_PTR(nvl);
2636     nv_linux_file_private_t *nvlfp = NV_GET_LINUX_FILE_PRIVATE(file);
2637 
2638     nv_printf(NV_DBG_INFO, "NVRM: nvidia_ctl_open\n");
2639 
2640     down(&nvl->ldata_lock);
2641 
2642     /* save the nv away in file->private_data */
2643     nvlfp->nvptr = nvl;
2644 
2645     if (NV_ATOMIC_READ(nvl->usage_count) == 0)
2646     {
2647         nv->flags |= (NV_FLAG_OPEN | NV_FLAG_CONTROL);
2648     }
2649 
2650     NV_ATOMIC_INC(nvl->usage_count);
2651     up(&nvl->ldata_lock);
2652 
2653     return 0;
2654 }
2655 
2656 
2657 /*
2658 ** nvidia_ctl_close
2659 */
2660 static int
2661 nvidia_ctl_close(
2662     struct inode *inode,
2663     struct file *file
2664 )
2665 {
2666     nv_alloc_t *at, *next;
2667     nv_linux_state_t *nvl = NV_GET_NVL_FROM_FILEP(file);
2668     nv_state_t *nv = NV_STATE_PTR(nvl);
2669     nv_linux_file_private_t *nvlfp = NV_GET_LINUX_FILE_PRIVATE(file);
2670     nvidia_stack_t *sp = nvlfp->sp;
2671     unsigned int i;
2672 
2673     nv_printf(NV_DBG_INFO, "NVRM: nvidia_ctl_close\n");
2674 
2675     down(&nvl->ldata_lock);
2676     if (NV_ATOMIC_DEC_AND_TEST(nvl->usage_count))
2677     {
2678         nv->flags &= ~NV_FLAG_OPEN;
2679     }
2680     up(&nvl->ldata_lock);
2681 
2682     rm_cleanup_file_private(sp, nv, &nvlfp->nvfp);
2683 
2684     if (nvlfp->free_list != NULL)
2685     {
2686         at = nvlfp->free_list;
2687         while (at != NULL)
2688         {
2689             next = at->next;
2690             if (at->pid == os_get_current_process())
2691                 NV_PRINT_AT(NV_DBG_MEMINFO, at);
2692             nv_free_pages(nv, at->num_pages,
2693                           at->flags.contig,
2694                           at->cache_type,
2695                           (void *)at);
2696             at = next;
2697         }
2698     }
2699 
2700     if (nvlfp->num_attached_gpus != 0)
2701     {
2702         size_t i;
2703 
2704         for (i = 0; i < nvlfp->num_attached_gpus; i++)
2705         {
2706             if (nvlfp->attached_gpus[i] != 0)
2707                 nvidia_dev_put(nvlfp->attached_gpus[i], sp);
2708         }
2709 
2710         NV_KFREE(nvlfp->attached_gpus, sizeof(NvU32) * nvlfp->num_attached_gpus);
2711         nvlfp->num_attached_gpus = 0;
2712     }
2713 
2714     for (i = 0; i < NV_FOPS_STACK_INDEX_COUNT; ++i)
2715     {
2716         nv_kmem_cache_free_stack(nvlfp->fops_sp[i]);
2717     }
2718 
2719     nv_free_file_private(nvlfp);
2720     NV_SET_FILE_PRIVATE(file, NULL);
2721 
2722     nv_kmem_cache_free_stack(sp);
2723 
2724     return 0;
2725 }
2726 
2727 
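/*
 * Update the device's addressable DMA range and DMA masks to reflect the
 * number of physical address bits the GPU can generate.
 */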
2728 void NV_API_CALL
2729 nv_set_dma_address_size(
2730     nv_state_t  *nv,
2731     NvU32       phys_addr_bits
2732 )
2733 {
2734     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
2735     NvU64 start_addr = nv_get_dma_start_address(nv);
2736     NvU64 new_mask = (((NvU64)1) << phys_addr_bits) - 1;
2737 
2738     nvl->dma_dev.addressable_range.limit = start_addr + new_mask;
2739 
2740     /*
2741      * The only scenario in which we definitely should not update the DMA mask
2742      * is on POWER, when using TCE bypass mode (see nv_get_dma_start_address()
2743      * for details), since the meaning of the DMA mask is overloaded in that
2744      * case.
2745      */
2746     if (!nvl->tce_bypass_enabled)
2747     {
2748         dma_set_mask(&nvl->pci_dev->dev, new_mask);
2749         /* Certain kernels have a bug that causes pci_set_consistent_dma_mask()
2750          * to call the GPL-only sme_active symbol. The bug has been fixed in a
2751          * minor release update, but detect the failure scenario here to prevent
2752          * an installation regression. */
2753 #if !NV_IS_EXPORT_SYMBOL_GPL_sme_active
2754         dma_set_coherent_mask(&nvl->pci_dev->dev, new_mask);
2755 #endif
2756     }
2757 }
2758 
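/*
 * Build a temporary struct page array for page_count guest pages starting at
 * page_idx and vmap() them; returns 0 on failure.
 */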
2759 static NvUPtr
2760 nv_map_guest_pages(nv_alloc_t *at,
2761                    NvU64 address,
2762                    NvU32 page_count,
2763                    NvU32 page_idx)
2764 {
2765     struct page **pages;
2766     NvU32 j;
2767     NvUPtr virt_addr;
2768 
2769     NV_KMALLOC(pages, sizeof(struct page *) * page_count);
2770     if (pages == NULL)
2771     {
2772         nv_printf(NV_DBG_ERRORS,
2773                   "NVRM: failed to allocate vmap() page descriptor table!\n");
2774         return 0;
2775     }
2776 
2777     for (j = 0; j < page_count; j++)
2778     {
2779         pages[j] = NV_GET_PAGE_STRUCT(at->page_table[page_idx+j]->phys_addr);
2780     }
2781 
2782     virt_addr = nv_vm_map_pages(pages, page_count,
2783         at->cache_type == NV_MEMORY_CACHED, at->flags.unencrypted);
2784     NV_KFREE(pages, sizeof(struct page *) * page_count);
2785 
2786     return virt_addr;
2787 }
2788 
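/*
 * Create an nv_alloc_t describing guest pages from a caller-provided PTE
 * array; the pages are mapped on demand rather than at registration time.
 */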
2789 NV_STATUS NV_API_CALL
2790 nv_alias_pages(
2791     nv_state_t *nv,
2792     NvU32 page_cnt,
2793     NvU32 contiguous,
2794     NvU32 cache_type,
2795     NvU64 guest_id,
2796     NvU64 *pte_array,
2797     void **priv_data
2798 )
2799 {
2800     nv_alloc_t *at;
2801     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
2802     NvU32 i=0;
2803     nvidia_pte_t *page_ptr = NULL;
2804 
2805     at = nvos_create_alloc(nvl->dev, page_cnt);
2806 
2807     if (at == NULL)
2808     {
2809         return NV_ERR_NO_MEMORY;
2810     }
2811 
2812     at->cache_type = cache_type;
2813     if (contiguous)
2814         at->flags.contig = NV_TRUE;
2815 #if defined(NVCPU_AARCH64)
2816     if (at->cache_type != NV_MEMORY_CACHED)
2817         at->flags.aliased = NV_TRUE;
2818 #endif
2819 
2820     at->flags.guest = NV_TRUE;
2821 
2822     at->order = get_order(at->num_pages * PAGE_SIZE);
2823 
2824     for (i=0; i < at->num_pages; ++i)
2825     {
2826         page_ptr = at->page_table[i];
2827 
2828         if (contiguous && i>0)
2829         {
2830             page_ptr->dma_addr = pte_array[0] + (i << PAGE_SHIFT);
2831         }
2832         else
2833         {
2834             page_ptr->dma_addr  = pte_array[i];
2835         }
2836 
2837         page_ptr->phys_addr = page_ptr->dma_addr;
2838 
2839         /* aliased pages will be mapped on demand. */
2840         page_ptr->virt_addr = 0x0;
2841     }
2842 
2843     at->guest_id = guest_id;
2844     *priv_data = at;
2845     NV_ATOMIC_INC(at->usage_count);
2846 
2847     NV_PRINT_AT(NV_DBG_MEMINFO, at);
2848 
2849     return NV_OK;
2850 }
2851 
2852 /*
2853  *   This creates a dummy nv_alloc_t for peer IO mem, so that it can
2854  *   be mapped using NvRmMapMemory.
2855  */
2856 NV_STATUS NV_API_CALL nv_register_peer_io_mem(
2857     nv_state_t *nv,
2858     NvU64      *phys_addr,
2859     NvU64       page_count,
2860     void      **priv_data
2861 )
2862 {
2863     nv_alloc_t *at;
2864     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
2865     NvU64 i;
2866     NvU64 addr;
2867 
2868     at = nvos_create_alloc(nvl->dev, page_count);
2869 
2870     if (at == NULL)
2871         return NV_ERR_NO_MEMORY;
2872 
2873     // IO regions should be uncached and contiguous
2874     at->cache_type = NV_MEMORY_UNCACHED;
2875     at->flags.contig = NV_TRUE;
2876 #if defined(NVCPU_AARCH64)
2877     at->flags.aliased = NV_TRUE;
2878 #endif
2879     at->flags.peer_io = NV_TRUE;
2880 
2881     at->order = get_order(at->num_pages * PAGE_SIZE);
2882 
2883     addr = phys_addr[0];
2884 
2885     for (i = 0; i < page_count; i++)
2886     {
2887         at->page_table[i]->phys_addr = addr;
2888         addr += PAGE_SIZE;
2889     }
2890 
2891     // No struct page array exists for this memory.
2892     at->user_pages = NULL;
2893 
2894     *priv_data = at;
2895 
2896     NV_PRINT_AT(NV_DBG_MEMINFO, at);
2897 
2898     return NV_OK;
2899 }
2900 
2901 void NV_API_CALL nv_unregister_peer_io_mem(
2902     nv_state_t *nv,
2903     void       *priv_data
2904 )
2905 {
2906     nv_alloc_t *at = priv_data;
2907 
2908     NV_PRINT_AT(NV_DBG_MEMINFO, at);
2909 
2910     nvos_free_alloc(at);
2911 }
2912 
2913 /*
2914  * By registering user pages, we create a dummy nv_alloc_t for it, so that the
2915  * rest of the RM can treat it like any other alloc.
2916  *
2917  * This also converts the page array to an array of physical addresses.
2918  */
2919 NV_STATUS NV_API_CALL nv_register_user_pages(
2920     nv_state_t *nv,
2921     NvU64       page_count,
2922     NvU64      *phys_addr,
2923     void       *import_priv,
2924     void      **priv_data
2925 )
2926 {
2927     nv_alloc_t *at;
2928     NvU64 i;
2929     struct page **user_pages;
2930     nv_linux_state_t *nvl;
2931     nvidia_pte_t *page_ptr;
2932 
2933     nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_register_user_pages: 0x%x\n", page_count);
2934     user_pages = *priv_data;
2935     nvl = NV_GET_NVL_FROM_NV_STATE(nv);
2936 
2937     at = nvos_create_alloc(nvl->dev, page_count);
2938 
2939     if (at == NULL)
2940     {
2941         return NV_ERR_NO_MEMORY;
2942     }
2943 
2944     /*
2945      * Anonymous memory currently must be write-back cacheable, and we can't
2946      * enforce contiguity.
2947      */
2948     at->cache_type = NV_MEMORY_UNCACHED;
2949 #if defined(NVCPU_AARCH64)
2950     at->flags.aliased = NV_TRUE;
2951 #endif
2952 
2953     at->flags.user = NV_TRUE;
2954 
2955     at->order = get_order(at->num_pages * PAGE_SIZE);
2956 
2957     for (i = 0; i < page_count; i++)
2958     {
2959         /*
2960          * We only assign the physical address and not the DMA address, since
2961          * this allocation hasn't been DMA-mapped yet.
2962          */
2963         page_ptr = at->page_table[i];
2964         page_ptr->phys_addr = page_to_phys(user_pages[i]);
2965 
2966         phys_addr[i] = page_ptr->phys_addr;
2967     }
2968 
2969     /* Save off the user pages array to be restored later */
2970     at->user_pages = user_pages;
2971 
2972     /* Save off the import private data to be returned later */
2973     if (import_priv != NULL)
2974     {
2975         at->import_priv = import_priv;
2976     }
2977 
2978     *priv_data = at;
2979 
2980     NV_PRINT_AT(NV_DBG_MEMINFO, at);
2981 
2982     return NV_OK;
2983 }
2984 
2985 void NV_API_CALL nv_unregister_user_pages(
2986     nv_state_t *nv,
2987     NvU64       page_count,
2988     void      **import_priv,
2989     void      **priv_data
2990 )
2991 {
2992     nv_alloc_t *at = *priv_data;
2993 
2994     nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_unregister_user_pages: 0x%x\n", page_count);
2995 
2996     NV_PRINT_AT(NV_DBG_MEMINFO, at);
2997 
2998     WARN_ON(!at->flags.user);
2999 
3000     /* Restore the user pages array for the caller to handle */
3001     *priv_data = at->user_pages;
3002 
3003     /* Return the import private data for the caller to handle */
3004     if (import_priv != NULL)
3005     {
3006         *import_priv = at->import_priv;
3007     }
3008 
3009     nvos_free_alloc(at);
3010 }
3011 
3012 /*
3013  * This creates a dummy nv_alloc_t for existing physical allocations, so
3014  * that it can be mapped using NvRmMapMemory and BAR2 code path.
3015  */
3016 NV_STATUS NV_API_CALL nv_register_phys_pages(
3017     nv_state_t *nv,
3018     NvU64      *phys_addr,
3019     NvU64       page_count,
3020     NvU32       cache_type,
3021     void      **priv_data
3022 )
3023 {
3024     nv_alloc_t *at;
3025     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
3026     NvU64 i;
3027     NvU64 addr;
3028 
3029     at = nvos_create_alloc(nvl->dev, page_count);
3030 
3031     if (at == NULL)
3032         return NV_ERR_NO_MEMORY;
3033     /*
3034      * Use the caller-provided cache type; the allocation is treated as discontiguous.
3035      */
3036     at->cache_type = cache_type;
3037 
3038     /*
3039      * Only physical address is available so we don't try to reuse existing
3040      * mappings
3041      */
3042     at->flags.physical = NV_TRUE;
3043 
3044     at->order = get_order(at->num_pages * PAGE_SIZE);
3045 
3046     for (i = 0, addr = phys_addr[0]; i < page_count; addr = phys_addr[++i])
3047     {
3048         at->page_table[i]->phys_addr = addr;
3049     }
3050 
3051     at->user_pages = NULL;
3052     *priv_data = at;
3053 
3054     NV_PRINT_AT(NV_DBG_MEMINFO, at);
3055 
3056     return NV_OK;
3057 }
3058 
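/*
 * Create a dummy nv_alloc_t backed by an imported sg_table: the DMA addresses
 * from the scatterlist are flattened into one entry per page in phys_addr[].
 */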
3059 NV_STATUS NV_API_CALL nv_register_sgt(
3060     nv_state_t *nv,
3061     NvU64      *phys_addr,
3062     NvU64       page_count,
3063     NvU32       cache_type,
3064     void      **priv_data,
3065     struct sg_table *import_sgt,
3066     void       *import_priv
3067 )
3068 {
3069     nv_alloc_t *at;
3070     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
3071 
3072     unsigned int i, j = 0;
3073     NvU64 sg_addr, sg_off, sg_len;
3074     struct scatterlist *sg;
3075 
3076     at = nvos_create_alloc(nvl->dev, page_count);
3077 
3078     if (at == NULL)
3079         return NV_ERR_NO_MEMORY;
3080 
3081     /* Populate phys addrs with DMA addrs from SGT */
3082     for_each_sg(import_sgt->sgl, sg, import_sgt->nents, i)
3083     {
3084         /*
3085          * It is possible for dma_map_sg() to merge scatterlist entries, so
3086          * make sure we account for that here.
3087          */
3088         for (sg_addr = sg_dma_address(sg), sg_len = sg_dma_len(sg), sg_off = 0;
3089              (sg_off < sg_len) && (j < page_count);
3090              sg_off += PAGE_SIZE, j++)
3091         {
3092             phys_addr[j] = sg_addr + sg_off;
3093         }
3094     }
3095 
3096     /*
3097      * Use the caller-provided cache type; the allocation is treated as discontiguous.
3098      */
3099     at->cache_type = cache_type;
3100 
3101     at->import_sgt = import_sgt;
3102 
3103     /* Save off the import private data to be returned later */
3104     if (import_priv != NULL)
3105     {
3106         at->import_priv = import_priv;
3107     }
3108 
3109     at->order = get_order(at->num_pages * PAGE_SIZE);
3110 
3111     *priv_data = at;
3112 
3113     NV_PRINT_AT(NV_DBG_MEMINFO, at);
3114 
3115     return NV_OK;
3116 }
3117 
3118 void NV_API_CALL nv_unregister_sgt(
3119     nv_state_t *nv,
3120     struct sg_table **import_sgt,
3121     void **import_priv,
3122     void  *priv_data
3123 )
3124 {
3125     nv_alloc_t *at = priv_data;
3126 
3127     nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_unregister_sgt\n");
3128 
3129     NV_PRINT_AT(NV_DBG_MEMINFO, at);
3130 
3131     /* Restore the imported SGT for the caller to handle */
3132     *import_sgt = at->import_sgt;
3133 
3134     /* Return the import private data for the caller to handle */
3135     if (import_priv != NULL)
3136     {
3137         *import_priv = at->import_priv;
3138     }
3139 
3140     nvos_free_alloc(at);
3141 }
3142 
3143 void NV_API_CALL nv_unregister_phys_pages(
3144     nv_state_t *nv,
3145     void       *priv_data
3146 )
3147 {
3148     nv_alloc_t *at = priv_data;
3149     NV_PRINT_AT(NV_DBG_MEMINFO, at);
3150 
3151     nvos_free_alloc(at);
3152 }
3153 
3154 NV_STATUS NV_API_CALL nv_get_num_phys_pages(
3155     void    *pAllocPrivate,
3156     NvU32   *pNumPages
3157 )
3158 {
3159     nv_alloc_t *at = pAllocPrivate;
3160 
3161     if (!pNumPages) {
3162         return NV_ERR_INVALID_ARGUMENT;
3163     }
3164 
3165     *pNumPages = at->num_pages;
3166 
3167     return NV_OK;
3168 }
3169 
3170 NV_STATUS NV_API_CALL nv_get_phys_pages(
3171     void    *pAllocPrivate,
3172     void    *pPages,
3173     NvU32   *pNumPages
3174 )
3175 {
3176     nv_alloc_t *at = pAllocPrivate;
3177     struct page **pages = (struct page **)pPages;
3178     NvU32 page_count;
3179     int i;
3180 
3181     if (!pNumPages || !pPages) {
3182         return NV_ERR_INVALID_ARGUMENT;
3183     }
3184 
3185     page_count = NV_MIN(*pNumPages, at->num_pages);
3186 
3187     for (i = 0; i < page_count; i++) {
3188         pages[i] = NV_GET_PAGE_STRUCT(at->page_table[i]->phys_addr);
3189     }
3190 
3191     *pNumPages = page_count;
3192 
3193     return NV_OK;
3194 }
3195 
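/*
 * Return a kernel virtual address covering 'size' bytes starting at pageOffset
 * within pageIndex of the allocation. Reuses the existing per-page kernel
 * mapping when possible; otherwise vmaps the pages and stores the page count
 * in *pPrivate so nv_free_kernel_mapping() can undo the mapping.
 */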
3196 void* NV_API_CALL nv_alloc_kernel_mapping(
3197     nv_state_t *nv,
3198     void       *pAllocPrivate,
3199     NvU64       pageIndex,
3200     NvU32       pageOffset,
3201     NvU64       size,
3202     void      **pPrivate
3203 )
3204 {
3205     nv_alloc_t *at = pAllocPrivate;
3206     NvU32 j, page_count;
3207     NvUPtr virt_addr;
3208     struct page **pages;
3209     NvBool isUserAllocatedMem;
3210 
3211     //
3212     // For user-allocated memory (such as an ErrorNotifier's), which is neither
3213     // allocated nor owned by RM, the RM driver only stores the physical address
3214     // corresponding to that memory and does not map it until required.
3215     // In that case the page table entries have virt_addr == 0, so we first need
3216     // to map those pages to obtain a kernel virtual address.
3217     //
3218     isUserAllocatedMem = at->flags.user &&
3219                         !at->page_table[pageIndex]->virt_addr &&
3220                          at->page_table[pageIndex]->phys_addr;
3221 
3222     //
3223     // User memory may not have a kernel VA, so check for this and fall back to
3224     // the else case to create one.
3225     //
3226     if (((size + pageOffset) <= PAGE_SIZE) &&
3227          !at->flags.guest && !at->flags.aliased &&
3228          !isUserAllocatedMem && !at->flags.physical)
3229     {
3230         *pPrivate = NULL;
3231         return (void *)(at->page_table[pageIndex]->virt_addr + pageOffset);
3232     }
3233     else
3234     {
3235         size += pageOffset;
3236         page_count = (size >> PAGE_SHIFT) + ((size & ~NV_PAGE_MASK) ? 1 : 0);
3237 
3238         if (at->flags.guest)
3239         {
3240             virt_addr = nv_map_guest_pages(at,
3241                                            nv->bars[NV_GPU_BAR_INDEX_REGS].cpu_address,
3242                                            page_count, pageIndex);
3243         }
3244         else
3245         {
3246             NV_KMALLOC(pages, sizeof(struct page *) * page_count);
3247             if (pages == NULL)
3248             {
3249                 nv_printf(NV_DBG_ERRORS,
3250                           "NVRM: failed to allocate vmap() page descriptor table!\n");
3251                 return NULL;
3252             }
3253 
3254             for (j = 0; j < page_count; j++)
3255                 pages[j] = NV_GET_PAGE_STRUCT(at->page_table[pageIndex+j]->phys_addr);
3256 
3257             virt_addr = nv_vm_map_pages(pages, page_count,
3258                 at->cache_type == NV_MEMORY_CACHED, at->flags.unencrypted);
3259             NV_KFREE(pages, sizeof(struct page *) * page_count);
3260         }
3261 
3262         if (virt_addr == 0)
3263         {
3264             nv_printf(NV_DBG_ERRORS, "NVRM: failed to map pages!\n");
3265             return NULL;
3266         }
3267 
3268         *pPrivate = (void *)(NvUPtr)page_count;
3269         return (void *)(virt_addr + pageOffset);
3270     }
3271 
3272     return NULL;
3273 }
3274 
3275 NV_STATUS NV_API_CALL nv_free_kernel_mapping(
3276     nv_state_t *nv,
3277     void       *pAllocPrivate,
3278     void       *address,
3279     void       *pPrivate
3280 )
3281 {
3282     nv_alloc_t *at = pAllocPrivate;
3283     NvUPtr virt_addr;
3284     NvU32 page_count;
3285 
3286     virt_addr = ((NvUPtr)address & NV_PAGE_MASK);
3287     page_count = (NvUPtr)pPrivate;
3288 
3289     if (at->flags.guest)
3290     {
3291         nv_iounmap((void *)virt_addr, (page_count * PAGE_SIZE));
3292     }
3293     else if (pPrivate != NULL)
3294     {
3295         nv_vm_unmap_pages(virt_addr, page_count);
3296     }
3297 
3298     return NV_OK;
3299 }
3300 
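/*
 * Allocate system memory pages (contiguous or not) and return their DMA or
 * physical addresses in pte_array, depending on whether the device requires
 * DMA remapping.
 */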
3301 NV_STATUS NV_API_CALL nv_alloc_pages(
3302     nv_state_t *nv,
3303     NvU32       page_count,
3304     NvBool      contiguous,
3305     NvU32       cache_type,
3306     NvBool      zeroed,
3307     NvBool      unencrypted,
3308     NvS32       node_id,
3309     NvU64      *pte_array,
3310     void      **priv_data
3311 )
3312 {
3313     nv_alloc_t *at;
3314     NV_STATUS status = NV_ERR_NO_MEMORY;
3315     nv_linux_state_t *nvl = NULL;
3316     NvBool will_remap = NV_FALSE;
3317     NvU32 i;
3318     struct device *dev = NULL;
3319 
3320     nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_alloc_pages: %d pages, nodeid %d\n", page_count, node_id);
3321     nv_printf(NV_DBG_MEMINFO, "NVRM: VM:    contig %d  cache_type %d\n",
3322         contiguous, cache_type);
3323 
3324     //
3325     // A system memory allocation can be associated with a client instead of a
3326     // GPU; handle the case where the per-device state is NULL.
3327     //
3328     if (nv)
3329     {
3330        nvl = NV_GET_NVL_FROM_NV_STATE(nv);
3331        will_remap = nv_requires_dma_remap(nv);
3332        dev = nvl->dev;
3333     }
3334 
3335     if (nv_encode_caching(NULL, cache_type, NV_MEMORY_TYPE_SYSTEM))
3336         return NV_ERR_NOT_SUPPORTED;
3337 
3338     at = nvos_create_alloc(dev, page_count);
3339     if (at == NULL)
3340         return NV_ERR_NO_MEMORY;
3341 
3342     at->cache_type = cache_type;
3343 
3344     if (contiguous)
3345         at->flags.contig = NV_TRUE;
3346     if (zeroed)
3347         at->flags.zeroed = NV_TRUE;
3348 #if defined(NVCPU_AARCH64)
3349     if (at->cache_type != NV_MEMORY_CACHED)
3350         at->flags.aliased = NV_TRUE;
3351 #endif
3352     if (unencrypted)
3353         at->flags.unencrypted = NV_TRUE;
3354 
3355 #if defined(NVCPU_PPC64LE)
3356     /*
3357      * Starting on Power9 systems, DMA addresses for NVLink are no longer the
3358      * same as used over PCIe. There is an address compression scheme required
3359      * for NVLink ONLY which impacts the upper address bits of the DMA address.
3360      *
3361      * This divergence between PCIe and NVLink DMA mappings breaks assumptions
3362      * in the driver where during initialization we allocate system memory
3363      * for the GPU to access over PCIe before NVLink is trained -- and some of
3364      * these mappings persist on the GPU. If these persistent mappings are not
3365      * equivalent they will cause invalid DMA accesses from the GPU once we
3366      * switch to NVLink.
3367      *
3368      * To work around this we limit all system memory allocations from the driver
3369      * during the period before NVLink is enabled to be from NUMA node 0 (CPU 0)
3370      * which has a CPU real address with the upper address bits (above bit 42)
3371      * set to 0. Effectively making the PCIe and NVLink DMA mappings equivalent
3372      * allowing persistent system memory mappings already programmed on the GPU
3373      * to remain valid after NVLink is enabled.
3374      *
3375      * See Bug 1920398 for more details.
3376      */
3377     if (nv && nvl->npu && !nvl->dma_dev.nvlink)
3378     {
3379         at->flags.node = NV_TRUE;
3380         at->node_id = 0;
3381     }
3382 #endif
3383 
3384     if (node_id != NUMA_NO_NODE)
3385     {
3386         at->flags.node = NV_TRUE;
3387         at->node_id = node_id;
3388     }
3389 
3390     if (at->flags.contig)
3391         status = nv_alloc_contig_pages(nv, at);
3392     else
3393         status = nv_alloc_system_pages(nv, at);
3394 
3395     if (status != NV_OK)
3396         goto failed;
3397 
3398     for (i = 0; i < ((contiguous) ? 1 : page_count); i++)
3399     {
3400         /*
3401          * The contents of the pte_array[] depend on whether or not this device
3402          * requires DMA-remapping. If it does, it should be the phys addresses
3403          * used by the DMA-remapping paths, otherwise it should be the actual
3404          * address that the device should use for DMA (which, confusingly, may
3405          * be different than the CPU physical address, due to a static DMA
3406          * offset).
3407          */
3408         if ((nv == NULL) || will_remap)
3409         {
3410             pte_array[i] = at->page_table[i]->phys_addr;
3411         }
3412         else
3413         {
3414             pte_array[i] = nv_phys_to_dma(dev,
3415                 at->page_table[i]->phys_addr);
3416         }
3417     }
3418 
3419     *priv_data = at;
3420     NV_ATOMIC_INC(at->usage_count);
3421 
3422     NV_PRINT_AT(NV_DBG_MEMINFO, at);
3423 
3424     return NV_OK;
3425 
3426 failed:
3427     nvos_free_alloc(at);
3428 
3429     return status;
3430 }
3431 
3432 NV_STATUS NV_API_CALL nv_free_pages(
3433     nv_state_t *nv,
3434     NvU32 page_count,
3435     NvBool contiguous,
3436     NvU32 cache_type,
3437     void *priv_data
3438 )
3439 {
3440     NV_STATUS rmStatus = NV_OK;
3441     nv_alloc_t *at = priv_data;
3442 
3443     nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_free_pages: 0x%x\n", page_count);
3444 
3445     NV_PRINT_AT(NV_DBG_MEMINFO, at);
3446 
3447     /*
3448      * If the 'at' usage count doesn't drop to zero here, not all of
3449      * the user mappings have been torn down in time - we can't
3450      * safely free the memory. We report success back to the RM, but
3451      * defer the actual free operation until later.
3452      *
3453      * This is described in greater detail in the comments above the
3454      * nvidia_vma_(open|release)() callbacks in nv-mmap.c.
3455      */
3456     if (!NV_ATOMIC_DEC_AND_TEST(at->usage_count))
3457         return NV_OK;
3458 
3459     if (!at->flags.guest)
3460     {
3461         if (at->flags.contig)
3462             nv_free_contig_pages(at);
3463         else
3464             nv_free_system_pages(at);
3465     }
3466 
3467     nvos_free_alloc(at);
3468 
3469     return rmStatus;
3470 }
3471 
3472 NvBool nv_lock_init_locks
3473 (
3474     nvidia_stack_t *sp,
3475     nv_state_t *nv
3476 )
3477 {
3478     nv_linux_state_t *nvl;
3479     nvl = NV_GET_NVL_FROM_NV_STATE(nv);
3480 
3481     NV_INIT_MUTEX(&nvl->ldata_lock);
3482     NV_INIT_MUTEX(&nvl->mmap_lock);
3483 
3484     NV_ATOMIC_SET(nvl->usage_count, 0);
3485 
3486     if (!rm_init_event_locks(sp, nv))
3487         return NV_FALSE;
3488 
3489     return NV_TRUE;
3490 }
3491 
3492 void nv_lock_destroy_locks
3493 (
3494     nvidia_stack_t *sp,
3495     nv_state_t *nv
3496 )
3497 {
3498     rm_destroy_event_locks(sp, nv);
3499 }
3500 
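/*
 * Queue an RM event on the owning file's event list (when data_valid) or mark
 * a dataless event pending, then wake any poll()/read() waiters.
 */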
3501 void NV_API_CALL nv_post_event(
3502     nv_event_t *event,
3503     NvHandle    handle,
3504     NvU32       index,
3505     NvU32       info32,
3506     NvU16       info16,
3507     NvBool      data_valid
3508 )
3509 {
3510     nv_linux_file_private_t *nvlfp = nv_get_nvlfp_from_nvfp(event->nvfp);
3511     unsigned long eflags;
3512     nvidia_event_t *nvet;
3513 
3514     NV_SPIN_LOCK_IRQSAVE(&nvlfp->fp_lock, eflags);
3515 
3516     if (data_valid)
3517     {
3518         NV_KMALLOC_ATOMIC(nvet, sizeof(nvidia_event_t));
3519         if (nvet == NULL)
3520         {
3521             NV_SPIN_UNLOCK_IRQRESTORE(&nvlfp->fp_lock, eflags);
3522             return;
3523         }
3524 
3525         if (nvlfp->event_data_tail != NULL)
3526             nvlfp->event_data_tail->next = nvet;
3527         if (nvlfp->event_data_head == NULL)
3528             nvlfp->event_data_head = nvet;
3529         nvlfp->event_data_tail = nvet;
3530         nvet->next = NULL;
3531 
3532         nvet->event = *event;
3533         nvet->event.hObject = handle;
3534         nvet->event.index = index;
3535         nvet->event.info32 = info32;
3536         nvet->event.info16 = info16;
3537     }
3538     //
3539     // 'dataless_event_pending' is interpreted by nvidia_poll() and nv_get_event()
3540     // to mean that an event without data is pending. Therefore, only set it to
3541     // true here if the newly posted event is dataless.
3542     //
3543     else
3544     {
3545         nvlfp->dataless_event_pending = NV_TRUE;
3546     }
3547 
3548     NV_SPIN_UNLOCK_IRQRESTORE(&nvlfp->fp_lock, eflags);
3549 
3550     wake_up_interruptible(&nvlfp->waitqueue);
3551 }
3552 
3553 NvBool NV_API_CALL nv_is_rm_firmware_active(
3554     nv_state_t *nv
3555 )
3556 {
3557     if (rm_firmware_active)
3558     {
3559         // "all" here means all GPUs
3560         if (strcmp(rm_firmware_active, "all") == 0)
3561             return NV_TRUE;
3562     }
3563     return NV_FALSE;
3564 }
3565 
3566 const void* NV_API_CALL nv_get_firmware(
3567     nv_state_t *nv,
3568     nv_firmware_type_t fw_type,
3569     nv_firmware_chip_family_t fw_chip_family,
3570     const void **fw_buf,
3571     NvU32 *fw_size
3572 )
3573 {
3574     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
3575     const struct firmware *fw;
3576 
3577     // The path is relative to /lib/firmware; if this fails,
3578     // request_firmware() prints an error to dmesg.
3579     if (request_firmware(&fw, nv_firmware_path(fw_type, fw_chip_family), nvl->dev) != 0)
3580         return NULL;
3581 
3582     *fw_size = fw->size;
3583     *fw_buf = fw->data;
3584 
3585     return fw;
3586 }
3587 
3588 void NV_API_CALL nv_put_firmware(
3589     const void *fw_handle
3590 )
3591 {
3592     release_firmware(fw_handle);
3593 }
3594 
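/*
 * Resolve a file descriptor to its NVIDIA file private. Takes a reference on
 * the underlying struct file, which the caller releases via
 * nv_put_file_private().
 */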
3595 nv_file_private_t* NV_API_CALL nv_get_file_private(
3596     NvS32 fd,
3597     NvBool ctl,
3598     void **os_private
3599 )
3600 {
3601     struct file *filp = NULL;
3602     nv_linux_file_private_t *nvlfp = NULL;
3603     dev_t rdev = 0;
3604 
3605     filp = fget(fd);
3606 
3607     if (filp == NULL || !NV_FILE_INODE(filp))
3608     {
3609         goto fail;
3610     }
3611 
3612     rdev = (NV_FILE_INODE(filp))->i_rdev;
3613 
3614     if (MAJOR(rdev) != NV_MAJOR_DEVICE_NUMBER)
3615     {
3616         goto fail;
3617     }
3618 
3619     if (ctl)
3620     {
3621         if (MINOR(rdev) != NV_CONTROL_DEVICE_MINOR)
3622             goto fail;
3623     }
3624     else
3625     {
3626         NvBool found = NV_FALSE;
3627         int i;
3628 
3629         for (i = 0; i <= NV_FRONTEND_CONTROL_DEVICE_MINOR_MIN; i++)
3630         {
3631             if ((nv_minor_num_table[i] != NULL) && (MINOR(rdev) == i))
3632             {
3633                 found = NV_TRUE;
3634                 break;
3635             }
3636         }
3637 
3638         if (!found)
3639             goto fail;
3640     }
3641 
3642     nvlfp = NV_GET_LINUX_FILE_PRIVATE(filp);
3643 
3644     *os_private = filp;
3645 
3646     return &nvlfp->nvfp;
3647 
3648 fail:
3649 
3650     if (filp != NULL)
3651     {
3652         fput(filp);
3653     }
3654 
3655     return NULL;
3656 }
3657 
3658 void NV_API_CALL nv_put_file_private(
3659     void *os_private
3660 )
3661 {
3662     struct file *filp = os_private;
3663     fput(filp);
3664 }
3665 
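/*
 * Dequeue the oldest event with data for this file; *pending reports whether
 * more events with data remain queued.
 */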
3666 int NV_API_CALL nv_get_event(
3667     nv_file_private_t  *nvfp,
3668     nv_event_t         *event,
3669     NvU32              *pending
3670 )
3671 {
3672     nv_linux_file_private_t *nvlfp = nv_get_nvlfp_from_nvfp(nvfp);
3673     nvidia_event_t *nvet;
3674     unsigned long eflags;
3675 
3676     NV_SPIN_LOCK_IRQSAVE(&nvlfp->fp_lock, eflags);
3677 
3678     nvet = nvlfp->event_data_head;
3679     if (nvet == NULL)
3680     {
3681         NV_SPIN_UNLOCK_IRQRESTORE(&nvlfp->fp_lock, eflags);
3682         return NV_ERR_GENERIC;
3683     }
3684 
3685     *event = nvet->event;
3686 
3687     if (nvlfp->event_data_tail == nvet)
3688         nvlfp->event_data_tail = NULL;
3689     nvlfp->event_data_head = nvet->next;
3690 
3691     *pending = (nvlfp->event_data_head != NULL);
3692 
3693     NV_SPIN_UNLOCK_IRQRESTORE(&nvlfp->fp_lock, eflags);
3694 
3695     NV_KFREE(nvet, sizeof(nvidia_event_t));
3696 
3697     return NV_OK;
3698 }
3699 
3700 int NV_API_CALL nv_start_rc_timer(
3701     nv_state_t *nv
3702 )
3703 {
3704     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
3705 
3706     if (nv->rc_timer_enabled)
3707         return -1;
3708 
3709     nv_printf(NV_DBG_INFO, "NVRM: initializing rc timer\n");
3710 
3711     nv_timer_setup(&nvl->rc_timer, nvidia_rc_timer_callback);
3712 
3713     nv->rc_timer_enabled = 1;
3714 
3715     // set the timeout for 1 second in the future:
3716     mod_timer(&nvl->rc_timer.kernel_timer, jiffies + HZ);
3717 
3718     nv_printf(NV_DBG_INFO, "NVRM: rc timer initialized\n");
3719 
3720     return 0;
3721 }
3722 
3723 int NV_API_CALL nv_stop_rc_timer(
3724     nv_state_t *nv
3725 )
3726 {
3727     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
3728 
3729     if (!nv->rc_timer_enabled)
3730         return -1;
3731 
3732     nv_printf(NV_DBG_INFO, "NVRM: stopping rc timer\n");
3733     nv->rc_timer_enabled = 0;
3734     del_timer_sync(&nvl->rc_timer.kernel_timer);
3735     nv_printf(NV_DBG_INFO, "NVRM: rc timer stopped\n");
3736 
3737     return 0;
3738 }
3739 
3740 #define SNAPSHOT_TIMER_FREQ (jiffies + HZ / NV_SNAPSHOT_TIMER_HZ)
3741 
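/*
 * Profiler snapshot timer: invoke the registered callback under the snapshot
 * spinlock and re-arm the timer for as long as a callback remains registered.
 */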
3742 static void snapshot_timer_callback(struct nv_timer *timer)
3743 {
3744     nv_linux_state_t *nvl = &nv_ctl_device;
3745     nv_state_t *nv = NV_STATE_PTR(nvl);
3746     unsigned long flags;
3747 
3748     NV_SPIN_LOCK_IRQSAVE(&nvl->snapshot_timer_lock, flags);
3749     if (nvl->snapshot_callback != NULL)
3750     {
3751         nvl->snapshot_callback(nv->profiler_context);
3752         mod_timer(&timer->kernel_timer, SNAPSHOT_TIMER_FREQ);
3753     }
3754     NV_SPIN_UNLOCK_IRQRESTORE(&nvl->snapshot_timer_lock, flags);
3755 }
3756 
3757 void NV_API_CALL nv_start_snapshot_timer(void (*snapshot_callback)(void *context))
3758 {
3759     nv_linux_state_t *nvl = &nv_ctl_device;
3760 
3761     nvl->snapshot_callback = snapshot_callback;
3762     nv_timer_setup(&nvl->snapshot_timer, snapshot_timer_callback);
3763     mod_timer(&nvl->snapshot_timer.kernel_timer, SNAPSHOT_TIMER_FREQ);
3764 }
3765 
3766 void NV_API_CALL nv_stop_snapshot_timer(void)
3767 {
3768     nv_linux_state_t *nvl = &nv_ctl_device;
3769     NvBool timer_active;
3770     unsigned long flags;
3771 
3772     NV_SPIN_LOCK_IRQSAVE(&nvl->snapshot_timer_lock, flags);
3773     timer_active = nvl->snapshot_callback != NULL;
3774     nvl->snapshot_callback = NULL;
3775     NV_SPIN_UNLOCK_IRQRESTORE(&nvl->snapshot_timer_lock, flags);
3776 
3777     if (timer_active)
3778         del_timer_sync(&nvl->snapshot_timer.kernel_timer);
3779 }
3780 
3781 void NV_API_CALL nv_flush_snapshot_timer(void)
3782 {
3783     nv_linux_state_t *nvl = &nv_ctl_device;
3784     nv_state_t *nv = NV_STATE_PTR(nvl);
3785     unsigned long flags;
3786 
3787     NV_SPIN_LOCK_IRQSAVE(&nvl->snapshot_timer_lock, flags);
3788     if (nvl->snapshot_callback != NULL)
3789         nvl->snapshot_callback(nv->profiler_context);
3790     NV_SPIN_UNLOCK_IRQRESTORE(&nvl->snapshot_timer_lock, flags);
3791 }
3792 
3793 static int __init
3794 nvos_count_devices(void)
3795 {
3796     int count;
3797 
3798     count = nv_pci_count_devices();
3799 
3800     return count;
3801 }
3802 
3803 NvBool nvos_is_chipset_io_coherent(void)
3804 {
3805     if (nv_chipset_is_io_coherent == NV_TRISTATE_INDETERMINATE)
3806     {
3807         nvidia_stack_t *sp = NULL;
3808         if (nv_kmem_cache_alloc_stack(&sp) != 0)
3809         {
3810             nv_printf(NV_DBG_ERRORS,
3811               "NVRM: cannot allocate stack for platform coherence check callback\n");
3812             WARN_ON(1);
3813             return NV_FALSE;
3814         }
3815 
3816         nv_chipset_is_io_coherent = rm_is_chipset_io_coherent(sp);
3817 
3818         nv_kmem_cache_free_stack(sp);
3819     }
3820 
3821     return nv_chipset_is_io_coherent;
3822 }
3823 
3824 #if defined(CONFIG_PM)
3825 static NV_STATUS
3826 nv_power_management(
3827     nv_state_t *nv,
3828     nv_pm_action_t pm_action
3829 )
3830 {
3831     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
3832     int status = NV_OK;
3833     nvidia_stack_t *sp = NULL;
3834 
3835     if (nv_kmem_cache_alloc_stack(&sp) != 0)
3836     {
3837         return NV_ERR_NO_MEMORY;
3838     }
3839 
3840     status = nv_check_gpu_state(nv);
3841     if (status == NV_ERR_GPU_IS_LOST)
3842     {
3843         NV_DEV_PRINTF(NV_DBG_INFO, nv, "GPU is lost, skipping PM event\n");
3844         goto failure;
3845     }
3846 
3847     switch (pm_action)
3848     {
3849         case NV_PM_ACTION_STANDBY:
3850             /* fall through */
3851         case NV_PM_ACTION_HIBERNATE:
3852         {
3853             status = rm_power_management(sp, nv, pm_action);
3854 
3855             nv_kthread_q_stop(&nvl->bottom_half_q);
3856 
3857             nv_disable_pat_support();
3858             break;
3859         }
3860         case NV_PM_ACTION_RESUME:
3861         {
3862             nv_enable_pat_support();
3863 
3864             nv_kthread_q_item_init(&nvl->bottom_half_q_item,
3865                                    nvidia_isr_bh_unlocked, (void *)nv);
3866 
3867             status = nv_kthread_q_init(&nvl->bottom_half_q, nv_device_name);
3868             if (status != NV_OK)
3869                 break;
3870 
3871             status = rm_power_management(sp, nv, pm_action);
3872             break;
3873         }
3874         default:
3875             status = NV_ERR_INVALID_ARGUMENT;
3876             break;
3877     }
3878 
3879 failure:
3880     nv_kmem_cache_free_stack(sp);
3881 
3882     return status;
3883 }
3884 
3885 static NV_STATUS
3886 nv_restore_user_channels(
3887     nv_state_t *nv
3888 )
3889 {
3890     NV_STATUS status = NV_OK;
3891     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
3892     nv_stack_t *sp = NULL;
3893 
3894     if (nv_kmem_cache_alloc_stack(&sp) != 0)
3895     {
3896         return NV_ERR_NO_MEMORY;
3897     }
3898 
3899     down(&nvl->ldata_lock);
3900 
3901     if ((nv->flags & NV_FLAG_OPEN) == 0)
3902     {
3903         goto done;
3904     }
3905 
3906     status = rm_restart_user_channels(sp, nv);
3907     WARN_ON(status != NV_OK);
3908 
3909     down(&nvl->mmap_lock);
3910 
3911     nv_set_safe_to_mmap_locked(nv, NV_TRUE);
3912 
3913     up(&nvl->mmap_lock);
3914 
3915     rm_unref_dynamic_power(sp, nv, NV_DYNAMIC_PM_FINE);
3916 
3917 done:
3918     up(&nvl->ldata_lock);
3919 
3920     nv_kmem_cache_free_stack(sp);
3921 
3922     return status;
3923 }
3924 
3925 static NV_STATUS
3926 nv_preempt_user_channels(
3927     nv_state_t *nv
3928 )
3929 {
3930     NV_STATUS status = NV_OK;
3931     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
3932     nv_stack_t *sp = NULL;
3933 
3934     if (nv_kmem_cache_alloc_stack(&sp) != 0)
3935     {
3936         return NV_ERR_NO_MEMORY;
3937     }
3938 
3939     down(&nvl->ldata_lock);
3940 
3941     if ((nv->flags & NV_FLAG_OPEN) == 0)
3942     {
3943         goto done;
3944     }
3945 
3946     status = rm_ref_dynamic_power(sp, nv, NV_DYNAMIC_PM_FINE);
3947     WARN_ON(status != NV_OK);
3948 
3949     down(&nvl->mmap_lock);
3950 
3951     nv_set_safe_to_mmap_locked(nv, NV_FALSE);
3952     nv_revoke_gpu_mappings_locked(nv);
3953 
3954     up(&nvl->mmap_lock);
3955 
3956     status = rm_stop_user_channels(sp, nv);
3957     WARN_ON(status != NV_OK);
3958 
3959 done:
3960     up(&nvl->ldata_lock);
3961 
3962     nv_kmem_cache_free_stack(sp);
3963 
3964     return status;
3965 }
3966 
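     /*
      * Suspend entry point shared by the PM core callbacks and the procfs
      * suspend interface. If the device is already suspended, this call only
      * bumps suspend_count; the matching resume then decrements the count
      * instead of performing a real resume until it reaches zero.
      */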
3967 static NV_STATUS
3968 nvidia_suspend(
3969     struct device *dev,
3970     nv_pm_action_t pm_action,
3971     NvBool is_procfs_suspend
3972 )
3973 {
3974     NV_STATUS status = NV_OK;
3975     struct pci_dev *pci_dev = NULL;
3976     nv_linux_state_t *nvl;
3977     nv_state_t *nv;
3978 
3979     if (dev_is_pci(dev))
3980     {
3981         pci_dev = to_pci_dev(dev);
3982         nvl = pci_get_drvdata(pci_dev);
3983     }
3984     else
3985     {
3986         nvl = dev_get_drvdata(dev);
3987     }
3988     nv = NV_STATE_PTR(nvl);
3989 
3990     down(&nvl->ldata_lock);
3991 
3992     if (((nv->flags & NV_FLAG_OPEN) == 0) &&
3993         ((nv->flags & NV_FLAG_PERSISTENT_SW_STATE) == 0))
3994     {
3995         goto done;
3996     }
3997 
3998     if ((nv->flags & NV_FLAG_SUSPENDED) != 0)
3999     {
4000         nvl->suspend_count++;
4001         goto pci_pm;
4002     }
4003 
4004     if (nv->preserve_vidmem_allocations && !is_procfs_suspend)
4005     {
4006         NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
4007                       "PreserveVideoMemoryAllocations module parameter is set. "
4008                       "System Power Management attempted without driver procfs suspend interface. "
4009                       "Please refer to the 'Configuring Power Management Support' section in the driver README.\n");
4010         status = NV_ERR_NOT_SUPPORTED;
4011         goto done;
4012     }
4013 
4014     nvidia_modeset_suspend(nv->gpu_id);
4015 
4016     status = nv_power_management(nv, pm_action);
4017 
4018     if (status != NV_OK)
4019     {
4020         nvidia_modeset_resume(nv->gpu_id);
4021         goto done;
4022     }
4023     else
4024     {
4025         nv->flags |= NV_FLAG_SUSPENDED;
4026     }
4027 
4028 pci_pm:
4029     /*
4030      * Check if PCI power state should be D0 during system suspend. The PCI PM
4031      * core will change the power state only if the driver has not saved the
4032      * state in its suspend callback.
4033      */
4034     if ((nv->d0_state_in_suspend) && (pci_dev != NULL) &&
4035         !is_procfs_suspend && (pm_action == NV_PM_ACTION_STANDBY))
4036     {
4037         pci_save_state(pci_dev);
4038     }
4039 
4040 done:
4041     up(&nvl->ldata_lock);
4042 
4043     return status;
4044 }
4045 
4046 static NV_STATUS
4047 nvidia_resume(
4048     struct device *dev,
4049     nv_pm_action_t pm_action
4050 )
4051 {
4052     NV_STATUS status = NV_OK;
4053     struct pci_dev *pci_dev;
4054     nv_linux_state_t *nvl;
4055     nv_state_t *nv;
4056 
4057     if (dev_is_pci(dev))
4058     {
4059         pci_dev = to_pci_dev(dev);
4060         nvl = pci_get_drvdata(pci_dev);
4061     }
4062     else
4063     {
4064         nvl = dev_get_drvdata(dev);
4065     }
4066     nv = NV_STATE_PTR(nvl);
4067 
4068     down(&nvl->ldata_lock);
4069 
4070     if ((nv->flags & NV_FLAG_SUSPENDED) == 0)
4071     {
4072         goto done;
4073     }
4074 
4075     if (nvl->suspend_count != 0)
4076     {
4077         nvl->suspend_count--;
4078     }
4079     else
4080     {
4081         status = nv_power_management(nv, pm_action);
4082 
4083         if (status == NV_OK)
4084         {
4085             nvidia_modeset_resume(nv->gpu_id);
4086             nv->flags &= ~NV_FLAG_SUSPENDED;
4087         }
4088     }
4089 
4090 done:
4091     up(&nvl->ldata_lock);
4092 
4093     return status;
4094 }
4095 
4096 static NV_STATUS
4097 nv_resume_devices(
4098     nv_pm_action_t pm_action,
4099     nv_pm_action_depth_t pm_action_depth
4100 )
4101 {
4102     nv_linux_state_t *nvl;
4103     NvBool resume_devices = NV_TRUE;
4104     NV_STATUS status;
4105 
4106     if (pm_action_depth == NV_PM_ACTION_DEPTH_MODESET)
4107     {
4108         goto resume_modeset;
4109     }
4110 
4111     if (pm_action_depth == NV_PM_ACTION_DEPTH_UVM)
4112     {
4113         resume_devices = NV_FALSE;
4114     }
4115 
4116     LOCK_NV_LINUX_DEVICES();
4117 
4118     for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
4119     {
4120         if (resume_devices)
4121         {
4122             status = nvidia_resume(nvl->dev, pm_action);
4123             WARN_ON(status != NV_OK);
4124         }
4125     }
4126 
4127     UNLOCK_NV_LINUX_DEVICES();
4128 
4129     status = nv_uvm_resume();
4130     WARN_ON(status != NV_OK);
4131 
4132     LOCK_NV_LINUX_DEVICES();
4133 
4134     for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
4135     {
4136         status = nv_restore_user_channels(NV_STATE_PTR(nvl));
4137         WARN_ON(status != NV_OK);
4138     }
4139 
4140     UNLOCK_NV_LINUX_DEVICES();
4141 
4142 resume_modeset:
4143     nvidia_modeset_resume(0);
4144 
4145     return NV_OK;
4146 }
4147 
4148 static NV_STATUS
4149 nv_suspend_devices(
4150     nv_pm_action_t pm_action,
4151     nv_pm_action_depth_t pm_action_depth
4152 )
4153 {
4154     nv_linux_state_t *nvl;
4155     NvBool resume_devices = NV_FALSE;
4156     NV_STATUS status = NV_OK;
4157 
4158     nvidia_modeset_suspend(0);
4159 
4160     if (pm_action_depth == NV_PM_ACTION_DEPTH_MODESET)
4161     {
4162         return NV_OK;
4163     }
4164 
4165     LOCK_NV_LINUX_DEVICES();
4166 
4167     for (nvl = nv_linux_devices; nvl != NULL && status == NV_OK; nvl = nvl->next)
4168     {
4169         status = nv_preempt_user_channels(NV_STATE_PTR(nvl));
4170         WARN_ON(status != NV_OK);
4171     }
4172 
4173     UNLOCK_NV_LINUX_DEVICES();
4174 
4175     if (status == NV_OK)
4176     {
4177         status = nv_uvm_suspend();
4178         WARN_ON(status != NV_OK);
4179     }
4180     if (status != NV_OK)
4181     {
4182         goto done;
4183     }
4184 
4185     if (pm_action_depth == NV_PM_ACTION_DEPTH_UVM)
4186     {
4187         return NV_OK;
4188     }
4189 
4190     LOCK_NV_LINUX_DEVICES();
4191 
4192     for (nvl = nv_linux_devices; nvl != NULL && status == NV_OK; nvl = nvl->next)
4193     {
4194         status = nvidia_suspend(nvl->dev, pm_action, NV_TRUE);
4195         WARN_ON(status != NV_OK);
4196     }
4197     if (status != NV_OK)
4198     {
4199         resume_devices = NV_TRUE;
4200     }
4201 
4202     UNLOCK_NV_LINUX_DEVICES();
4203 
4204 done:
4205     if (status != NV_OK)
4206     {
4207         LOCK_NV_LINUX_DEVICES();
4208 
4209         for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
4210         {
4211             if (resume_devices)
4212             {
4213                 nvidia_resume(nvl->dev, pm_action);
4214             }
4215 
4216             nv_restore_user_channels(NV_STATE_PTR(nvl));
4217         }
4218 
4219         UNLOCK_NV_LINUX_DEVICES();
4220     }
4221 
4222     return status;
4223 }
4224 
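     /*
      * Transition all devices between system power states. Note the
      * asymmetric locking: nv_system_pm_lock is write-acquired on a
      * successful suspend and only released by the later transition back
      * to NV_POWER_STATE_RUNNING.
      */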
4225 NV_STATUS
4226 nv_set_system_power_state(
4227     nv_power_state_t power_state,
4228     nv_pm_action_depth_t pm_action_depth
4229 )
4230 {
4231     NV_STATUS status;
4232     nv_pm_action_t pm_action;
4233 
4234     switch (power_state)
4235     {
4236         case NV_POWER_STATE_IN_HIBERNATE:
4237             pm_action = NV_PM_ACTION_HIBERNATE;
4238             break;
4239         case NV_POWER_STATE_IN_STANDBY:
4240             pm_action = NV_PM_ACTION_STANDBY;
4241             break;
4242         case NV_POWER_STATE_RUNNING:
4243             pm_action = NV_PM_ACTION_RESUME;
4244             break;
4245         default:
4246             return NV_ERR_INVALID_ARGUMENT;
4247     }
4248 
4249     down(&nv_system_power_state_lock);
4250 
4251     if (nv_system_power_state == power_state)
4252     {
4253         status = NV_OK;
4254         goto done;
4255     }
4256 
4257     if (power_state == NV_POWER_STATE_RUNNING)
4258     {
4259         status = nv_resume_devices(pm_action, nv_system_pm_action_depth);
4260         up_write(&nv_system_pm_lock);
4261     }
4262     else
4263     {
4264         if (nv_system_power_state != NV_POWER_STATE_RUNNING)
4265         {
4266             status = NV_ERR_INVALID_ARGUMENT;
4267             goto done;
4268         }
4269 
4270         nv_system_pm_action_depth = pm_action_depth;
4271 
4272         down_write(&nv_system_pm_lock);
4273         status = nv_suspend_devices(pm_action, nv_system_pm_action_depth);
4274         if (status != NV_OK)
4275         {
4276             up_write(&nv_system_pm_lock);
4277             goto done;
4278         }
4279     }
4280 
4281     nv_system_power_state = power_state;
4282 
4283 done:
4284     up(&nv_system_power_state_lock);
4285 
4286     return status;
4287 }
4288 
4289 int nv_pmops_suspend(
4290     struct device *dev
4291 )
4292 {
4293     NV_STATUS status;
4294 
4295     status = nvidia_suspend(dev, NV_PM_ACTION_STANDBY, NV_FALSE);
4296     return (status == NV_OK) ? 0 : -EIO;
4297 }
4298 
4299 int nv_pmops_resume(
4300     struct device *dev
4301 )
4302 {
4303     NV_STATUS status;
4304 
4305     status = nvidia_resume(dev, NV_PM_ACTION_RESUME);
4306     return (status == NV_OK) ? 0 : -EIO;
4307 }
4308 
4309 int nv_pmops_freeze(
4310     struct device *dev
4311 )
4312 {
4313     NV_STATUS status;
4314 
4315     status = nvidia_suspend(dev, NV_PM_ACTION_HIBERNATE, NV_FALSE);
4316     return (status == NV_OK) ? 0 : -EIO;
4317 }
4318 
4319 int nv_pmops_thaw(
4320     struct device *dev
4321 )
4322 {
4323     return 0;
4324 }
4325 
4326 int nv_pmops_restore(
4327     struct device *dev
4328 )
4329 {
4330     NV_STATUS status;
4331 
4332     status = nvidia_resume(dev, NV_PM_ACTION_RESUME);
4333     return (status == NV_OK) ? 0 : -EIO;
4334 }
4335 
4336 int nv_pmops_poweroff(
4337     struct device *dev
4338 )
4339 {
4340     return 0;
4341 }
4342 
4343 static int
4344 nvidia_transition_dynamic_power(
4345     struct device *dev,
4346     NvBool enter
4347 )
4348 {
4349     struct pci_dev *pci_dev = to_pci_dev(dev);
4350     nv_linux_state_t *nvl = pci_get_drvdata(pci_dev);
4351     nv_state_t *nv = NV_STATE_PTR(nvl);
4352     nvidia_stack_t *sp = NULL;
4353     NV_STATUS status;
4354 
4355     if ((nv->flags & (NV_FLAG_OPEN | NV_FLAG_PERSISTENT_SW_STATE)) == 0)
4356     {
4357         return 0;
4358     }
4359 
4360     if (nv_kmem_cache_alloc_stack(&sp) != 0)
4361     {
4362         return -ENOMEM;
4363     }
4364 
4365     status = rm_transition_dynamic_power(sp, nv, enter);
4366 
4367     nv_kmem_cache_free_stack(sp);
4368 
4369     return (status == NV_OK) ? 0 : -EIO;
4370 }
4371 
4372 int nv_pmops_runtime_suspend(
4373     struct device *dev
4374 )
4375 {
4376     return nvidia_transition_dynamic_power(dev, NV_TRUE);
4377 }
4378 
4379 int nv_pmops_runtime_resume(
4380     struct device *dev
4381 )
4382 {
4383     return nvidia_transition_dynamic_power(dev, NV_FALSE);
4384 }
4385 #endif /* defined(CONFIG_PM) */
4386 
4387 nv_state_t* NV_API_CALL nv_get_adapter_state(
4388     NvU32 domain,
4389     NvU8  bus,
4390     NvU8  slot
4391 )
4392 {
4393     nv_linux_state_t *nvl;
4394 
4395     LOCK_NV_LINUX_DEVICES();
4396     for (nvl = nv_linux_devices; nvl != NULL;  nvl = nvl->next)
4397     {
4398         nv_state_t *nv = NV_STATE_PTR(nvl);
4399         if (nv->pci_info.domain == domain && nv->pci_info.bus == bus
4400             && nv->pci_info.slot == slot)
4401         {
4402             UNLOCK_NV_LINUX_DEVICES();
4403             return nv;
4404         }
4405     }
4406     UNLOCK_NV_LINUX_DEVICES();
4407 
4408     return NULL;
4409 }
4410 
4411 nv_state_t* NV_API_CALL nv_get_ctl_state(void)
4412 {
4413     return NV_STATE_PTR(&nv_ctl_device);
4414 }
4415 
4416 NV_STATUS NV_API_CALL nv_log_error(
4417     nv_state_t *nv,
4418     NvU32       error_number,
4419     const char *format,
4420     va_list    ap
4421 )
4422 {
4423     NV_STATUS status = NV_OK;
4424     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
4425 
4426     nv_report_error(nvl->pci_dev, error_number, format, ap);
4427 #if defined(CONFIG_CRAY_XT)
4428     status = nvos_forward_error_to_cray(nvl->pci_dev, error_number,
4429                 format, ap);
4430 #endif
4431 
4432     return status;
4433 }
4434 
4435 NvU64 NV_API_CALL nv_get_dma_start_address(
4436     nv_state_t *nv
4437 )
4438 {
4439 #if defined(NVCPU_PPC64LE)
4440     struct pci_dev *pci_dev;
4441     dma_addr_t dma_addr;
4442     NvU64 saved_dma_mask;
4443     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
4444 
4445     /*
4446      * If TCE bypass is disabled via a module parameter, then just return
4447      * the default (which is 0).
4448      *
4449      * Otherwise, the DMA start address only needs to be set once, and it
4450      * won't change afterward. Just return the cached value if asked again,
4451      * to avoid the kernel printing redundant messages to the kernel
4452      * log when we call pci_set_dma_mask().
4453      */
4454     if ((nv_tce_bypass_mode == NV_TCE_BYPASS_MODE_DISABLE) ||
4455         (nvl->tce_bypass_enabled))
4456     {
4457         return nvl->dma_dev.addressable_range.start;
4458     }
4459 
4460     pci_dev = nvl->pci_dev;
4461 
4462     /*
4463      * Linux on IBM POWER8 offers 2 different DMA set-ups, sometimes
4464      * referred to as "windows".
4465      *
4466      * The "default window" provides a 2GB region of PCI address space
4467      * located below the 32-bit line. The IOMMU is used to provide a
4468      * "rich" mapping--any page in system memory can be mapped at an
4469      * arbitrary address within this window. The mappings are dynamic
4470      * and pass in and out of being as pci_map*()/pci_unmap*() calls
4471      * are made.
4472      *
4473      * Dynamic DMA Windows (sometimes "Huge DDW") provides a linear
4474      * mapping of the system's entire physical address space at some
4475      * fixed offset above the 59-bit line. IOMMU is still used, and
4476      * pci_map*()/pci_unmap*() are still required, but mappings are
4477      * static. They're effectively set up in advance, and any given
4478      * system page will always map to the same PCI bus address. I.e.
4479      *   physical 0x00000000xxxxxxxx => PCI 0x08000000xxxxxxxx
4480      *
4481      * This driver does not support the 2G default window because
4482      * of its limited size, and for reasons having to do with UVM.
4483      *
4484      * Linux on POWER8 will only provide the DDW-style full linear
4485      * mapping when the driver claims support for 64-bit DMA addressing
4486      * (a pre-requisite because the PCI addresses used in this case will
4487      * be near the top of the 64-bit range). The linear mapping
4488      * is not available in all system configurations.
4489      *
4490      * Detect whether the linear mapping is present by claiming
4491      * 64-bit support and then mapping physical page 0. For historical
4492      * reasons, Linux on POWER8 will never map a page to PCI address 0x0.
4493      * In the "default window" case page 0 will be mapped to some
4494      * non-zero address below the 32-bit line.  In the
4495      * DDW/linear-mapping case, it will be mapped to address 0 plus
4496      * some high-order offset.
4497      *
4498      * If the linear mapping is present and sane then return the offset
4499      * as the starting address for all DMA mappings.
4500      */
4501     saved_dma_mask = pci_dev->dma_mask;
4502     if (pci_set_dma_mask(pci_dev, DMA_BIT_MASK(64)) != 0)
4503     {
4504         goto done;
4505     }
4506 
4507     dma_addr = pci_map_single(pci_dev, NULL, 1, DMA_BIDIRECTIONAL);
4508     if (pci_dma_mapping_error(pci_dev, dma_addr))
4509     {
4510         pci_set_dma_mask(pci_dev, saved_dma_mask);
4511         goto done;
4512     }
4513 
4514     pci_unmap_single(pci_dev, dma_addr, 1, DMA_BIDIRECTIONAL);
4515 
4516     /*
4517      * From IBM: "For IODA2, native DMA bypass or KVM TCE-based implementation
4518      * of full 64-bit DMA support will establish a window in address-space
4519      * with the high 14 bits being constant and the bottom up-to-50 bits
4520      * varying with the mapping."
4521      *
4522      * Unfortunately, we don't have any good interfaces or definitions from
4523      * the kernel to get information about the DMA offset assigned by OS.
4524      * However, we have been told that the offset will be defined by the top
4525      * 14 bits of the address, and bits 40-49 will not vary for any DMA
4526      * mappings until 1TB of system memory is surpassed; this limitation is
4527      * essential for us to function properly since our current GPUs only
4528      * support 40 physical address bits. We are in a fragile place where we
4529      * need to tell the OS that we're capable of 64-bit addressing, while
4530      * relying on the assumption that the top 24 bits will not vary in this
4531      * case.
4532      *
4533      * The way we try to compute the window, then, is mask the trial mapping
4534      * against the DMA capabilities of the device. That way, devices with
4535      * greater addressing capabilities will only take the bits it needs to
4536      * define the window.
4537      */
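         /*
          * Illustrative example with hypothetical values: if the trial mapping
          * of page 0 comes back as dma_addr = 0x0800000000000000 and the saved
          * device mask is DMA_BIT_MASK(40), then
          *   dma_addr & ~saved_dma_mask = 0x0800000000000000
          * becomes the DMA start address below, leaving bits 39:0 free to vary
          * per-mapping within the device's addressing limit.
          */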
4538     if ((dma_addr & DMA_BIT_MASK(32)) != 0)
4539     {
4540         /*
4541          * Huge DDW not available - page 0 mapped to non-zero address below
4542          * the 32-bit line.
4543          */
4544         nv_printf(NV_DBG_WARNINGS,
4545             "NVRM: DMA window limited by platform\n");
4546         pci_set_dma_mask(pci_dev, saved_dma_mask);
4547         goto done;
4548     }
4549     else if ((dma_addr & saved_dma_mask) != 0)
4550     {
4551         NvU64 memory_size = os_get_num_phys_pages() * PAGE_SIZE;
4552         if ((dma_addr & ~saved_dma_mask) !=
4553             ((dma_addr + memory_size) & ~saved_dma_mask))
4554         {
4555             /*
4556              * The physical window straddles our addressing limit boundary,
4557              * e.g., for an adapter that can address up to 1TB, the window
4558              * crosses the 40-bit limit so that the lower end of the range
4559              * has different bits 63:40 than the higher end of the range.
4560              * We can only handle a single, static value for bits 63:40, so
4561              * we must fall back here.
4562              */
4563             nv_printf(NV_DBG_WARNINGS,
4564                 "NVRM: DMA window limited by memory size\n");
4565             pci_set_dma_mask(pci_dev, saved_dma_mask);
4566             goto done;
4567         }
4568     }
4569 
4570     nvl->tce_bypass_enabled = NV_TRUE;
4571     nvl->dma_dev.addressable_range.start = dma_addr & ~(saved_dma_mask);
4572 
4573     /* Update the coherent mask to match */
4574     dma_set_coherent_mask(&pci_dev->dev, pci_dev->dma_mask);
4575 
4576 done:
4577     return nvl->dma_dev.addressable_range.start;
4578 #else
4579     return 0;
4580 #endif
4581 }
4582 
4583 NV_STATUS NV_API_CALL nv_set_primary_vga_status(
4584     nv_state_t *nv
4585 )
4586 {
4587     /* IORESOURCE_ROM_SHADOW wasn't added until 2.6.10 */
4588 #if defined(IORESOURCE_ROM_SHADOW)
4589     nv_linux_state_t *nvl;
4590     struct pci_dev *pci_dev;
4591 
4592     nvl = NV_GET_NVL_FROM_NV_STATE(nv);
4593     pci_dev = nvl->pci_dev;
4594 
4595     nv->primary_vga = ((NV_PCI_RESOURCE_FLAGS(pci_dev, PCI_ROM_RESOURCE) &
4596         IORESOURCE_ROM_SHADOW) == IORESOURCE_ROM_SHADOW);
4597     return NV_OK;
4598 #else
4599     return NV_ERR_NOT_SUPPORTED;
4600 #endif
4601 }
4602 
4603 NV_STATUS NV_API_CALL nv_pci_trigger_recovery(
4604      nv_state_t *nv
4605 )
4606 {
4607     NV_STATUS status = NV_ERR_NOT_SUPPORTED;
4608 #if defined(NV_PCI_ERROR_RECOVERY)
4609     nv_linux_state_t *nvl       = NV_GET_NVL_FROM_NV_STATE(nv);
4610 
4611     /*
4612      * Calling readl() on PPC64LE will allow the kernel to check its state for
4613      * the device and update it accordingly. This needs to be done before
4614      * checking if the PCI channel is offline, so that we don't check stale
4615      * state.
4616      *
4617      * This will also kick off the recovery process for the device.
4618      */
4619     if (NV_PCI_ERROR_RECOVERY_ENABLED())
4620     {
4621         if (readl(nv->regs->map) == 0xFFFFFFFF)
4622         {
4623             if (pci_channel_offline(nvl->pci_dev))
4624             {
4625                 NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
4626                               "PCI channel for the device is offline\n");
4627                 status = NV_OK;
4628             }
4629         }
4630     }
4631 #endif
4632     return status;
4633 }
4634 
4635 NvBool NV_API_CALL nv_requires_dma_remap(
4636     nv_state_t *nv
4637 )
4638 {
4639     NvBool dma_remap = NV_FALSE;
4640     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
4641     dma_remap = !nv_dma_maps_swiotlb(nvl->dev);
4642     return dma_remap;
4643 }
4644 
4645 /*
4646  * Intended for use by external kernel modules to list nvidia gpu ids.
4647  */
4648 NvBool nvidia_get_gpuid_list(NvU32 *gpu_ids, NvU32 *gpu_count)
4649 {
4650     nv_linux_state_t *nvl;
4651     unsigned int count;
4652     NvBool ret = NV_TRUE;
4653 
4654     LOCK_NV_LINUX_DEVICES();
4655 
4656     count = 0;
4657     for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
4658         count++;
4659 
4660     if (*gpu_count == 0)
4661     {
4662         goto done;
4663     }
4664     else if ((*gpu_count) < count)
4665     {
4666         ret = NV_FALSE;
4667         goto done;
4668     }
4669 
4670     count = 0;
4671     for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
4672     {
4673         nv_state_t *nv = NV_STATE_PTR(nvl);
4674         gpu_ids[count++] = nv->gpu_id;
4675     }
4676 
4677 
4678 done:
4679 
4680     *gpu_count = count;
4681 
4682     UNLOCK_NV_LINUX_DEVICES();
4683 
4684     return ret;
4685 }
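     /*
      * One plausible calling pattern for an external module (illustrative
      * sketch, not compiled into the driver; the function name is
      * hypothetical): query the count first by passing *gpu_count == 0,
      * then allocate an array and call again to fill it.
      */
     #if 0
     static void example_list_gpus(void)
     {
         NvU32 count = 0;
         NvU32 *ids;

         nvidia_get_gpuid_list(NULL, &count);    /* only reports the device count */
         if (count == 0)
             return;

         ids = kcalloc(count, sizeof(*ids), GFP_KERNEL);
         if (ids == NULL)
             return;

         if (nvidia_get_gpuid_list(ids, &count)) /* fills ids[0..count-1] */
         {
             /* ... use the gpu ids ... */
         }

         kfree(ids);
     }
     #endif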
4686 
4687 /*
4688  * Kernel-level analog to nvidia_open, intended for use by external
4689  * kernel modules. This increments the ref count of the device with
4690  * the given gpu_id and makes sure the device has been initialized.
4691  *
4692  * Clients of this interface are counted by the RM reset path, to ensure a
4693  * GPU is not reset while the GPU is active.
4694  *
4695  * Returns -ENODEV if the given gpu_id does not exist.
4696  */
4697 int nvidia_dev_get(NvU32 gpu_id, nvidia_stack_t *sp)
4698 {
4699     nv_linux_state_t *nvl;
4700     int rc;
4701 
4702     /* Takes nvl->ldata_lock */
4703     nvl = find_gpu_id(gpu_id);
4704     if (!nvl)
4705         return -ENODEV;
4706 
4707     rc = nv_open_device(NV_STATE_PTR(nvl), sp);
4708 
4709     if (rc == 0)
4710         WARN_ON(rm_set_external_kernel_client_count(sp, NV_STATE_PTR(nvl), NV_TRUE) != NV_OK);
4711 
4712     up(&nvl->ldata_lock);
4713     return rc;
4714 }
4715 
4716 /*
4717  * Kernel-level analog to nvidia_close, intended for use by external
4718  * kernel modules. This decrements the ref count of the device with
4719  * the given gpu_id, potentially tearing it down.
4720  */
4721 void nvidia_dev_put(NvU32 gpu_id, nvidia_stack_t *sp)
4722 {
4723     nv_linux_state_t *nvl;
4724 
4725     /* Takes nvl->ldata_lock */
4726     nvl = find_gpu_id(gpu_id);
4727     if (!nvl)
4728         return;
4729 
4730     nv_close_device(NV_STATE_PTR(nvl), sp);
4731 
4732     WARN_ON(rm_set_external_kernel_client_count(sp, NV_STATE_PTR(nvl), NV_FALSE) != NV_OK);
4733 
4734     up(&nvl->ldata_lock);
4735 }
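     /*
      * Illustrative sketch (not compiled into the driver) of how an external
      * module might pair nvidia_dev_get()/nvidia_dev_put() around GPU use,
      * reusing this file's stack allocation helpers. The function name is
      * hypothetical.
      */
     #if 0
     static int example_with_gpu(NvU32 gpu_id)
     {
         nvidia_stack_t *sp = NULL;
         int rc;

         if (nv_kmem_cache_alloc_stack(&sp) != 0)
             return -ENOMEM;

         rc = nvidia_dev_get(gpu_id, sp);  /* refcounts and initializes the device */
         if (rc == 0)
         {
             /* ... interact with the GPU ... */
             nvidia_dev_put(gpu_id, sp);   /* drop the reference, may tear down */
         }

         nv_kmem_cache_free_stack(sp);
         return rc;
     }
     #endif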
4736 
4737 /*
4738  * Like nvidia_dev_get but uses UUID instead of gpu_id. Note that this may
4739  * trigger initialization and teardown of unrelated devices to look up their
4740  * UUIDs.
4741  *
4742  * Clients of this interface are counted by the RM reset path, to ensure a
4743  * GPU is not reset while the GPU is active.
4744  */
4745 int nvidia_dev_get_uuid(const NvU8 *uuid, nvidia_stack_t *sp)
4746 {
4747     nv_state_t *nv = NULL;
4748     nv_linux_state_t *nvl = NULL;
4749     const NvU8 *dev_uuid;
4750     int rc = 0;
4751 
4752     /* Takes nvl->ldata_lock */
4753     nvl = find_uuid_candidate(uuid);
4754     while (nvl)
4755     {
4756         nv = NV_STATE_PTR(nvl);
4757 
4758         /*
4759          * If the device is missing its UUID, this call exists solely so
4760          * rm_get_gpu_uuid_raw will be called and we can inspect the UUID.
4761          */
4762         rc = nv_open_device(nv, sp);
4763         if (rc != 0)
4764             goto out;
4765 
4766         /* The UUID should always be present following nv_open_device */
4767         dev_uuid = nv_get_cached_uuid(nv);
4768         WARN_ON(!dev_uuid);
4769         if (dev_uuid && memcmp(dev_uuid, uuid, GPU_UUID_LEN) == 0)
4770             break;
4771 
4772         /* No match, try again. */
4773         nv_close_device(nv, sp);
4774         up(&nvl->ldata_lock);
4775         nvl = find_uuid_candidate(uuid);
4776     }
4777 
4778     if (nvl)
4779     {
4780         rc = 0;
4781         WARN_ON(rm_set_external_kernel_client_count(sp, NV_STATE_PTR(nvl), NV_TRUE) != NV_OK);
4782     }
4783     else
4784         rc = -ENODEV;
4785 
4786 out:
4787     if (nvl)
4788         up(&nvl->ldata_lock);
4789     return rc;
4790 }
4791 
4792 /*
4793  * Like nvidia_dev_put but uses UUID instead of gpu_id.
4794  */
4795 void nvidia_dev_put_uuid(const NvU8 *uuid, nvidia_stack_t *sp)
4796 {
4797     nv_linux_state_t *nvl;
4798 
4799     /* Callers must already have called nvidia_dev_get_uuid() */
4800 
4801     /* Takes nvl->ldata_lock */
4802     nvl = find_uuid(uuid);
4803     if (!nvl)
4804         return;
4805 
4806     nv_close_device(NV_STATE_PTR(nvl), sp);
4807 
4808     WARN_ON(rm_set_external_kernel_client_count(sp, NV_STATE_PTR(nvl), NV_FALSE) != NV_OK);
4809 
4810     up(&nvl->ldata_lock);
4811 }
4812 
4813 int nvidia_dev_block_gc6(const NvU8 *uuid, nvidia_stack_t *sp)
4814 
4815 {
4816     nv_linux_state_t *nvl;
4817 
4818     /* Callers must already have called nvidia_dev_get_uuid() */
4819 
4820     /* Takes nvl->ldata_lock */
4821     nvl = find_uuid(uuid);
4822     if (!nvl)
4823         return -ENODEV;
4824 
4825     if (rm_ref_dynamic_power(sp, NV_STATE_PTR(nvl), NV_DYNAMIC_PM_FINE) != NV_OK)
4826     {
4827         up(&nvl->ldata_lock);
4828         return -EINVAL;
4829     }
4830 
4831     up(&nvl->ldata_lock);
4832 
4833     return 0;
4834 }
4835 
4836 int nvidia_dev_unblock_gc6(const NvU8 *uuid, nvidia_stack_t *sp)
4837 
4838 {
4839     nv_linux_state_t *nvl;
4840 
4841     /* Callers must already have called nvidia_dev_get_uuid() */
4842 
4843     /* Takes nvl->ldata_lock */
4844     nvl = find_uuid(uuid);
4845     if (!nvl)
4846         return -ENODEV;
4847 
4848     rm_unref_dynamic_power(sp, NV_STATE_PTR(nvl), NV_DYNAMIC_PM_FINE);
4849 
4850     up(&nvl->ldata_lock);
4851 
4852     return 0;
4853 }
4854 
4855 NV_STATUS NV_API_CALL nv_get_device_memory_config(
4856     nv_state_t *nv,
4857     NvU64 *compr_addr_sys_phys,
4858     NvU64 *addr_guest_phys,
4859     NvU32 *addr_width,
4860     NvS32 *node_id
4861 )
4862 {
4863     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
4864     NV_STATUS status = NV_ERR_NOT_SUPPORTED;
4865 
4866     if (!nv_platform_supports_numa(nvl))
4867     {
4868         return NV_ERR_NOT_SUPPORTED;
4869     }
4870 
4871 #if defined(NVCPU_PPC64LE)
4872     nv_npu_numa_info_t *numa_info;
4873 
4874     numa_info = &nvl->npu->numa_info;
4875 
4876     if (node_id != NULL)
4877     {
4878         *node_id = nvl->numa_info.node_id;
4879     }
4880 
4881     if (compr_addr_sys_phys != NULL)
4882     {
4883         *compr_addr_sys_phys =
4884             numa_info->compr_sys_phys_addr;
4885     }
4886 
4887     if (addr_guest_phys != NULL)
4888     {
4889         *addr_guest_phys =
4890             numa_info->guest_phys_addr;
4891     }
4892 
4893     if (addr_width != NULL)
4894     {
4895         *addr_width = nv_volta_dma_addr_size - nv_volta_addr_space_width;
4896     }
4897 
4898     status = NV_OK;
4899 #endif
4900 
4901     return status;
4902 }
4903 
4904 #if defined(NVCPU_PPC64LE)
4905 
4906 NV_STATUS NV_API_CALL nv_get_nvlink_line_rate(
4907     nv_state_t *nvState,
4908     NvU32      *linerate
4909 )
4910 {
4911 #if defined(NV_PNV_PCI_GET_NPU_DEV_PRESENT)
4912 
4913     nv_linux_state_t *nvl;
4914     struct pci_dev   *npuDev;
4915     NvU32            *pSpeedPtr = NULL;
4916     NvU32            speed;
4917     int              len;
4918 
4919     if (nvState != NULL)
4920         nvl = NV_GET_NVL_FROM_NV_STATE(nvState);
4921     else
4922         return NV_ERR_INVALID_ARGUMENT;
4923 
4924     if (!nvl->npu)
4925     {
4926         return NV_ERR_NOT_SUPPORTED;
4927     }
4928 
4929     npuDev = nvl->npu->devs[0];
4930     if (!npuDev->dev.of_node)
4931     {
4932         nv_printf(NV_DBG_ERRORS, "NVRM: %s: OF Node not found in IBM-NPU device node\n",
4933                   __FUNCTION__);
4934         return NV_ERR_NOT_SUPPORTED;
4935     }
4936 
4937     pSpeedPtr = (NvU32 *) of_get_property(npuDev->dev.of_node, "ibm,nvlink-speed", &len);
4938 
4939     if (pSpeedPtr)
4940     {
4941         speed = (NvU32) be32_to_cpup(pSpeedPtr);
4942     }
4943     else
4944     {
4945         return NV_ERR_NOT_SUPPORTED;
4946     }
4947 
4948     if (!speed)
4949     {
4950         return NV_ERR_NOT_SUPPORTED;
4951     }
4952     else
4953     {
4954         *linerate = speed;
4955     }
4956 
4957     return NV_OK;
4958 
4959 #endif
4960 
4961     return NV_ERR_NOT_SUPPORTED;
4962 }
4963 
4964 #endif
4965 
4966 NV_STATUS NV_API_CALL nv_indicate_idle(
4967     nv_state_t *nv
4968 )
4969 {
4970 #if defined(NV_PM_RUNTIME_AVAILABLE)
4971     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
4972     struct device *dev = nvl->dev;
4973     struct file *file = nvl->sysfs_config_file;
4974     loff_t f_pos = 0;
4975     char buf;
4976 
4977     pm_runtime_put_noidle(dev);
4978 
4979 #if defined(NV_SEQ_READ_ITER_PRESENT)
4980     {
4981         struct kernfs_open_file *of = ((struct seq_file *)file->private_data)->private;
4982         struct kernfs_node *kn;
4983 
4984         mutex_lock(&of->mutex);
4985         kn = of->kn;
4986         if (kn != NULL && atomic_inc_unless_negative(&kn->active))
4987         {
4988             if ((kn->attr.ops != NULL) && (kn->attr.ops->read != NULL))
4989             {
4990                 kn->attr.ops->read(of, &buf, 1, f_pos);
4991             }
4992             atomic_dec(&kn->active);
4993         }
4994         mutex_unlock(&of->mutex);
4995     }
4996 #else
4997 #if defined(NV_KERNEL_READ_HAS_POINTER_POS_ARG)
4998     kernel_read(file, &buf, 1, &f_pos);
4999 #else
5000     kernel_read(file, f_pos, &buf, 1);
5001 #endif
5002 #endif
5003 
5004     return NV_OK;
5005 #else
5006     return NV_ERR_NOT_SUPPORTED;
5007 #endif
5008 }
5009 
5010 NV_STATUS NV_API_CALL nv_indicate_not_idle(
5011     nv_state_t *nv
5012 )
5013 {
5014 #if defined(NV_PM_RUNTIME_AVAILABLE)
5015     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
5016     struct device *dev = nvl->dev;
5017 
5018     pm_runtime_get_noresume(dev);
5019 
5020     nvl->is_forced_shutdown = NV_TRUE;
5021     pci_bus_type.shutdown(dev);
5022 
5023     return NV_OK;
5024 #else
5025     return NV_ERR_NOT_SUPPORTED;
5026 #endif
5027 }
5028 
5029 void NV_API_CALL nv_idle_holdoff(
5030     nv_state_t *nv
5031 )
5032 {
5033 #if defined(NV_PM_RUNTIME_AVAILABLE)
5034     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
5035     struct device *dev = nvl->dev;
5036 
5037     pm_runtime_get_noresume(dev);
5038 #endif
5039 }
5040 
5041 NvBool NV_API_CALL nv_dynamic_power_available(
5042     nv_state_t *nv
5043 )
5044 {
5045 #if defined(NV_PM_RUNTIME_AVAILABLE)
5046     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
5047     return nvl->sysfs_config_file != NULL;
5048 #else
5049     return NV_FALSE;
5050 #endif
5051 }
5052 
5053 /* caller should hold nv_linux_devices_lock using LOCK_NV_LINUX_DEVICES */
5054 void nv_linux_add_device_locked(nv_linux_state_t *nvl)
5055 {
5056     if (nv_linux_devices == NULL) {
5057         nv_linux_devices = nvl;
5058     }
5059     else
5060     {
5061         nv_linux_state_t *tnvl;
5062         for (tnvl = nv_linux_devices; tnvl->next != NULL;  tnvl = tnvl->next);
5063         tnvl->next = nvl;
5064     }
5065 }
5066 
5067 /* caller should hold nv_linux_devices_lock using LOCK_NV_LINUX_DEVICES */
5068 void nv_linux_remove_device_locked(nv_linux_state_t *nvl)
5069 {
5070     if (nvl == nv_linux_devices) {
5071         nv_linux_devices = nvl->next;
5072     }
5073     else
5074     {
5075         nv_linux_state_t *tnvl;
5076         for (tnvl = nv_linux_devices; tnvl->next != nvl;  tnvl = tnvl->next);
5077         tnvl->next = nvl->next;
5078     }
5079 }
5080 
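     /*
      * Enables or disables all SoC interrupt lines for the device, using the
      * per-IRQ ref_count so each line is transitioned at most once. This is
      * a no-op while a SoC interrupt is currently being serviced.
      */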
5081 void NV_API_CALL nv_control_soc_irqs(nv_state_t *nv, NvBool bEnable)
5082 {
5083     int count;
5084     unsigned long flags;
5085     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
5086 
5087     if (nv->current_soc_irq != -1)
5088         return;
5089 
5090     NV_SPIN_LOCK_IRQSAVE(&nvl->soc_isr_lock, flags);
5091     if (bEnable)
5092     {
5093         for (count = 0; count < nv->num_soc_irqs; count++)
5094         {
5095             if (nv->soc_irq_info[count].ref_count == 0)
5096             {
5097                 nv->soc_irq_info[count].ref_count++;
5098                 enable_irq(nv->soc_irq_info[count].irq_num);
5099             }
5100         }
5101     }
5102     else
5103     {
5104         for (count = 0; count < nv->num_soc_irqs; count++)
5105         {
5106             if (nv->soc_irq_info[count].ref_count == 1)
5107             {
5108                 nv->soc_irq_info[count].ref_count--;
5109                 disable_irq_nosync(nv->soc_irq_info[count].irq_num);
5110             }
5111         }
5112     }
5113     NV_SPIN_UNLOCK_IRQRESTORE(&nvl->soc_isr_lock, flags);
5114 }
5115 
5116 NvU32 NV_API_CALL nv_get_dev_minor(nv_state_t *nv)
5117 {
5118     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
5119 
5120     return nvl->minor_num;
5121 }
5122 
5123 NV_STATUS NV_API_CALL nv_acquire_fabric_mgmt_cap(int fd, int *duped_fd)
5124 {
5125     *duped_fd = nvlink_cap_acquire(fd, NVLINK_CAP_FABRIC_MANAGEMENT);
5126     if (*duped_fd < 0)
5127     {
5128         return NV_ERR_INSUFFICIENT_PERMISSIONS;
5129     }
5130 
5131     return NV_OK;
5132 }
5133 
5134 /*
5135  * Wakes up the NVIDIA GPU HDA codec and controller by reading the
5136  * codec proc file.
5137  */
5138 void NV_API_CALL nv_audio_dynamic_power(
5139     nv_state_t *nv
5140 )
5141 {
5142 /*
5143  * Runtime power management for the NVIDIA HDA controller is only possible
5144  * after commit 07f4f97d7b4b ("vga_switcheroo: Use device link for HDA
5145  * controller"). This commit also moved the 'PCI_CLASS_MULTIMEDIA_HD_AUDIO'
5146  * macro from <sound/hdaudio.h> to <linux/pci_ids.h>.
5147  * If 'NV_PCI_CLASS_MULTIMEDIA_HD_AUDIO_PRESENT' is not defined, then
5148  * this function will be a stub.
5149  *
5150  * Also, check if runtime PM is enabled in the kernel (with
5151  * 'NV_PM_RUNTIME_AVAILABLE') and stub this function if it is disabled. This
5152  * function uses kernel fields only present when the kconfig has runtime PM
5153  * enabled.
5154  */
5155 #if defined(NV_PCI_CLASS_MULTIMEDIA_HD_AUDIO_PRESENT) && defined(NV_PM_RUNTIME_AVAILABLE)
5156     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
5157     struct device *dev = nvl->dev;
5158     struct pci_dev *audio_pci_dev, *pci_dev;
5159     struct snd_card *card;
5160 
5161     if (!dev_is_pci(dev))
5162         return;
5163 
5164     pci_dev = to_pci_dev(dev);
5165 
5166     audio_pci_dev = os_pci_init_handle(NV_PCI_DOMAIN_NUMBER(pci_dev),
5167                                        NV_PCI_BUS_NUMBER(pci_dev),
5168                                        NV_PCI_SLOT_NUMBER(pci_dev),
5169                                        1, NULL, NULL);
5170 
5171     if (audio_pci_dev == NULL)
5172         return;
5173 
5174     /*
5175      * Check if the HDA controller is in the PM suspended state. The HDA
5176      * controller cannot be runtime resumed if this API is called during
5177      * system suspend/resume while the controller is in that state.
5178      */
5179     if (audio_pci_dev->dev.power.is_suspended)
5180         return;
5181 
5182     card = pci_get_drvdata(audio_pci_dev);
5183     if (card == NULL)
5184         return;
5185 
5186     /*
5187      * Commit be57bfffb7b5 ("ALSA: hda: move hda_codec.h to include/sound")
5188      * in v4.20-rc1 moved "hda_codec.h" header file from the private sound
5189      * folder to include/sound.
5190      */
5191 #if defined(NV_SOUND_HDA_CODEC_H_PRESENT)
5192     {
5193         struct list_head *p;
5194         struct hda_codec *codec = NULL;
5195         unsigned int cmd, res;
5196 
5197         /*
5198          * Traverse the list of devices which the sound card maintains and
5199          * search for HDA codec controller.
5200          */
5201         list_for_each_prev(p, &card->devices)
5202         {
5203             struct snd_device *pdev = list_entry(p, struct snd_device, list);
5204 
5205             if (pdev->type == SNDRV_DEV_CODEC)
5206             {
5207                 codec = pdev->device_data;
5208 
5209                 /*
5210                  * The NVIDIA HDA codec uses the Linux kernel HDA codec
5211                  * driver. Commit 05852448690d ("ALSA: hda - Support indirect
5212                  * execution of verbs") added support for overriding exec_verb;
5213                  * codec->core.exec_verb will be codec_exec_verb() for the
5214                  * NVIDIA HDA codec driver.
5215                  */
5216                 if (codec->core.exec_verb == NULL)
5217                 {
5218                     return;
5219                 }
5220 
5221                 break;
5222             }
5223         }
5224 
5225         if (codec == NULL)
5226         {
5227             return;
5228         }
5229 
5230         /* If HDA codec controller is already runtime active, then return */
5231         if (snd_hdac_is_power_on(&codec->core))
5232         {
5233             return;
5234         }
5235 
5236         /*
5237          * Encode codec verb for getting vendor ID from root node.
5238          * Refer Intel High Definition Audio Specification for more details.
5239          */
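             /*
              * Resulting 32-bit verb layout: bits 31:28 codec address, 27:20
              * node ID, 19:8 verb (AC_VERB_PARAMETERS), 7:0 verb payload
              * (AC_PAR_VENDOR_ID, the parameter being queried).
              */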
5240         cmd = (codec->addr << 28) | (AC_NODE_ROOT << 20) |
5241               (AC_VERB_PARAMETERS << 8) | AC_PAR_VENDOR_ID;
5242 
5243         /*
5244          * This internally increments the runtime PM refcount, wakes up
5245          * the audio codec controller and sends the HW command to read the
5246          * vendor ID. Once the vendor ID is returned, it decrements the
5247          * runtime PM refcount again, and the codec controller is runtime
5248          * suspended (if the refcount is zero) once the autosuspend timer
5249          * expires.
5250          */
5251         codec->core.exec_verb(&codec->core, cmd, 0, &res);
5252     }
5253 #else
5254     {
5255         int codec_addr;
5256 
5257         /*
5258          * The filp_open() call below depends on the current task's fs_struct
5259          * (current->fs), which may already be NULL if this is called during
5260          * process teardown.
5261          */
5262         if (current->fs == NULL)
5263             return;
5264 
5265         /* If device is runtime active, then return */
5266         if (audio_pci_dev->dev.power.runtime_status == RPM_ACTIVE)
5267             return;
5268 
5269         for (codec_addr = 0; codec_addr < NV_HDA_MAX_CODECS; codec_addr++)
5270         {
5271             char filename[48];
5272             NvU8 buf;
5273             int ret;
5274 
5275             ret = snprintf(filename, sizeof(filename),
5276                            "/proc/asound/card%d/codec#%d",
5277                            card->number, codec_addr);
5278 
5279             if (ret > 0 && ret < sizeof(filename) &&
5280                 (os_open_and_read_file(filename, &buf, 1) == NV_OK))
5281             {
5282                     break;
5283             }
5284         }
5285     }
5286 #endif
5287 #endif
5288 }
5289 
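     /*
      * iterate_fd() callback: returns nonzero when the given file is an open
      * instance of this driver's character device that refers to the
      * nv_linux_state_t passed in via 'data'.
      */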
5290 static int nv_match_dev_state(const void *data, struct file *filp, unsigned fd)
5291 {
5292     nv_linux_state_t *nvl = NULL;
5293     dev_t rdev = 0;
5294 
5295     if (filp == NULL ||
5296         filp->private_data == NULL ||
5297         NV_FILE_INODE(filp) == NULL)
5298         return 0;
5299 
5300     rdev = (NV_FILE_INODE(filp))->i_rdev;
5301     if (MAJOR(rdev) != NV_MAJOR_DEVICE_NUMBER)
5302         return 0;
5303 
5304     nvl = NV_GET_NVL_FROM_FILEP(filp);
5305     if (nvl == NULL)
5306         return 0;
5307 
5308     return (data == nvl);
5309 }
5310 
5311 NvBool NV_API_CALL nv_match_gpu_os_info(nv_state_t *nv, void *os_info)
5312 {
5313     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
5314 
5315     return nv_match_dev_state(nvl, os_info, -1);
5316 }
5317 
5318 NvBool NV_API_CALL nv_is_gpu_accessible(nv_state_t *nv)
5319 {
5320     struct files_struct *files = current->files;
5321     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
5322 
5323     return !!iterate_fd(files, 0, nv_match_dev_state, nvl);
5324 }
5325 
5326 NvBool NV_API_CALL nv_platform_supports_s0ix(void)
5327 {
5328 #if defined(CONFIG_ACPI)
5329     return (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) != 0;
5330 #else
5331     return NV_FALSE;
5332 #endif
5333 }
5334 
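     /*
      * Checks whether suspend-to-idle is the selected system sleep state by
      * reading /sys/power/mem_sleep, which typically contains something like
      * "s2idle [deep]" or "[s2idle] deep" with the active mode in brackets.
      * The comparison below only matches when "[s2idle]" is the leading entry.
      */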
5335 NvBool NV_API_CALL nv_s2idle_pm_configured(void)
5336 {
5337     NvU8 buf[8];
5338 
5339 #if defined(NV_SEQ_READ_ITER_PRESENT)
5340     struct file *file;
5341     ssize_t num_read;
5342     struct kiocb kiocb;
5343     struct iov_iter iter;
5344     struct kvec iov = {
5345         .iov_base = &buf,
5346         .iov_len = sizeof(buf),
5347     };
5348 
5349     if (os_open_readonly_file("/sys/power/mem_sleep", (void **)&file) != NV_OK)
5350     {
5351         return NV_FALSE;
5352     }
5353 
5354     /*
5355      * init_sync_kiocb() internally uses GPL licensed __get_task_ioprio() from
5356      * v5.20-rc1.
5357      */
5358 #if defined(NV_GET_TASK_IOPRIO_PRESENT)
5359     memset(&kiocb, 0, sizeof(kiocb));
5360     kiocb.ki_filp = file;
5361     kiocb.ki_flags = iocb_flags(file);
5362     kiocb.ki_ioprio = IOPRIO_DEFAULT;
5363 #else
5364     init_sync_kiocb(&kiocb, file);
5365 #endif
5366 
5367     kiocb.ki_pos = 0;
5368     iov_iter_kvec(&iter, READ, &iov, 1, sizeof(buf));
5369 
5370     num_read = seq_read_iter(&kiocb, &iter);
5371 
5372     os_close_file((void *)file);
5373 
5374     if (num_read != sizeof(buf))
5375     {
5376         return NV_FALSE;
5377     }
5378 #else
5379     if (os_open_and_read_file("/sys/power/mem_sleep", buf,
5380                               sizeof(buf)) != NV_OK)
5381     {
5382         return NV_FALSE;
5383     }
5384 #endif
5385 
5386     return (memcmp(buf, "[s2idle]", 8) == 0);
5387 }
5388 
5389 /*
5390  * Queries the system chassis info to figure out whether the platform is
5391  * a Laptop or Notebook.
5392  * This function should be used when querying GPU form factor information is
5393  * not possible via core RM or if querying both system and GPU form factor
5394  * information is necessary.
5395  */
5396 NvBool NV_API_CALL nv_is_chassis_notebook(void)
5397 {
5398     const char *chassis_type = dmi_get_system_info(DMI_CHASSIS_TYPE);
5399 
5400     //
5401     // Return true only for Laptop & Notebook
5402     // As per SMBIOS spec Laptop = 9 and Notebook = 10
5403     //
5404     return (chassis_type && (!strcmp(chassis_type, "9") || !strcmp(chassis_type, "10")));
5405 }
5406 
5407 void NV_API_CALL nv_allow_runtime_suspend
5408 (
5409     nv_state_t *nv
5410 )
5411 {
5412 #if defined(NV_PM_RUNTIME_AVAILABLE)
5413     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
5414     struct device    *dev = nvl->dev;
5415 
5416     spin_lock_irq(&dev->power.lock);
5417 
5418     if (dev->power.runtime_auto == false)
5419     {
5420         dev->power.runtime_auto = true;
5421         atomic_add_unless(&dev->power.usage_count, -1, 0);
5422     }
5423 
5424     spin_unlock_irq(&dev->power.lock);
5425 #endif
5426 }
5427 
5428 void NV_API_CALL nv_disallow_runtime_suspend
5429 (
5430     nv_state_t *nv
5431 )
5432 {
5433 #if defined(NV_PM_RUNTIME_AVAILABLE)
5434     nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
5435     struct device    *dev = nvl->dev;
5436 
5437     spin_lock_irq(&dev->power.lock);
5438 
5439     if (dev->power.runtime_auto == true)
5440     {
5441         dev->power.runtime_auto = false;
5442         atomic_inc(&dev->power.usage_count);
5443     }
5444 
5445     spin_unlock_irq(&dev->power.lock);
5446 #endif
5447 }
5448 
5449 NvU32 NV_API_CALL nv_get_os_type(void)
5450 {
5451     return OS_TYPE_LINUX;
5452 }
5453 
5454 void NV_API_CALL nv_flush_coherent_cpu_cache_range(nv_state_t *nv, NvU64 cpu_virtual, NvU64 size)
5455 {
5456 #if NVCPU_IS_PPC64LE
5457     return nv_ibmnpu_cache_flush_range(nv, cpu_virtual, size);
5458 #elif NVCPU_IS_AARCH64
5459     NvU64 va, cbsize;
5460     NvU64 end_cpu_virtual = cpu_virtual + size;
5461 
5462     nv_printf(NV_DBG_INFO,
5463             "Flushing CPU virtual range [0x%llx, 0x%llx)\n",
5464             cpu_virtual, end_cpu_virtual);
5465 
5466     cbsize = cache_line_size();
5467     // Align address to line size
5468     cpu_virtual = NV_ALIGN_UP(cpu_virtual, cbsize);
5469 
5470     // Force eviction of any cache lines from the NUMA-onlined region.
5471     for (va = cpu_virtual; va < end_cpu_virtual; va += cbsize)
5472     {
5473         asm volatile("dc civac, %0" : : "r" (va): "memory");
5474         // Reschedule if necessary to avoid lockup warnings
5475         cond_resched();
5476     }
5477     asm volatile("dsb sy" : : : "memory");
5478 #endif
5479 }
5480 
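     /*
      * Returns the next resource in a pre-order walk of the iomem resource
      * tree: descend to a child first, otherwise advance to a sibling,
      * climbing back up through parents when a subtree is exhausted.
      */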
5481 static struct resource *nv_next_resource(struct resource *p)
5482 {
5483     if (p->child != NULL)
5484         return p->child;
5485 
5486     while ((p->sibling == NULL) && (p->parent != NULL))
5487         p = p->parent;
5488 
5489     return p->sibling;
5490 }
5491 
5492 /*
5493  * Gets the correct PCI bus memory window which can be mapped in the
5494  * real mode emulator (emu).
5495  * The function gets called during initialization of the emu, before
5496  * remapping it to the OS.
5497  */
5498 void NV_API_CALL nv_get_updated_emu_seg(
5499     NvU32 *start,
5500     NvU32 *end
5501 )
5502 {
5503     struct resource *p;
5504 
5505     if (*start >= *end)
5506         return;
5507 
5508     for (p = iomem_resource.child; (p != NULL); p = nv_next_resource(p))
5509     {
5510         /* If we passed the resource we are looking for, stop */
5511         if (p->start > *end)
5512         {
5513             p = NULL;
5514             break;
5515         }
5516 
5517         /* Skip until we find a range that matches what we are looking for */
5518         if (p->end < *start)
5519             continue;
5520 
5521         if ((p->end > *end) && (p->child))
5522             continue;
5523 
5524         if ((p->flags & IORESOURCE_MEM) != IORESOURCE_MEM)
5525             continue;
5526 
5527         /* Found a match, break */
5528         break;
5529     }
5530 
5531     if (p != NULL)
5532     {
5533         *start = max((resource_size_t)*start, p->start);
5534         *end = min((resource_size_t)*end, p->end);
5535     }
5536 }
5537 
5538