1 /*******************************************************************************
2     Copyright (c) 2015-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #ifndef __UVM_GPU_H__
25 #define __UVM_GPU_H__
26 
27 #include "nvtypes.h"
28 #include "nvmisc.h"
29 #include "uvm_types.h"
30 #include "nv_uvm_types.h"
31 #include "uvm_linux.h"
32 #include "nv-kref.h"
33 #include "uvm_common.h"
34 #include "ctrl2080mc.h"
35 #include "uvm_forward_decl.h"
36 #include "uvm_processors.h"
37 #include "uvm_pmm_gpu.h"
38 #include "uvm_pmm_sysmem.h"
39 #include "uvm_mmu.h"
40 #include "uvm_gpu_replayable_faults.h"
41 #include "uvm_gpu_isr.h"
42 #include "uvm_hal_types.h"
43 #include "uvm_hmm.h"
44 #include "uvm_va_block_types.h"
45 #include "uvm_perf_module.h"
46 #include "uvm_rb_tree.h"
47 #include "uvm_perf_prefetch.h"
48 #include "nv-kthread-q.h"
49 #include <linux/mmu_notifier.h>
50 #include "uvm_conf_computing.h"
51 
52 // Buffer length to store uvm gpu id, RM device name and gpu uuid.
53 #define UVM_GPU_NICE_NAME_BUFFER_LENGTH (sizeof("ID 999: : ") + \
54             UVM_GPU_NAME_LENGTH + UVM_GPU_UUID_TEXT_BUFFER_LENGTH)
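// For example (illustrative only), the resulting printable name has the form:
//
//     "ID 7: <RM device name>: <GPU UUID text>"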
55 
56 #define UVM_GPU_MAGIC_VALUE 0xc001d00d12341993ULL
57 
58 typedef struct
59 {
60     // Number of faults from this uTLB that have been fetched but have not been
61     // serviced yet.
62     NvU32 num_pending_faults;
63 
64     // Whether the uTLB contains fatal faults
65     bool has_fatal_faults;
66 
    // We have issued a replay of type START_ACK_ALL while the uTLB contained
    // fatal faults. This puts the uTLB in lockdown mode and no new
    // translations are accepted.
70     bool in_lockdown;
71 
72     // We have issued a cancel on this uTLB
73     bool cancelled;
74 
75     uvm_fault_buffer_entry_t prev_fatal_fault;
76 
    // Last fetched fault that originated from this uTLB. Used for fault
    // filtering.
79     uvm_fault_buffer_entry_t *last_fault;
80 } uvm_fault_utlb_info_t;
81 
82 struct uvm_service_block_context_struct
83 {
84     //
85     // Fields initialized by CPU/GPU fault handling and access counter routines
86     //
87 
88     // Whether the information refers to replayable/non-replayable faults or
89     // access counters
90     uvm_service_operation_t operation;
91 
92     // Processors that will be the residency of pages after the operation has
93     // been serviced
94     uvm_processor_mask_t resident_processors;
95 
96     // VA block region that contains all the pages affected by the operation
97     uvm_va_block_region_t region;
98 
    // Array of uvm_fault_access_type_t values containing, for each page, the
    // type of access that caused the fault/access_counter notification to be
    // serviced.
102     NvU8 access_type[PAGES_PER_UVM_VA_BLOCK];
103 
104     // Number of times the service operation has been retried
105     unsigned num_retries;
106 
107     // Pages that need to be pinned due to thrashing
108     uvm_page_mask_t thrashing_pin_mask;
109 
110     // Number of pages that need to be pinned due to thrashing. This is the same
111     // value as the result of bitmap_weight(thrashing_pin_mask)
112     unsigned thrashing_pin_count;
113 
114     // Pages that can be read-duplicated
115     uvm_page_mask_t read_duplicate_mask;
116 
    // Number of pages that can be read-duplicated. This is the same value as
    // the result of bitmap_weight(read_duplicate_mask)
119     unsigned read_duplicate_count;
120 
121     //
122     // Fields used by the CPU fault handling routine
123     //
124 
125     struct
126     {
127         // Node of the list of fault service contexts used by the CPU
128         struct list_head service_context_list;
129 
        // A mask of GPUs that need to be checked for ECC errors before the CPU
        // fault handler returns, but after the VA space lock has been unlocked
        // to avoid RM/UVM VA space lock deadlocks.
133         uvm_processor_mask_t gpus_to_check_for_ecc;
134 
135         // This is set to throttle page fault thrashing.
136         NvU64 wakeup_time_stamp;
137 
        // This is set if the page migrated between the CPU and the GPU.
139         bool did_migrate;
140 
        // Sequence number used to start an mmu notifier read-side critical
        // section.
143         unsigned long notifier_seq;
144 
145         struct vm_fault *vmf;
146     } cpu_fault;
147 
148     //
149     // Fields managed by the common operation servicing routine
150     //
151 
152     uvm_prot_page_mask_array_t mappings_by_prot;
153 
    // Mask with the pages that did not migrate to the processor (they were
    // already resident) in the last call to uvm_va_block_make_resident.
    // This is used to compute the pages whose mapping permissions need to be
    // revoked from other processors.
158     uvm_page_mask_t did_not_migrate_mask;
159 
160     // Pages whose permissions need to be revoked from other processors
161     uvm_page_mask_t revocation_mask;
162 
163     struct
164     {
165         // Per-processor mask with the pages that will be resident after
166         // servicing. We need one mask per processor because we may coalesce
167         // faults that trigger migrations to different processors.
168         uvm_page_mask_t new_residency;
169     } per_processor_masks[UVM_ID_MAX_PROCESSORS];
170 
171     // State used by the VA block routines called by the servicing routine
172     uvm_va_block_context_t *block_context;
173 
174     // Prefetch state hint
175     uvm_perf_prefetch_hint_t prefetch_hint;
176 
177     // Prefetch temporary state.
178     uvm_perf_prefetch_bitmap_tree_t prefetch_bitmap_tree;
179 };
180 
181 typedef struct
182 {
183     // Mask of read faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a SAM
184     // VMA. Used for batching ATS faults in a vma. This is unused for access
185     // counter service requests.
186     uvm_page_mask_t read_fault_mask;
187 
188     // Mask of write faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a
189     // SAM VMA. Used for batching ATS faults in a vma. This is unused for access
190     // counter service requests.
191     uvm_page_mask_t write_fault_mask;
192 
193     // Mask of successfully serviced pages in a UVM_VA_BLOCK_SIZE aligned region
194     // of a SAM VMA. Used to return ATS fault status. This is unused for access
195     // counter service requests.
196     uvm_page_mask_t faults_serviced_mask;
197 
198     // Mask of successfully serviced read faults on pages in write_fault_mask.
199     // This is unused for access counter service requests.
200     uvm_page_mask_t reads_serviced_mask;
201 
202     // Mask of all accessed pages in a UVM_VA_BLOCK_SIZE aligned region of a SAM
203     // VMA. This is used as input for access counter service requests and output
204     // of fault service requests.
205     uvm_page_mask_t accessed_mask;
206 
207     // Client type of the service requestor.
208     uvm_fault_client_type_t client_type;
209 
210     // New residency ID of the faulting region.
211     uvm_processor_id_t residency_id;
212 
213     // New residency NUMA node ID of the faulting region.
214     int residency_node;
215 
216     struct
217     {
        // True if preferred_location was set on this faulting region.
        // The UVM_VA_BLOCK_SIZE sized region in the faulting region bound by
        // the VMA is prefetched if preferred_location was set and if
        // first_touch is true.
222         bool has_preferred_location;
223 
224         // True if the UVM_VA_BLOCK_SIZE sized region isn't resident on any
225         // node. False if any page in the region is resident somewhere.
226         bool first_touch;
227 
228         // Mask of prefetched pages in a UVM_VA_BLOCK_SIZE aligned region of a
229         // SAM VMA.
230         uvm_page_mask_t prefetch_pages_mask;
231 
232         // PFN info of the faulting region
233         unsigned long pfns[PAGES_PER_UVM_VA_BLOCK];
234 
235         // Faulting/preferred processor residency mask of the faulting region.
236         uvm_page_mask_t residency_mask;
237 
238 #if defined(NV_MMU_INTERVAL_NOTIFIER)
239         // MMU notifier used to compute residency of this faulting region.
240         struct mmu_interval_notifier notifier;
241 #endif
242 
243         uvm_va_space_t *va_space;
244 
245         // Prefetch temporary state.
246         uvm_perf_prefetch_bitmap_tree_t bitmap_tree;
247     } prefetch_state;
248 
249 } uvm_ats_fault_context_t;
250 
251 struct uvm_fault_service_batch_context_struct
252 {
253     // Array of elements fetched from the GPU fault buffer. The number of
254     // elements in this array is exactly max_batch_size
255     uvm_fault_buffer_entry_t *fault_cache;
256 
257     // Array of pointers to elements in fault cache used for fault
258     // preprocessing. The number of elements in this array is exactly
259     // max_batch_size
260     uvm_fault_buffer_entry_t **ordered_fault_cache;
261 
262     // Per uTLB fault information. Used for replay policies and fault
263     // cancellation on Pascal
264     uvm_fault_utlb_info_t *utlbs;
265 
266     // Largest uTLB id seen in a GPU fault
267     NvU32 max_utlb_id;
268 
269     NvU32 num_cached_faults;
270 
271     NvU32 num_coalesced_faults;
272 
273     // One of the VA spaces in this batch which had fatal faults. If NULL, no
274     // faults were fatal. More than one VA space could have fatal faults, but we
275     // pick one to be the target of the cancel sequence.
276     uvm_va_space_t *fatal_va_space;
277 
278     bool has_throttled_faults;
279 
280     NvU32 num_invalid_prefetch_faults;
281 
282     NvU32 num_duplicate_faults;
283 
284     NvU32 num_replays;
285 
286     uvm_ats_fault_context_t ats_context;
287 
288     // Unique id (per-GPU) generated for tools events recording
289     NvU32 batch_id;
290 
291     uvm_tracker_t tracker;
292 
293     // Boolean used to avoid sorting the fault batch by instance_ptr if we
294     // determine at fetch time that all the faults in the batch report the same
295     // instance_ptr
296     bool is_single_instance_ptr;
297 
298     // Last fetched fault. Used for fault filtering.
299     uvm_fault_buffer_entry_t *last_fault;
300 };
301 
302 struct uvm_ats_fault_invalidate_struct
303 {
304     bool            tlb_batch_pending;
305     uvm_tlb_batch_t tlb_batch;
306 };
307 
308 typedef struct
309 {
310     // Fault buffer information and structures provided by RM
311     UvmGpuFaultInfo rm_info;
312 
313     // Maximum number of faults to be processed in batch before fetching new
314     // entries from the GPU buffer
315     NvU32 max_batch_size;
316 
317     struct uvm_replayable_fault_buffer_info_struct
318     {
        // Maximum number of fault entries that can be stored in the buffer
320         NvU32 max_faults;
321 
322         // Cached value of the GPU GET register to minimize the round-trips
323         // over PCIe
324         NvU32 cached_get;
325 
326         // Cached value of the GPU PUT register to minimize the round-trips over
327         // PCIe
328         NvU32 cached_put;
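
        // Illustrative sketch (not the actual fetch loop) of how the cached
        // values are used: entries are consumed using the local copies, and
        // the HW GET register is written once per batch rather than once per
        // entry. fetch_entry() and write_get_register() are hypothetical
        // helpers.
        //
        //     while (cached_get != cached_put && fetched < max_batch_size) {
        //         fetch_entry(cached_get);
        //         cached_get = (cached_get + 1) % max_faults;
        //         fetched++;
        //     }
        //     write_get_register(cached_get);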
329 
330         // Policy that determines when GPU replays are issued during normal
331         // fault servicing
332         uvm_perf_fault_replay_policy_t replay_policy;
333 
334         // Tracker used to aggregate replay operations, needed for fault cancel
335         // and GPU removal
336         uvm_tracker_t replay_tracker;
337 
        // If the ratio of duplicate faults in a batch is larger than
        // replay_update_put_ratio, the PUT pointer is updated before the
        // buffer flush that precedes the replay method.
341         NvU32 replay_update_put_ratio;
342 
        // Fault statistics. These fields are per-GPU and most of them are only
        // updated during fault servicing, so they can be incremented safely
        // without atomics. Migration counters may be updated by different GPUs
        // and need to be incremented using atomics.
347         struct
348         {
349             NvU64 num_prefetch_faults;
350 
351             NvU64 num_read_faults;
352 
353             NvU64 num_write_faults;
354 
355             NvU64 num_atomic_faults;
356 
357             NvU64 num_duplicate_faults;
358 
359             atomic64_t num_pages_out;
360 
361             atomic64_t num_pages_in;
362 
363             NvU64 num_replays;
364 
365             NvU64 num_replays_ack_all;
366         } stats;
367 
368         // Number of uTLBs in the chip
369         NvU32 utlb_count;
370 
371         // Context structure used to service a GPU fault batch
372         uvm_fault_service_batch_context_t batch_service_context;
373 
374         // Structure used to coalesce fault servicing in a VA block
375         uvm_service_block_context_t block_service_context;
376 
377         // Information required to invalidate stale ATS PTEs from the GPU TLBs
378         uvm_ats_fault_invalidate_t ats_invalidate;
379     } replayable;
380 
381     struct uvm_non_replayable_fault_buffer_info_struct
382     {
        // Maximum number of fault entries that can be stored in the buffer
384         NvU32 max_faults;
385 
386         // Tracker used to aggregate clear faulted operations, needed for GPU
387         // removal
388         uvm_tracker_t clear_faulted_tracker;
389 
390         // Buffer used to store elements popped out from the queue shared with
391         // RM for fault servicing.
392         void *shadow_buffer_copy;
393 
394         // Array of elements fetched from the GPU fault buffer. The number of
395         // elements in this array is exactly max_batch_size
396         uvm_fault_buffer_entry_t *fault_cache;
397 
398         // Fault statistics. See replayable fault stats for more details.
399         struct
400         {
401             NvU64 num_read_faults;
402 
403             NvU64 num_write_faults;
404 
405             NvU64 num_atomic_faults;
406 
407             NvU64 num_physical_faults;
408 
409             atomic64_t num_pages_out;
410 
411             atomic64_t num_pages_in;
412         } stats;
413 
414         // Tracker which temporarily holds the work pushed to service faults
415         uvm_tracker_t fault_service_tracker;
416 
417         // Structure used to coalesce fault servicing in a VA block
418         uvm_service_block_context_t block_service_context;
419 
420         // Unique id (per-GPU) generated for tools events recording
421         NvU32 batch_id;
422 
423         // Information required to service ATS faults.
424         uvm_ats_fault_context_t ats_context;
425 
426         // Information required to invalidate stale ATS PTEs from the GPU TLBs
427         uvm_ats_fault_invalidate_t ats_invalidate;
428     } non_replayable;
429 
    // Flag that indicates whether prefetch faults are enabled in HW
431     bool prefetch_faults_enabled;
432 
    // Timestamp when prefetch faults were last disabled
434     NvU64 disable_prefetch_faults_timestamp;
435 } uvm_fault_buffer_info_t;
436 
437 struct uvm_access_counter_service_batch_context_struct
438 {
439     uvm_access_counter_buffer_entry_t *notification_cache;
440 
441     NvU32 num_cached_notifications;
442 
443     struct
444     {
445         uvm_access_counter_buffer_entry_t   **notifications;
446 
447         NvU32                             num_notifications;
448 
        // Boolean used to avoid sorting the notification batch by instance_ptr
        // if we determine at fetch time that all the access counter
        // notifications in the batch report the same instance_ptr
452         bool is_single_instance_ptr;
453     } virt;
454 
455     struct
456     {
457         uvm_access_counter_buffer_entry_t    **notifications;
458         uvm_reverse_map_t                      *translations;
459 
460         NvU32                              num_notifications;
461 
        // Boolean used to avoid sorting the notification batch by aperture if
        // we determine at fetch time that all the access counter notifications
        // in the batch report the same aperture
465         bool                              is_single_aperture;
466     } phys;
467 
468     // Helper page mask to compute the accessed pages within a VA block
469     uvm_page_mask_t accessed_pages;
470 
471     // Structure used to coalesce access counter servicing in a VA block
472     uvm_service_block_context_t block_service_context;
473 
474     // Structure used to service access counter migrations in an ATS block.
475     uvm_ats_fault_context_t ats_context;
476 
477     // Unique id (per-GPU) generated for tools events recording
478     NvU32 batch_id;
479 };
480 
481 typedef struct
482 {
483     // Values used to configure access counters in RM
484     struct
485     {
486         UVM_ACCESS_COUNTER_GRANULARITY  granularity;
487         UVM_ACCESS_COUNTER_USE_LIMIT    use_limit;
488     } rm;
489 
490     // The following values are precomputed by the access counter notification
491     // handling code. See comments for UVM_MAX_TRANSLATION_SIZE in
492     // uvm_gpu_access_counters.c for more details.
493     NvU64 translation_size;
494 
495     NvU64 translations_per_counter;
496 
497     NvU64 sub_granularity_region_size;
498 
499     NvU64 sub_granularity_regions_per_translation;
500 } uvm_gpu_access_counter_type_config_t;
501 
502 typedef struct
503 {
504     UvmGpuAccessCntrInfo rm_info;
505 
506     NvU32 max_notifications;
507 
508     NvU32 max_batch_size;
509 
510     // Cached value of the GPU GET register to minimize the round-trips
511     // over PCIe
512     NvU32 cached_get;
513 
514     // Cached value of the GPU PUT register to minimize the round-trips over
515     // PCIe
516     NvU32 cached_put;
517 
518     // Tracker used to aggregate access counters clear operations, needed for
519     // GPU removal
520     uvm_tracker_t clear_tracker;
521 
522     // Current access counter configuration. During normal operation this
523     // information is computed once during GPU initialization. However, tests
524     // may override it to try different configuration values.
525     struct
526     {
527         uvm_gpu_access_counter_type_config_t mimc;
528         uvm_gpu_access_counter_type_config_t momc;
529 
530         NvU32                                threshold;
531     } current_config;
532 
533     // Access counter statistics
534     struct
535     {
536         atomic64_t num_pages_out;
537 
538         atomic64_t num_pages_in;
539     } stats;
540 
541     // Ignoring access counters means that notifications are left in the HW
542     // buffer without being serviced.  Requests to ignore access counters
543     // are counted since the suspend path inhibits access counter interrupts,
544     // and the resume path needs to know whether to reenable them.
545     NvU32 notifications_ignored_count;
546 
547     // Context structure used to service a GPU access counter batch
548     uvm_access_counter_service_batch_context_t batch_service_context;
549 
550     // VA space that reconfigured the access counters configuration, if any.
551     // Used in builtin tests only, to avoid reconfigurations from different
552     // processes
553     //
554     // Locking: both readers and writers must hold the access counters ISR lock
555     uvm_va_space_t *reconfiguration_owner;
556 } uvm_access_counter_buffer_info_t;
557 
558 typedef struct
559 {
560     // VA where the identity mapping should be mapped in the internal VA
561     // space managed by uvm_gpu_t.address_space_tree (see below).
562     NvU64 base;
563 
564     // Page tables with the mapping.
565     uvm_page_table_range_vec_t *range_vec;
566 
567     // Used during init to indicate whether the mapping has been fully
568     // initialized.
569     bool ready;
570 } uvm_gpu_identity_mapping_t;
571 
572 // Root chunk mapping
573 typedef struct
574 {
575     // Page table range representation of the mapping. Because a root chunk
576     // fits into a single 2MB page, in practice the range consists of a single
577     // 2MB PTE.
578     uvm_page_table_range_t *range;
579 
580     // Number of mapped pages of size PAGE_SIZE.
581     NvU32 num_mapped_pages;
582 } uvm_gpu_root_chunk_mapping_t;
583 
584 typedef enum
585 {
586     UVM_GPU_LINK_INVALID = 0,
587     UVM_GPU_LINK_PCIE,
588     UVM_GPU_LINK_NVLINK_1,
589     UVM_GPU_LINK_NVLINK_2,
590     UVM_GPU_LINK_NVLINK_3,
591     UVM_GPU_LINK_NVLINK_4,
592     UVM_GPU_LINK_C2C,
593     UVM_GPU_LINK_MAX
594 } uvm_gpu_link_type_t;
595 
596 // UVM does not support P2P copies on pre-Pascal GPUs. Pascal+ GPUs only
597 // support virtual addresses in P2P copies. Therefore, a peer identity mapping
598 // needs to be created.
// Ampere+ GPUs support physical peer copies, too, so identity mappings are not
// needed.
601 typedef enum
602 {
603     UVM_GPU_PEER_COPY_MODE_UNSUPPORTED,
604     UVM_GPU_PEER_COPY_MODE_VIRTUAL,
605     UVM_GPU_PEER_COPY_MODE_PHYSICAL,
606     UVM_GPU_PEER_COPY_MODE_COUNT
607 } uvm_gpu_peer_copy_mode_t;
608 
609 // In order to support SMC/MIG GPU partitions, we split UVM GPUs into two
610 // parts: parent GPUs (uvm_parent_gpu_t) which represent unique PCIe devices
611 // (including VFs), and sub/child GPUs (uvm_gpu_t) which represent individual
612 // partitions within the parent. The parent GPU and partition GPU have
613 // different "id" and "uuid".
614 struct uvm_gpu_struct
615 {
616     uvm_parent_gpu_t *parent;
617 
618     // The gpu's GI uuid if SMC is enabled; otherwise, a copy of parent->uuid.
619     NvProcessorUuid uuid;
620 
621     // Nice printable name in the format:
622     // ID: 999: GPU-<parent_uuid> UVM-GI-<gi_uuid>.
623     // UVM_GPU_UUID_TEXT_BUFFER_LENGTH includes the null character.
624     char name[9 + 2 * UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
625 
626     // Refcount of the gpu, i.e. how many times it has been retained. This is
627     // roughly a count of how many times it has been registered with a VA space,
628     // except that some paths retain the GPU temporarily without a VA space.
629     //
630     // While this is >0, the GPU can't be removed. This differs from gpu_kref,
631     // which merely prevents the uvm_gpu_t object from being freed.
632     //
633     // In most cases this count is protected by the global lock: retaining a GPU
634     // from a UUID and any release require the global lock to be taken. But it's
635     // also useful for a caller to retain a GPU they've already retained, in
636     // which case there's no need to take the global lock. This can happen when
637     // an operation needs to drop the VA space lock but continue operating on a
638     // GPU. This is an atomic variable to handle those cases.
639     //
640     // Security note: keep it as a 64-bit counter to prevent overflow cases (a
641     // user can create a lot of va spaces and register the gpu with them).
642     atomic64_t retained_count;
643 
644     // A unique uvm gpu id in range [1, UVM_ID_MAX_PROCESSORS).
645     uvm_gpu_id_t id;
646 
647     // Should be UVM_GPU_MAGIC_VALUE. Used for memory checking.
648     NvU64 magic;
649 
650     struct
651     {
652         // The amount of memory the GPU has in total, in bytes. If the GPU is in
653         // ZeroFB testing mode, this will be 0.
654         NvU64 size;
655 
656         // Max (inclusive) physical address of this GPU's memory that the driver
657         // can allocate through PMM (PMA).
658         NvU64 max_allocatable_address;
659 
660         // Max supported vidmem page size may be smaller than the max GMMU page
661         // size, because of the vMMU supported page sizes.
662         NvU64 max_vidmem_page_size;
663 
664         struct
665         {
666             // True if the platform supports HW coherence and the GPU's memory
667             // is exposed as a NUMA node to the kernel.
668             bool enabled;
669             unsigned int node_id;
670         } numa;
671     } mem_info;
672 
673     struct
674     {
675         // Big page size used by the internal UVM VA space
        // Notably it may be different from the big page size used by a user's
677         // VA space in general.
678         NvU32 internal_size;
679     } big_page;
680 
681     // Mapped registers needed to obtain the current GPU timestamp
682     struct
683     {
684         volatile NvU32 *time0_register;
685         volatile NvU32 *time1_register;
686     } time;
687 
688     // Identity peer mappings are only defined when
689     // peer_copy_mode == UVM_GPU_PEER_COPY_MODE_VIRTUAL
690     uvm_gpu_identity_mapping_t peer_mappings[UVM_ID_MAX_GPUS];
691 
692     struct
693     {
694         // Mask of peer_gpus set
695         //
696         // We can use a regular processor id because P2P is not allowed between
697         // partitioned GPUs when SMC is enabled
698         uvm_processor_mask_t peer_gpu_mask;
699 
        // Lazily-populated array of peer GPUs, indexed by the peer's GPU index
701         uvm_gpu_t *peer_gpus[UVM_ID_MAX_GPUS];
702 
703         // Leaf spinlock used to synchronize access to the peer_gpus table so
704         // that it can be safely accessed from the access counters bottom half
705         uvm_spinlock_t peer_gpus_lock;
706     } peer_info;
707 
708     // Maximum number of subcontexts supported
709     NvU32 max_subcontexts;
710 
711     // RM address space handle used in many of the UVM/RM APIs
712     // Represents a GPU VA space within rm_device.
713     //
714     // In SR-IOV heavy, proxy channels are not associated with this address
715     // space.
716     uvmGpuAddressSpaceHandle rm_address_space;
717 
718     // Page tree used for the internal UVM VA space shared with RM
719     uvm_page_tree_t address_space_tree;
720 
721     // Set to true during add_gpu() as soon as the RM's address space is moved
722     // to the address_space_tree.
723     bool rm_address_space_moved_to_page_tree;
724 
725     uvm_gpu_semaphore_pool_t *semaphore_pool;
726 
727     uvm_gpu_semaphore_pool_t *secure_semaphore_pool;
728 
729     uvm_channel_manager_t *channel_manager;
730 
731     uvm_pmm_gpu_t pmm;
732 
733     // Flat linear mapping covering vidmem. This is a kernel mapping that is
734     // only created in certain configurations.
735     //
736     // There are two mutually exclusive versions of the mapping. The simplest
737     // version covers the entire GPU memory, and it is created during GPU
738     // initialization. The dynamic version is a partial vidmem mapping that
739     // creates and destroys mappings to GPU root chunks on demand.
740     union
741     {
742         // Static mapping covering the whole GPU memory.
743         uvm_gpu_identity_mapping_t static_flat_mapping;
744 
745         // Dynamic mapping of GPU memory.
746         struct
747         {
748             // Array of root chunk mappings.
749             uvm_gpu_root_chunk_mapping_t *array;
750 
751             // Number of elements in the array.
752             size_t count;
753 
754             // Each bit in the bitlock protects a single root chunk mapping.
755             uvm_bit_locks_t bitlocks;
756 
757         } root_chunk_mappings;
758     };
759 
760     // Linear sysmem mappings. Mappings are added on demand, and removed upon
761     // GPU deinitialization. The mappings are added to UVM's internal address
762     // space i.e. they are kernel mappings.
763     //
764     // Only used in SR-IOV heavy.
765     struct
766     {
767         // Size of each mapping, in bytes.
768         NvU64 mapping_size;
769 
770         // Array of sysmem mappings.
771         uvm_gpu_identity_mapping_t *array;
772 
773         // Number of elements in the array.
774         size_t count;
775 
776         // Each bit in the bitlock protects a sysmem mapping.
777         uvm_bit_locks_t bitlocks;
778     } sysmem_mappings;
779 
780     // Reverse lookup table used to query the user mapping associated with a
781     // sysmem (DMA) physical address.
782     //
783     // The system memory mapping information referred to by this field is
784     // different from that of sysmem_mappings, because it relates to user
785     // mappings (instead of kernel), and it is used in most configurations.
786     uvm_pmm_sysmem_mappings_t pmm_reverse_sysmem_mappings;
787 
788     struct
789     {
790         uvm_conf_computing_dma_buffer_pool_t dma_buffer_pool;
791 
        // Dummy memory used to store the IV contents during CE encryption.
        // This memory location only becomes available after CE channels are
        // created, because we use them to write PTEs for allocations such as
        // this one. This location is used when physical addressing for the IV
        // buffer is required. See uvm_hal_hopper_ce_encrypt().
797         uvm_mem_t *iv_mem;
798 
        // Dummy memory used to store the IV contents during CE encryption.
        // Because of the limitations of `iv_mem', and the need to have such a
        // buffer at channel initialization, we use an RM allocation.
        // This location is used when virtual addressing for the IV buffer is
        // required. See uvm_hal_hopper_ce_encrypt().
804         uvm_rm_mem_t *iv_rm_mem;
805     } conf_computing;
806 
807     // ECC handling
808     // In order to trap ECC errors as soon as possible the driver has the hw
809     // interrupt register mapped directly. If an ECC interrupt is ever noticed
810     // to be pending, then the UVM driver needs to:
811     //
812     //   1) ask RM to service interrupts, and then
813     //   2) inspect the ECC error notifier state.
814     //
815     // Notably, checking for channel errors is not enough, because ECC errors
816     // can be pending, even after a channel has become idle.
817     //
818     // See more details in uvm_gpu_check_ecc_error().
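    //
    // Illustrative sketch of that sequence (see uvm_gpu_check_ecc_error() for
    // the real implementation; service_rm_interrupts() is a hypothetical
    // helper):
    //
    //     if (*gpu->ecc.hw_interrupt_tree_location & gpu->ecc.mask) {
    //         status = service_rm_interrupts(gpu);
    //         if (status != NV_OK)
    //             return status;
    //     }
    //     if (*gpu->ecc.error_notifier)
    //         return NV_ERR_ECC_ERROR;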
819     struct
820     {
821         // Does the GPU have ECC enabled?
822         bool enabled;
823 
824         // Direct mapping of the 32-bit part of the hw interrupt tree that has
825         // the ECC bits.
826         volatile NvU32 *hw_interrupt_tree_location;
827 
        // Mask to get the ECC interrupt bits from the 32 bits above.
829         NvU32 mask;
830 
831         // Set to true by RM when a fatal ECC error is encountered (requires
832         // asking RM to service pending interrupts to be current).
833         NvBool *error_notifier;
834     } ecc;
835 
836     struct
837     {
838         NvU32 swizz_id;
839 
840         // RM device handle used in many of the UVM/RM APIs.
841         //
842         // Do not read this field directly, use uvm_gpu_device_handle instead.
843         uvmGpuDeviceHandle rm_device;
844     } smc;
845 
846     struct
847     {
848         struct proc_dir_entry *dir;
849 
850         struct proc_dir_entry *dir_symlink;
851 
852         // The GPU instance UUID symlink if SMC is enabled.
853         struct proc_dir_entry *gpu_instance_uuid_symlink;
854 
855         struct proc_dir_entry *info_file;
856 
857         struct proc_dir_entry *dir_peers;
858     } procfs;
859 
860     // Placeholder for per-GPU performance heuristics information
861     uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];
862 
863     // Force pushbuffer's GPU VA to be >= 1TB; used only for testing purposes.
864     bool uvm_test_force_upper_pushbuffer_segment;
865 };
866 
867 // In order to support SMC/MIG GPU partitions, we split UVM GPUs into two
868 // parts: parent GPUs (uvm_parent_gpu_t) which represent unique PCIe devices
869 // (including VFs), and sub/child GPUs (uvm_gpu_t) which represent individual
870 // partitions within the parent. The parent GPU and partition GPU have
871 // different "id" and "uuid".
872 struct uvm_parent_gpu_struct
873 {
874     // Reference count for how many places are holding on to a parent GPU
875     // (internal to the UVM driver).  This includes any GPUs we know about, not
876     // just GPUs that are registered with a VA space.  Most GPUs end up being
877     // registered, but there are brief periods when they are not registered,
878     // such as during interrupt handling, and in add_gpu() or remove_gpu().
879     nv_kref_t gpu_kref;
880 
881     // The number of uvm_gpu_ts referencing this uvm_parent_gpu_t.
882     NvU32 num_retained_gpus;
883 
884     uvm_gpu_t *gpus[UVM_PARENT_ID_MAX_SUB_PROCESSORS];
885 
886     // Bitmap of valid child entries in the gpus[] table.  Used to retrieve a
887     // usable child GPU in bottom-halves.
888     DECLARE_BITMAP(valid_gpus, UVM_PARENT_ID_MAX_SUB_PROCESSORS);
889 
890     // The gpu's uuid
891     NvProcessorUuid uuid;
892 
893     // Nice printable name including the uvm gpu id, ascii name from RM and uuid
894     char name[UVM_GPU_NICE_NAME_BUFFER_LENGTH];
895 
    // GPU information provided by RM (architecture, implementation, hardware
    // classes, etc.).
898     UvmGpuInfo rm_info;
899 
900     // A unique uvm gpu id in range [1, UVM_PARENT_ID_MAX_PROCESSORS)
901     uvm_parent_gpu_id_t id;
902 
903     // Reference to the Linux PCI device
904     //
905     // The reference to the PCI device remains valid as long as the GPU is
906     // registered with RM's Linux layer (between nvUvmInterfaceRegisterGpu() and
907     // nvUvmInterfaceUnregisterGpu()).
908     struct pci_dev *pci_dev;
909 
910     // NVLINK Processing Unit (NPU) on PowerPC platforms. The NPU is a
911     // collection of CPU-side PCI devices which bridge GPU NVLINKs and the CPU
912     // memory bus.
913     //
914     // There is one PCI device per NVLINK. A set of NVLINKs connects to a single
915     // GPU, and all NVLINKs for a given socket are collected logically under
916     // this UVM NPU because some resources (such as register mappings) are
917     // shared by all those NVLINKs. This means multiple GPUs may connect to the
918     // same UVM NPU.
919     uvm_ibm_npu_t *npu;
920 
921     // On kernels with NUMA support, this entry contains the closest CPU NUMA
922     // node to this GPU. Otherwise, the value will be -1.
923     int closest_cpu_numa_node;
924 
925     // RM device handle used in many of the UVM/RM APIs.
926     //
927     // Do not read this field directly, use uvm_gpu_device_handle instead.
928     uvmGpuDeviceHandle rm_device;
929 
930     // The physical address range addressable by the GPU
931     //
932     // The GPU has its NV_PFB_XV_UPPER_ADDR register set by RM to
933     // dma_addressable_start (in bifSetupDmaWindow_IMPL()) and hence when
934     // referencing sysmem from the GPU, dma_addressable_start should be
935     // subtracted from the physical address. The DMA mapping helpers like
936     // uvm_parent_gpu_map_cpu_pages() and uvm_parent_gpu_dma_alloc_page() take
937     // care of that.
938     NvU64 dma_addressable_start;
939     NvU64 dma_addressable_limit;
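
    // For example (illustrative): a sysmem page with DMA address dma_addr
    // would be referenced by the GPU at
    //
    //     gpu_phys_addr = dma_addr - parent_gpu->dma_addressable_start;
    //
    // assuming dma_addr lies in [dma_addressable_start, dma_addressable_limit].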
940 
941     // Total size (in bytes) of physically mapped (with
942     // uvm_parent_gpu_map_cpu_pages) sysmem pages, used for leak detection.
943     atomic64_t mapped_cpu_pages_size;
944 
945     // Hardware Abstraction Layer
946     uvm_host_hal_t *host_hal;
947     uvm_ce_hal_t *ce_hal;
948     uvm_arch_hal_t *arch_hal;
949     uvm_fault_buffer_hal_t *fault_buffer_hal;
950     uvm_access_counter_buffer_hal_t *access_counter_buffer_hal;
951     uvm_sec2_hal_t *sec2_hal;
952 
953     // Whether CE supports physical addressing mode for writes to vidmem
954     bool ce_phys_vidmem_write_supported;
955 
956     uvm_gpu_peer_copy_mode_t peer_copy_mode;
957 
958     // Virtualization mode of the GPU.
959     UVM_VIRT_MODE virt_mode;
960 
961     // Pascal+ GPUs can trigger faults on prefetch instructions. If false, this
962     // feature must be disabled at all times in GPUs of the given architecture.
963     // If true, the feature can be toggled at will by SW.
964     //
965     // The field should not be used unless the GPU supports replayable faults.
966     bool prefetch_fault_supported;
967 
968     // Number of membars required to flush out HSHUB following a TLB invalidate
969     NvU32 num_hshub_tlb_invalidate_membars;
970 
971     // Whether the channels can configure GPFIFO in vidmem
972     bool gpfifo_in_vidmem_supported;
973 
974     bool replayable_faults_supported;
975 
976     bool non_replayable_faults_supported;
977 
978     bool access_counters_supported;
979 
980     // If this is true, physical address based access counter notifications are
981     // potentially generated. If false, only virtual address based notifications
982     // are generated (assuming access_counters_supported is true too).
983     bool access_counters_can_use_physical_addresses;
984 
985     bool fault_cancel_va_supported;
986 
987     // True if the GPU has hardware support for scoped atomics
988     bool scoped_atomics_supported;
989 
990     // If true, a HW method can be used to clear a faulted channel.
991     // If false, then the GPU supports clearing faulted channels using registers
992     // instead of a HW method.
993     // This value is only defined for GPUs that support non-replayable faults.
994     bool has_clear_faulted_channel_method;
995 
996     // If true, a SW method can be used to clear a faulted channel.
997     // If false, the HW method or the registers (whichever is available
998     // according to has_clear_faulted_channel_method) needs to be used.
999     //
1000     // This value is only defined for GPUs that support non-replayable faults.
1001     bool has_clear_faulted_channel_sw_method;
1002 
1003     bool sparse_mappings_supported;
1004 
    // Ampere (GA100) requires map->invalidate->remap->invalidate for page size
1006     // promotion
1007     bool map_remap_larger_page_promotion;
1008 
1009     bool plc_supported;
1010 
1011     // If true, page_tree initialization pre-populates no_ats_ranges. It only
1012     // affects ATS systems.
1013     bool no_ats_range_required;
1014 
1015     // Parameters used by the TLB batching API
1016     struct
1017     {
1018         // Is the targeted (single page) VA invalidate supported at all?
1019         NvBool va_invalidate_supported;
1020 
1021         // Is the VA range invalidate supported?
1022         NvBool va_range_invalidate_supported;
1023 
1024         union
1025         {
1026             // Maximum (inclusive) number of single page invalidations before
1027             // falling back to invalidate all
1028             NvU32 max_pages;
1029 
1030             // Maximum (inclusive) number of range invalidations before falling
1031             // back to invalidate all
1032             NvU32 max_ranges;
1033         };
1034     } tlb_batch;
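
    // Illustrative sketch (assumed logic, not the actual batching code) of the
    // fallback the limits above describe; the helpers are hypothetical:
    //
    //     if (batch_page_count > parent_gpu->tlb_batch.max_pages)
    //         issue_invalidate_all();
    //     else
    //         issue_per_page_invalidates();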
1035 
1036     // Largest VA (exclusive) which can be used for channel buffer mappings
1037     NvU64 max_channel_va;
1038 
    // Largest VA (exclusive) on which Host can operate.
1040     NvU64 max_host_va;
1041 
1042     // Indicates whether the GPU can map sysmem with pages larger than 4k
1043     bool can_map_sysmem_with_large_pages;
1044 
1045     // VA base and size of the RM managed part of the internal UVM VA space.
1046     //
    // The internal UVM VA is shared with RM by RM controlling some of the top
    // level PDEs and leaving the rest for UVM to control.
    // On Pascal a single top level PDE covers 128 TB of VA, and given that
    // semaphores and other allocations limited to 40 bits are currently
    // allocated through RM, RM needs to control the [0, 128TB) VA range at
    // least for now.
    // On Maxwell, RM's VA is limited to [0, 128GB), which should easily fit
    // all RM allocations and leave enough space for UVM.
1054     NvU64 rm_va_base;
1055     NvU64 rm_va_size;
1056 
1057     // Base and size of the GPU VA used for uvm_mem_t allocations mapped in the
1058     // internal address_space_tree.
1059     NvU64 uvm_mem_va_base;
1060     NvU64 uvm_mem_va_size;
1061 
1062     // Base of the GPU VAs used for the vidmem and sysmem flat mappings.
1063     NvU64 flat_vidmem_va_base;
1064     NvU64 flat_sysmem_va_base;
1065 
1066     // Bitmap of allocation sizes for user memory supported by a GPU. PAGE_SIZE
1067     // is guaranteed to be both present and the smallest size.
1068     uvm_chunk_sizes_mask_t mmu_user_chunk_sizes;
1069 
1070     // Bitmap of allocation sizes that could be requested by the page tree for
1071     // a GPU
1072     uvm_chunk_sizes_mask_t mmu_kernel_chunk_sizes;
1073 
1074     struct
1075     {
1076         struct proc_dir_entry *dir;
1077 
1078         struct proc_dir_entry *fault_stats_file;
1079 
1080         struct proc_dir_entry *access_counters_file;
1081     } procfs;
1082 
1083     // Interrupt handling state and locks
1084     uvm_isr_info_t isr;
1085 
    // Fault buffer info. This is only valid if replayable_faults_supported is
    // set to true.
1088     uvm_fault_buffer_info_t fault_buffer_info;
1089 
1090     // PMM lazy free processing queue.
1091     // TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
1092     nv_kthread_q_t lazy_free_q;
1093 
    // Access counter buffer info. This is only valid if
    // access_counters_supported is set to true.
1096     uvm_access_counter_buffer_info_t access_counter_buffer_info;
1097 
1098     // Number of uTLBs per GPC. This information is only valid on Pascal+ GPUs.
1099     NvU32 utlb_per_gpc_count;
1100 
1101     // In order to service GPU faults, UVM must be able to obtain the VA
1102     // space for each reported fault. The fault packet contains the
1103     // instance_ptr of the channel that was bound when the SMs triggered
    // the fault. On a fault, any instance pointer in the TSG may be
    // reported. This is a problem on Volta, which allows different channels
1106     // in the TSG to be bound to different VA spaces in order to support
1107     // subcontexts. In order to be able to obtain the correct VA space, HW
1108     // provides the subcontext id (or VEID) in addition to the instance_ptr.
1109     //
1110     // Summary:
1111     //
1112     // 1) Channels in a TSG may be in different VA spaces, identified by their
1113     // subcontext ID.
1114     // 2) Different subcontext IDs may map to the same or different VA spaces.
1115     // 3) On fault, any instance pointer in the TSG may be reported. The
1116     // reported subcontext ID identifies which VA space within the TSG actually
1117     // encountered the fault.
1118     //
1119     // Thus, UVM needs to keep track of all the instance pointers that belong
1120     // to the same TSG. We use two tables:
1121     //
1122     // - instance_ptr_table (instance_ptr -> subctx_info) this table maps
1123     // instance pointers to the subcontext info descriptor for the channel. If
1124     // the channel belongs to a subcontext, this descriptor will contain all
1125     // the VA spaces for the subcontexts in the same TSG. If the channel does
1126     // not belong to a subcontext, it will only contain a pointer to its VA
1127     // space.
1128     // - tsg_table (tsg_id -> subctx_info): this table also stores the
1129     // subctx information, but in this case it is indexed by TSG ID. Thus,
1130     // when a new channel bound to a subcontext is registered, it will check
1131     // first in this table if the subcontext information descriptor for its TSG
1132     // already exists, otherwise it will create it. Channels not bound to
1133     // subcontexts will not use this table.
1134     //
1135     // The bottom half reads the tables under
1136     // isr.replayable_faults_handler.lock, but a separate lock is necessary
1137     // because entries are added and removed from the table under the va_space
1138     // lock, and we can't take isr.replayable_faults_handler.lock while holding
1139     // the va_space lock.
1140     uvm_rb_tree_t tsg_table;
1141 
1142     uvm_rb_tree_t instance_ptr_table;
1143     uvm_spinlock_t instance_ptr_table_lock;
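
    // Illustrative sketch (hypothetical helper and field names) of the
    // fault-time lookup described above:
    //
    //     info = instance_ptr_table_lookup(parent_gpu, fault->instance_ptr);
    //     if (info->channel_is_in_subcontext)
    //         va_space = info->subctx_va_spaces[fault->fault_source.ve_id];
    //     else
    //         va_space = info->va_space;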
1144 
1145     // This is set to true if the GPU belongs to an SLI group.
1146     bool sli_enabled;
1147 
1148     struct
1149     {
1150         bool supported;
1151 
1152         bool enabled;
1153     } smc;
1154 
    // Global statistics. These fields are per-GPU and most of them are only
    // updated during fault servicing, so they can be incremented safely
    // without atomics.
1157     struct
1158     {
1159         NvU64          num_replayable_faults;
1160 
1161         NvU64      num_non_replayable_faults;
1162 
1163         atomic64_t             num_pages_out;
1164 
1165         atomic64_t              num_pages_in;
1166     } stats;
1167 
1168     // Structure to hold nvswitch specific information. In an nvswitch
1169     // environment, rather than using the peer-id field of the PTE (which can
1170     // only address 8 gpus), all gpus are assigned a 47-bit physical address
1171     // space by the fabric manager. Any physical address access to these
    // physical address spaces is routed through the switch to the
1173     // corresponding peer.
1174     struct
1175     {
1176         bool is_nvswitch_connected;
1177 
1178         // 47-bit fabric memory physical offset that peer gpus need to access
1179         // to read a peer's memory
1180         NvU64 fabric_memory_window_start;
1181     } nvswitch_info;
1182 
1183     struct
1184     {
1185         // Note that this represents the link to system memory, not the link the
1186         // system used to discover the GPU. There are some cases such as NVLINK2
1187         // where the GPU is still on the PCIe bus, but it accesses memory over
1188         // this link rather than PCIe.
1189         uvm_gpu_link_type_t link;
1190         NvU32 link_rate_mbyte_per_s;
1191 
1192         // Range in the system physical address space where the memory of this
1193         // GPU is exposed as coherent. memory_window_end is inclusive.
1194         // memory_window_start == memory_window_end indicates that no window is
1195         // present (coherence is not supported).
1196         NvU64 memory_window_start;
1197         NvU64 memory_window_end;
1198     } system_bus;
1199 
1200     // WAR to issue ATS TLB invalidation commands ourselves.
1201     struct
1202     {
1203         uvm_mutex_t smmu_lock;
1204         struct page *smmu_cmdq;
1205         void __iomem *smmu_cmdqv_base;
1206         unsigned long smmu_prod;
1207         unsigned long smmu_cons;
1208     } smmu_war;
1209 };
1210 
1211 static const char *uvm_parent_gpu_name(uvm_parent_gpu_t *parent_gpu)
1212 {
1213     return parent_gpu->name;
1214 }
1215 
1216 static const char *uvm_gpu_name(uvm_gpu_t *gpu)
1217 {
1218     return gpu->name;
1219 }
1220 
1221 static uvmGpuDeviceHandle uvm_gpu_device_handle(uvm_gpu_t *gpu)
1222 {
1223     if (gpu->parent->smc.enabled)
1224         return gpu->smc.rm_device;
1225     return gpu->parent->rm_device;
1226 }
1227 
1228 struct uvm_gpu_peer_struct
1229 {
1230     // The fields in this global structure can only be inspected under one of
1231     // the following conditions:
1232     //
1233     // - The VA space lock is held for either read or write, both GPUs are
1234     //   registered in the VA space, and the corresponding bit in the
1235     //   va_space.enabled_peers bitmap is set.
1236     //
1237     // - The global lock is held.
1238     //
1239     // - While the global lock was held in the past, the two GPUs were detected
1240     //   to be SMC peers and were both retained.
1241     //
1242     // - While the global lock was held in the past, the two GPUs were detected
1243     //   to be NVLINK peers and were both retained.
1244     //
1245     // - While the global lock was held in the past, the two GPUs were detected
1246     //   to be PCIe peers and uvm_gpu_retain_pcie_peer_access() was called.
1247     //
1248     // - The peer_gpus_lock is held on one of the GPUs. In this case, the other
1249     //   GPU must be read from the original GPU's peer_gpus table. The fields
1250     //   will not change while the lock is held, but they may no longer be valid
1251     //   because the other GPU might be in teardown.
1252 
    // Peer Id associated with this device w.r.t. a peer GPU.
1254     // Note: peerId (A -> B) != peerId (B -> A)
1255     // peer_id[0] from min(gpu_id_1, gpu_id_2) -> max(gpu_id_1, gpu_id_2)
1256     // peer_id[1] from max(gpu_id_1, gpu_id_2) -> min(gpu_id_1, gpu_id_2)
1257     NvU8 peer_ids[2];
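
    // Illustrative sketch of selecting the id for a given direction, assuming
    // peer_caps points at this struct and uvm_id_value() yields a comparable
    // GPU id:
    //
    //     if (uvm_id_value(local_gpu->id) < uvm_id_value(remote_gpu->id))
    //         peer_id = peer_caps->peer_ids[0];
    //     else
    //         peer_id = peer_caps->peer_ids[1];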
1258 
1259     // Indirect peers are GPUs which can coherently access each others' memory
1260     // over NVLINK, but are routed through the CPU using the SYS aperture rather
1261     // than a PEER aperture
1262     NvU8 is_indirect_peer : 1;
1263 
1264     // The link type between the peer GPUs, currently either PCIe or NVLINK.
    // This field is used to determine when this peer struct has been
1266     // initialized (link_type != UVM_GPU_LINK_INVALID). NVLink peers are
1267     // initialized at GPU registration time. PCIe peers are initialized when
1268     // the refcount below goes from 0 to 1.
1269     uvm_gpu_link_type_t link_type;
1270 
1271     // Maximum unidirectional bandwidth between the peers in megabytes per
1272     // second, not taking into account the protocols' overhead. The reported
1273     // bandwidth for indirect peers is zero. See UvmGpuP2PCapsParams.
1274     NvU32 total_link_line_rate_mbyte_per_s;
1275 
1276     // For PCIe, the number of times that this has been retained by a VA space.
1277     // For NVLINK this will always be 1.
1278     NvU64 ref_count;
1279 
1280     // This handle gets populated when enable_peer_access successfully creates
1281     // an NV50_P2P object. disable_peer_access resets the same on the object
1282     // deletion.
1283     NvHandle p2p_handle;
1284 
1285     struct
1286     {
1287         struct proc_dir_entry *peer_file[2];
1288         struct proc_dir_entry *peer_symlink_file[2];
1289 
1290         // GPU-A <-> GPU-B link is bidirectional, pairs[x][0] is always the
1291         // local GPU, while pairs[x][1] is the remote GPU. The table shall be
1292         // filled like so: [[GPU-A, GPU-B], [GPU-B, GPU-A]].
1293         uvm_gpu_t *pairs[2][2];
1294     } procfs;
1295 };
1296 
1297 // Initialize global gpu state
1298 NV_STATUS uvm_gpu_init(void);
1299 
1300 // Deinitialize global state (called from module exit)
1301 void uvm_gpu_exit(void);
1302 
1303 NV_STATUS uvm_gpu_init_va_space(uvm_va_space_t *va_space);
1304 
1305 void uvm_gpu_exit_va_space(uvm_va_space_t *va_space);
1306 
1307 static unsigned int uvm_gpu_numa_node(uvm_gpu_t *gpu)
1308 {
1309     UVM_ASSERT(gpu->mem_info.numa.enabled);
1310     return gpu->mem_info.numa.node_id;
1311 }
1312 
1313 static uvm_gpu_phys_address_t uvm_gpu_page_to_phys_address(uvm_gpu_t *gpu, struct page *page)
1314 {
1315     unsigned long sys_addr = page_to_pfn(page) << PAGE_SHIFT;
1316     unsigned long gpu_offset = sys_addr - gpu->parent->system_bus.memory_window_start;
1317 
1318     UVM_ASSERT(page_to_nid(page) == uvm_gpu_numa_node(gpu));
1319     UVM_ASSERT(sys_addr >= gpu->parent->system_bus.memory_window_start);
1320     UVM_ASSERT(sys_addr + PAGE_SIZE - 1 <= gpu->parent->system_bus.memory_window_end);
1321 
1322     return uvm_gpu_phys_address(UVM_APERTURE_VID, gpu_offset);
1323 }
1324 
1325 // Note that there is a uvm_gpu_get() function defined in uvm_global.h to break
1326 // a circular dep between global and gpu modules.
1327 
1328 // Get a uvm_gpu_t by UUID (physical GPU UUID if SMC is not enabled, otherwise
1329 // GPU instance UUID).
1330 // This returns NULL if the GPU is not present.
1331 // This is the general purpose call that should be used normally.
1332 //
1333 // LOCKING: requires the global lock to be held
1334 uvm_gpu_t *uvm_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid);
1335 
1336 // Get a uvm_parent_gpu_t by UUID (physical GPU UUID).
1337 // Like uvm_gpu_get_by_uuid(), this function returns NULL if the GPU has not
1338 // been registered.
1339 //
1340 // LOCKING: requires the global lock to be held
1341 uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid);
1342 
1343 // Like uvm_parent_gpu_get_by_uuid(), but this variant does not assertion-check
1344 // that the caller is holding the global_lock.  This is a narrower-purpose
1345 // function, and is only intended for use by the top-half ISR, or other very
1346 // limited cases.
1347 uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid_locked(const NvProcessorUuid *gpu_uuid);
1348 
1349 // Retain a gpu by uuid
1350 // Returns the retained uvm_gpu_t in gpu_out on success
1351 //
1352 // LOCKING: Takes and releases the global lock for the caller.
1353 NV_STATUS uvm_gpu_retain_by_uuid(const NvProcessorUuid *gpu_uuid,
1354                                  const uvm_rm_user_object_t *user_rm_device,
1355                                  uvm_gpu_t **gpu_out);
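
// Illustrative usage sketch (error handling elided):
//
//     uvm_gpu_t *gpu;
//     status = uvm_gpu_retain_by_uuid(&gpu_uuid, &user_rm_device, &gpu);
//     if (status == NV_OK) {
//         // ... use the GPU ...
//         uvm_gpu_release(gpu);
//     }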
1356 
1357 // Retain a gpu which is known to already be retained. Does NOT require the
1358 // global lock to be held.
1359 void uvm_gpu_retain(uvm_gpu_t *gpu);
1360 
1361 // Release a gpu
1362 // LOCKING: requires the global lock to be held
1363 void uvm_gpu_release_locked(uvm_gpu_t *gpu);
1364 
1365 // Like uvm_gpu_release_locked, but takes and releases the global lock for the
1366 // caller.
1367 void uvm_gpu_release(uvm_gpu_t *gpu);
1368 
1369 static NvU64 uvm_gpu_retained_count(uvm_gpu_t *gpu)
1370 {
1371     return atomic64_read(&gpu->retained_count);
1372 }
1373 
1374 // Decrease the refcount on the parent GPU object, and actually delete the object
1375 // if the refcount hits zero.
1376 void uvm_parent_gpu_kref_put(uvm_parent_gpu_t *gpu);
1377 
1378 // Calculates peer table index using GPU ids.
1379 NvU32 uvm_gpu_peer_table_index(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
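// Illustrative sketch (one possible packing, not necessarily the one used):
// map the unordered pair of GPU indexes {i, j}, with i < j, to a slot in a
// flat triangular array:
//
//     i = min(uvm_id_gpu_index(gpu_id0), uvm_id_gpu_index(gpu_id1));
//     j = max(uvm_id_gpu_index(gpu_id0), uvm_id_gpu_index(gpu_id1));
//     index = j * (j - 1) / 2 + i;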
1380 
1381 // Either retains an existing PCIe peer entry or creates a new one. In both
1382 // cases the two GPUs are also each retained.
1383 // LOCKING: requires the global lock to be held
1384 NV_STATUS uvm_gpu_retain_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
1385 
1386 // Releases a PCIe peer entry and the two GPUs.
1387 // LOCKING: requires the global lock to be held
1388 void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);

// Get the aperture for local_gpu to use to map memory resident on remote_gpu.
// They must not be the same gpu.
uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu);

// Get the processor id accessible by the given GPU for the given physical
// address.
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);

// Get the P2P capabilities between the gpus with the given indexes
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);

// Get the P2P capabilities between the given gpus
static uvm_gpu_peer_t *uvm_gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
    return uvm_gpu_index_peer_caps(gpu0->id, gpu1->id);
}

static bool uvm_gpus_are_nvswitch_connected(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
    if (gpu0->parent->nvswitch_info.is_nvswitch_connected && gpu1->parent->nvswitch_info.is_nvswitch_connected) {
        UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type >= UVM_GPU_LINK_NVLINK_2);
        return true;
    }

    return false;
}

static bool uvm_gpus_are_indirect_peers(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);

    if (peer_caps->link_type != UVM_GPU_LINK_INVALID && peer_caps->is_indirect_peer) {
        UVM_ASSERT(gpu0->mem_info.numa.enabled);
        UVM_ASSERT(gpu1->mem_info.numa.enabled);
        UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_PCIE);
        UVM_ASSERT(!uvm_gpus_are_nvswitch_connected(gpu0, gpu1));
        return true;
    }

    return false;
}
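
// Illustrative sketch (not part of the driver): querying the peer relationship
// between two retained GPUs before deciding how to reach remote memory. The
// branching policy shown here is hypothetical.
//
//     uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
//
//     if (peer_caps->link_type == UVM_GPU_LINK_INVALID)
//         return NV_ERR_NOT_SUPPORTED;
//
//     if (uvm_gpus_are_indirect_peers(gpu0, gpu1)) {
//         // Indirect peers reach each other's memory through sysmem (NUMA)
//         // addresses rather than a peer aperture
//     }
//     else {
//         uvm_aperture_t aperture = uvm_gpu_peer_aperture(gpu0, gpu1);
//         // Map remote memory into gpu0's page tables using 'aperture'
//     }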

// Retrieve the virtual address corresponding to the given vidmem physical
// address, according to the linear vidmem mapping in the GPU kernel address
// space.
//
// The actual GPU mapping only exists if a full flat mapping, or a partial flat
// mapping covering the passed address, has been previously created.
static uvm_gpu_address_t uvm_gpu_address_virtual_from_vidmem_phys(uvm_gpu_t *gpu, NvU64 pa)
{
    UVM_ASSERT(uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent) ||
               uvm_mmu_parent_gpu_needs_dynamic_vidmem_mapping(gpu->parent));
    UVM_ASSERT(pa <= gpu->mem_info.max_allocatable_address);

    if (uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent))
        UVM_ASSERT(gpu->static_flat_mapping.ready);

    return uvm_gpu_address_virtual(gpu->parent->flat_vidmem_va_base + pa);
}
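
// Illustrative sketch (not part of the driver): converting a vidmem physical
// offset to a virtual address for an engine that must go through the flat
// kernel mapping. 'pa' is assumed to be covered by the flat mapping.
//
//     uvm_gpu_address_t addr = uvm_gpu_address_virtual_from_vidmem_phys(gpu, pa);
//     // addr is a virtual address inside the flat vidmem VA range and can be
//     // handed to engines that cannot take vidmem physical addresses directly.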

// Retrieve the virtual address corresponding to the given sysmem physical
// address, according to the linear sysmem mapping in the GPU kernel address
// space.
//
// The actual GPU mapping only exists if a linear mapping covering the passed
// address has been previously created.
static uvm_gpu_address_t uvm_parent_gpu_address_virtual_from_sysmem_phys(uvm_parent_gpu_t *parent_gpu, NvU64 pa)
{
    UVM_ASSERT(uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(parent_gpu));
    UVM_ASSERT(pa <= (parent_gpu->dma_addressable_limit - parent_gpu->dma_addressable_start));

    return uvm_gpu_address_virtual(parent_gpu->flat_sysmem_va_base + pa);
}

// Given a GPU or CPU physical address (not peer), retrieve an address suitable
// for CE access.
static uvm_gpu_address_t uvm_gpu_address_copy(uvm_gpu_t *gpu, uvm_gpu_phys_address_t phys_addr)
{
    UVM_ASSERT(phys_addr.aperture == UVM_APERTURE_VID || phys_addr.aperture == UVM_APERTURE_SYS);

    if (phys_addr.aperture == UVM_APERTURE_VID) {
        if (uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent) ||
            uvm_mmu_parent_gpu_needs_dynamic_vidmem_mapping(gpu->parent))
            return uvm_gpu_address_virtual_from_vidmem_phys(gpu, phys_addr.address);
    }
    else if (uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(gpu->parent)) {
        return uvm_parent_gpu_address_virtual_from_sysmem_phys(gpu->parent, phys_addr.address);
    }

    return uvm_gpu_address_from_phys(phys_addr);
}
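
// Illustrative sketch (not part of the driver): building a CE-friendly copy
// address from a physical address, regardless of whether this GPU requires
// flat virtual mappings. 'phys' is assumed to be a vidmem or sysmem physical
// address of memory accessible to this GPU.
//
//     uvm_gpu_address_t dst = uvm_gpu_address_copy(gpu, phys);
//     // 'dst' is either the physical address itself or a flat-mapping virtual
//     // address, whichever this GPU's CE can consume, and can be passed to
//     // copy paths that take a uvm_gpu_address_t.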

static uvm_gpu_identity_mapping_t *uvm_gpu_get_peer_mapping(uvm_gpu_t *gpu, uvm_gpu_id_t peer_id)
{
    return &gpu->peer_mappings[uvm_id_gpu_index(peer_id)];
}

// Check for ECC errors
//
// Notably this check cannot be performed where it's not safe to call into RM.
NV_STATUS uvm_gpu_check_ecc_error(uvm_gpu_t *gpu);

// Check for ECC errors without calling into RM
//
// Calling into RM is problematic in many places; this check is always safe to
// do. Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an ECC error
// and uvm_gpu_check_ecc_error() needs to be called to be sure.
NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu);
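
// Illustrative sketch (not part of the driver): the intended two-step ECC
// check. The cheap non-RM variant runs first; the RM-backed check is only
// issued when more processing is required.
//
//     NV_STATUS status = uvm_gpu_check_ecc_error_no_rm(gpu);
//     if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
//         status = uvm_gpu_check_ecc_error(gpu);
//     if (status != NV_OK)
//         return status;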

// Map size bytes of contiguous sysmem on the GPU for physical access
//
// size has to be aligned to PAGE_SIZE.
//
// Returns, in dma_address_out, the physical address that can be used to access
// the pages on the GPU.
NV_STATUS uvm_parent_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu, struct page *page, size_t size, NvU64 *dma_address_out);

// Unmap size bytes of sysmem previously mapped with uvm_parent_gpu_map_cpu_pages().
void uvm_parent_gpu_unmap_cpu_pages(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address, size_t size);

static NV_STATUS uvm_parent_gpu_map_cpu_page(uvm_parent_gpu_t *parent_gpu, struct page *page, NvU64 *dma_address_out)
{
    return uvm_parent_gpu_map_cpu_pages(parent_gpu, page, PAGE_SIZE, dma_address_out);
}

static void uvm_parent_gpu_unmap_cpu_page(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address)
{
    uvm_parent_gpu_unmap_cpu_pages(parent_gpu, dma_address, PAGE_SIZE);
}
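
// Illustrative sketch (not part of the driver): mapping a single CPU page for
// GPU access and unmapping it when the GPU is done. 'page' is assumed to be a
// valid struct page owned by the caller.
//
//     NvU64 dma_addr;
//     NV_STATUS status = uvm_parent_gpu_map_cpu_page(parent_gpu, page, &dma_addr);
//     if (status != NV_OK)
//         return status;
//
//     // ... the GPU accesses the page through 'dma_addr' ...
//
//     uvm_parent_gpu_unmap_cpu_page(parent_gpu, dma_addr);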

// Allocate and map a page of system DMA memory on the GPU for physical access
//
// Returns
// - the address that can be used to access the page on the GPU in the
//   dma_address_out parameter.
// - the address of the allocated memory in CPU virtual address space.
void *uvm_parent_gpu_dma_alloc_page(uvm_parent_gpu_t *parent_gpu,
                                    gfp_t gfp_flags,
                                    NvU64 *dma_address_out);

// Unmap and free a page of system DMA memory previously allocated with
// uvm_parent_gpu_dma_alloc_page().
void uvm_parent_gpu_dma_free_page(uvm_parent_gpu_t *parent_gpu, void *va, NvU64 dma_address);
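
// Illustrative sketch (not part of the driver): allocating a DMA page visible
// to both the CPU and the GPU, then freeing it. GFP_KERNEL is an assumption;
// callers pick GFP flags appropriate to their context.
//
//     NvU64 dma_addr;
//     void *cpu_va = uvm_parent_gpu_dma_alloc_page(parent_gpu, GFP_KERNEL, &dma_addr);
//     if (!cpu_va)
//         return NV_ERR_NO_MEMORY;
//
//     // CPU writes go through 'cpu_va'; the GPU accesses the page via 'dma_addr'
//
//     uvm_parent_gpu_dma_free_page(parent_gpu, cpu_va, dma_addr);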

// Returns whether the given range is within the GPU's addressable VA ranges.
// It requires the input 'addr' to be in canonical form on platforms that use
// canonical form addresses, i.e., ARM64 and x86.
// Warning: This only checks whether the GPU's MMU can support the given
// address. Some HW units on that GPU might only support a smaller range.
//
// The GPU must be initialized before calling this function.
bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);

// Returns whether the given range is within the GPU's addressable VA ranges in
// the internal GPU VA "kernel" address space, which is a linear address space.
// Therefore, the input 'addr' must not be in canonical form, even on platforms
// that use canonical form addresses, i.e., ARM64 and x86.
// Warning: This only checks whether the GPU's MMU can support the given
// address. Some HW units on that GPU might only support a smaller range.
//
// The GPU must be initialized before calling this function.
bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);

bool uvm_platform_uses_canonical_form_address(void);

// Returns addr's canonical form for host systems that use canonical form
// addresses.
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);
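
// Illustrative sketch (not part of the driver): validating a user VA range
// against the GPU's addressing limits. 'addr' and 'size' are assumed to come
// from the caller, with 'addr' already in canonical form where the platform
// requires it; the error code is hypothetical.
//
//     if (!uvm_gpu_can_address(gpu, addr, size))
//         return NV_ERR_OUT_OF_RANGE;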

static bool uvm_parent_gpu_is_coherent(const uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->system_bus.memory_window_end > parent_gpu->system_bus.memory_window_start;
}

static bool uvm_parent_gpu_needs_pushbuffer_segments(uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->max_host_va > (1ull << 40);
}

static bool uvm_parent_gpu_supports_eviction(uvm_parent_gpu_t *parent_gpu)
{
    // Eviction is supported only if the GPU supports replayable faults
    return parent_gpu->replayable_faults_supported;
}

static bool uvm_parent_gpu_is_virt_mode_sriov_heavy(const uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->virt_mode == UVM_VIRT_MODE_SRIOV_HEAVY;
}

static bool uvm_parent_gpu_is_virt_mode_sriov_standard(const uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->virt_mode == UVM_VIRT_MODE_SRIOV_STANDARD;
}

// Returns true if the virtualization mode is SR-IOV heavy or SR-IOV standard.
static bool uvm_parent_gpu_is_virt_mode_sriov(const uvm_parent_gpu_t *parent_gpu)
{
    return uvm_parent_gpu_is_virt_mode_sriov_heavy(parent_gpu) ||
           uvm_parent_gpu_is_virt_mode_sriov_standard(parent_gpu);
}

static bool uvm_parent_gpu_needs_proxy_channel_pool(const uvm_parent_gpu_t *parent_gpu)
{
    return uvm_parent_gpu_is_virt_mode_sriov_heavy(parent_gpu);
}

uvm_aperture_t uvm_get_page_tree_location(const uvm_parent_gpu_t *parent_gpu);

// Debug print of GPU properties
void uvm_gpu_print(uvm_gpu_t *gpu);

// Add the given instance pointer -> user_channel mapping to this GPU. The
// bottom half GPU page fault handler uses this to look up the VA space for GPU
// faults.
NV_STATUS uvm_parent_gpu_add_user_channel(uvm_parent_gpu_t *parent_gpu, uvm_user_channel_t *user_channel);
void uvm_parent_gpu_remove_user_channel(uvm_parent_gpu_t *parent_gpu, uvm_user_channel_t *user_channel);

// Looks up an entry added by uvm_parent_gpu_add_user_channel(). Return codes:
//  NV_OK                        Translation successful
//  NV_ERR_INVALID_CHANNEL       Entry's instance pointer was not found
//  NV_ERR_PAGE_TABLE_NOT_AVAIL  Entry's instance pointer is valid but the entry
//                               targets an invalid subcontext
//
// out_va_space is valid if NV_OK is returned, otherwise it's NULL. The caller
// is responsible for ensuring that the returned va_space can't be destroyed,
// so these functions should only be called from the bottom half.
NV_STATUS uvm_parent_gpu_fault_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
                                                 uvm_fault_buffer_entry_t *fault,
                                                 uvm_va_space_t **out_va_space);

NV_STATUS uvm_parent_gpu_access_counter_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
                                                          uvm_access_counter_buffer_entry_t *entry,
                                                          uvm_va_space_t **out_va_space);
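
// Illustrative sketch (not part of the driver): translating a fetched fault
// entry to its VA space from the bottom half, where the VA space is guaranteed
// not to be destroyed. 'fault' is assumed to point at a fetched fault buffer
// entry; the handling shown is hypothetical.
//
//     uvm_va_space_t *va_space;
//     NV_STATUS status = uvm_parent_gpu_fault_entry_to_va_space(parent_gpu, fault, &va_space);
//     if (status != NV_OK) {
//         // NV_ERR_INVALID_CHANNEL or NV_ERR_PAGE_TABLE_NOT_AVAIL: cancel or
//         // drop the fault instead of servicing it
//     }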

typedef enum
{
    UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT,
    UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
    UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
} uvm_gpu_buffer_flush_mode_t;

#endif // __UVM_GPU_H__