1 /*******************************************************************************
2     Copyright (c) 2015-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #ifndef __UVM_GPU_H__
25 #define __UVM_GPU_H__
26 
27 #include "nvtypes.h"
28 #include "nvmisc.h"
29 #include "uvm_types.h"
30 #include "nv_uvm_types.h"
31 #include "uvm_linux.h"
32 #include "nv-kref.h"
33 #include "uvm_common.h"
34 #include "ctrl2080mc.h"
35 #include "uvm_forward_decl.h"
36 #include "uvm_processors.h"
37 #include "uvm_pmm_gpu.h"
38 #include "uvm_pmm_sysmem.h"
39 #include "uvm_mmu.h"
40 #include "uvm_gpu_replayable_faults.h"
41 #include "uvm_gpu_isr.h"
42 #include "uvm_hal_types.h"
43 #include "uvm_hmm.h"
44 #include "uvm_va_block_types.h"
45 #include "uvm_perf_module.h"
46 #include "uvm_rb_tree.h"
47 #include "uvm_perf_prefetch.h"
48 #include "nv-kthread-q.h"
49 #include <linux/mmu_notifier.h>
50 #include "uvm_conf_computing.h"
51 
52 // Buffer length to store uvm gpu id, RM device name and gpu uuid.
53 #define UVM_GPU_NICE_NAME_BUFFER_LENGTH (sizeof("ID 999: : ") + \
54             UVM_GPU_NAME_LENGTH + UVM_GPU_UUID_TEXT_BUFFER_LENGTH)
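
// Example of the resulting printable name (illustrative values only):
//
//   "ID 3: <RM device name>: GPU-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
//
// i.e. "ID <uvm gpu id>: <RM device name>: <GPU UUID text>", which is what the
// "ID 999: : " skeleton above is sized for.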
55 
56 #define UVM_GPU_MAGIC_VALUE 0xc001d00d12341993ULL
57 
58 typedef struct
59 {
60     // Number of faults from this uTLB that have been fetched but have not been
61     // serviced yet.
62     NvU32 num_pending_faults;
63 
64     // Whether the uTLB contains fatal faults
65     bool has_fatal_faults;
66 
67     // We have issued a replay of type START_ACK_ALL while containing fatal
68     // faults. This puts the uTLB in lockdown mode and no new translations are
69     // accepted.
70     bool in_lockdown;
71 
72     // We have issued a cancel on this uTLB
73     bool cancelled;
74 
75     uvm_fault_buffer_entry_t prev_fatal_fault;
76 
77     // Last fetched fault that was originated from this uTLB. Used for fault
78     // filtering.
79     uvm_fault_buffer_entry_t *last_fault;
80 } uvm_fault_utlb_info_t;
81 
82 struct uvm_service_block_context_struct
83 {
84     //
85     // Fields initialized by CPU/GPU fault handling and access counter routines
86     //
87 
88     // Whether the information refers to replayable/non-replayable faults or
89     // access counters
90     uvm_service_operation_t operation;
91 
92     // Processors that will be the residency of pages after the operation has
93     // been serviced
94     uvm_processor_mask_t resident_processors;
95 
96     // VA block region that contains all the pages affected by the operation
97     uvm_va_block_region_t region;
98 
99     // Array of type uvm_fault_access_type_t that contains the type of the
100     // access that caused the fault/access_counter notification to be serviced
101     // for each page.
102     NvU8 access_type[PAGES_PER_UVM_VA_BLOCK];
103 
104     // Number of times the service operation has been retried
105     unsigned num_retries;
106 
107     // Pages that need to be pinned due to thrashing
108     uvm_page_mask_t thrashing_pin_mask;
109 
110     // Number of pages that need to be pinned due to thrashing. This is the same
111     // value as the result of bitmap_weight(thrashing_pin_mask)
112     unsigned thrashing_pin_count;
113 
114     // Pages that can be read-duplicated
115     uvm_page_mask_t read_duplicate_mask;
116 
    // Number of pages that can be read-duplicated. This is the same value as
    // the result of bitmap_weight(read_duplicate_mask)
119     unsigned read_duplicate_count;
120 
121     //
122     // Fields used by the CPU fault handling routine
123     //
124 
125     struct
126     {
127         // Node of the list of fault service contexts used by the CPU
128         struct list_head service_context_list;
129 
130         // A mask of GPUs that need to be checked for ECC errors before the CPU
131         // fault handler returns, but after the VA space lock has been unlocked
132         // to avoid the RM/UVM VA space lock deadlocks.
133         uvm_processor_mask_t gpus_to_check_for_ecc;
134 
135         // This is set to throttle page fault thrashing.
136         NvU64 wakeup_time_stamp;
137 
        // This is set if the page migrated between the GPU and the CPU.
        bool did_migrate;

        // Sequence number used to start an mmu notifier read-side critical
        // section.
143         unsigned long notifier_seq;
144 
145         struct vm_fault *vmf;
146     } cpu_fault;
147 
148     //
149     // Fields managed by the common operation servicing routine
150     //
151 
152     uvm_prot_page_mask_array_t mappings_by_prot;
153 
154     // Mask with the pages that did not migrate to the processor (they were
155     // already resident) in the last call to uvm_va_block_make_resident.
156     // This is used to compute the pages that need to revoke mapping permissions
157     // from other processors.
158     uvm_page_mask_t did_not_migrate_mask;
159 
160     // Pages whose permissions need to be revoked from other processors
161     uvm_page_mask_t revocation_mask;
162 
163     // Temporary mask used in service_va_block_locked() in
164     // uvm_gpu_access_counters.c.
165     uvm_processor_mask_t update_processors;
166 
167     struct
168     {
169         // Per-processor mask with the pages that will be resident after
170         // servicing. We need one mask per processor because we may coalesce
171         // faults that trigger migrations to different processors.
172         uvm_page_mask_t new_residency;
173     } per_processor_masks[UVM_ID_MAX_PROCESSORS];
174 
175     // State used by the VA block routines called by the servicing routine
176     uvm_va_block_context_t *block_context;
177 
178     // Prefetch state hint
179     uvm_perf_prefetch_hint_t prefetch_hint;
180 
181     // Prefetch temporary state.
182     uvm_perf_prefetch_bitmap_tree_t prefetch_bitmap_tree;
183 };
184 
185 typedef struct
186 {
187     // Mask of read faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a SAM
188     // VMA. Used for batching ATS faults in a vma. This is unused for access
189     // counter service requests.
190     uvm_page_mask_t read_fault_mask;
191 
192     // Mask of write faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a
193     // SAM VMA. Used for batching ATS faults in a vma. This is unused for access
194     // counter service requests.
195     uvm_page_mask_t write_fault_mask;
196 
197     // Mask of successfully serviced pages in a UVM_VA_BLOCK_SIZE aligned region
198     // of a SAM VMA. Used to return ATS fault status. This is unused for access
199     // counter service requests.
200     uvm_page_mask_t faults_serviced_mask;
201 
202     // Mask of successfully serviced read faults on pages in write_fault_mask.
203     // This is unused for access counter service requests.
204     uvm_page_mask_t reads_serviced_mask;
205 
206     // Mask of all accessed pages in a UVM_VA_BLOCK_SIZE aligned region of a SAM
207     // VMA. This is used as input for access counter service requests and output
208     // of fault service requests.
209     uvm_page_mask_t accessed_mask;
210 
211     // Client type of the service requestor.
212     uvm_fault_client_type_t client_type;
213 
214     // New residency ID of the faulting region.
215     uvm_processor_id_t residency_id;
216 
217     // New residency NUMA node ID of the faulting region.
218     int residency_node;
219 
220     struct
221     {
        // True if preferred_location was set on this faulting region.
        // The UVM_VA_BLOCK_SIZE sized region in the faulting region bound by
        // the VMA is prefetched if preferred_location was set and if
        // first_touch is true.
226         bool has_preferred_location;
227 
228         // True if the UVM_VA_BLOCK_SIZE sized region isn't resident on any
229         // node. False if any page in the region is resident somewhere.
230         bool first_touch;
231 
232         // Mask of prefetched pages in a UVM_VA_BLOCK_SIZE aligned region of a
233         // SAM VMA.
234         uvm_page_mask_t prefetch_pages_mask;
235 
236         // PFN info of the faulting region
237         unsigned long pfns[PAGES_PER_UVM_VA_BLOCK];
238 
239         // Faulting/preferred processor residency mask of the faulting region.
240         uvm_page_mask_t residency_mask;
241 
242 #if defined(NV_MMU_INTERVAL_NOTIFIER)
243         // MMU notifier used to compute residency of this faulting region.
244         struct mmu_interval_notifier notifier;
245 #endif
246 
247         uvm_va_space_t *va_space;
248 
249         // Prefetch temporary state.
250         uvm_perf_prefetch_bitmap_tree_t bitmap_tree;
251     } prefetch_state;
252 
253 } uvm_ats_fault_context_t;
254 
255 struct uvm_fault_service_batch_context_struct
256 {
257     // Array of elements fetched from the GPU fault buffer. The number of
258     // elements in this array is exactly max_batch_size
259     uvm_fault_buffer_entry_t *fault_cache;
260 
261     // Array of pointers to elements in fault cache used for fault
262     // preprocessing. The number of elements in this array is exactly
263     // max_batch_size
264     uvm_fault_buffer_entry_t **ordered_fault_cache;
265 
266     // Per uTLB fault information. Used for replay policies and fault
267     // cancellation on Pascal
268     uvm_fault_utlb_info_t *utlbs;
269 
270     // Largest uTLB id seen in a GPU fault
271     NvU32 max_utlb_id;
272 
273     NvU32 num_cached_faults;
274 
275     NvU32 num_coalesced_faults;
276 
277     // One of the VA spaces in this batch which had fatal faults. If NULL, no
278     // faults were fatal. More than one VA space could have fatal faults, but we
279     // pick one to be the target of the cancel sequence.
280     uvm_va_space_t *fatal_va_space;
281 
282     bool has_throttled_faults;
283 
284     NvU32 num_invalid_prefetch_faults;
285 
286     NvU32 num_duplicate_faults;
287 
288     NvU32 num_replays;
289 
290     uvm_ats_fault_context_t ats_context;
291 
292     // Unique id (per-GPU) generated for tools events recording
293     NvU32 batch_id;
294 
295     uvm_tracker_t tracker;
296 
297     // Boolean used to avoid sorting the fault batch by instance_ptr if we
298     // determine at fetch time that all the faults in the batch report the same
299     // instance_ptr
300     bool is_single_instance_ptr;
301 
302     // Last fetched fault. Used for fault filtering.
303     uvm_fault_buffer_entry_t *last_fault;
304 };
305 
306 struct uvm_ats_fault_invalidate_struct
307 {
308     bool            tlb_batch_pending;
309     uvm_tlb_batch_t tlb_batch;
310 };
311 
312 typedef struct
313 {
314     // Fault buffer information and structures provided by RM
315     UvmGpuFaultInfo rm_info;
316 
317     // Maximum number of faults to be processed in batch before fetching new
318     // entries from the GPU buffer
319     NvU32 max_batch_size;
320 
321     struct uvm_replayable_fault_buffer_info_struct
322     {
        // Maximum number of fault entries that can be stored in the buffer
324         NvU32 max_faults;
325 
326         // Cached value of the GPU GET register to minimize the round-trips
327         // over PCIe
328         NvU32 cached_get;
329 
330         // Cached value of the GPU PUT register to minimize the round-trips over
331         // PCIe
332         NvU32 cached_put;
333 
334         // Policy that determines when GPU replays are issued during normal
335         // fault servicing
336         uvm_perf_fault_replay_policy_t replay_policy;
337 
338         // Tracker used to aggregate replay operations, needed for fault cancel
339         // and GPU removal
340         uvm_tracker_t replay_tracker;
341 
        // If the ratio of duplicate faults in a batch is larger than
        // replay_update_put_ratio, the PUT pointer is updated before the
        // buffer flush that precedes the replay method.
345         NvU32 replay_update_put_ratio;
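
        // Illustrative sketch (assumed and simplified; the helper name below
        // is hypothetical) of where this threshold fits in batch servicing,
        // treating the ratio as a percentage:
        //
        //   if (batch->num_duplicate_faults * 100 >
        //       batch->num_cached_faults * replay_update_put_ratio)
        //       update_put_before_buffer_flush();  // then flush + replay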
346 
        // Fault statistics. These fields are per-GPU and most of them are only
        // updated during fault servicing, so they can be safely incremented
        // without atomics. Migration counters may be updated by multiple GPUs
        // and need to be incremented using atomics.
351         struct
352         {
353             NvU64 num_prefetch_faults;
354 
355             NvU64 num_read_faults;
356 
357             NvU64 num_write_faults;
358 
359             NvU64 num_atomic_faults;
360 
361             NvU64 num_duplicate_faults;
362 
363             atomic64_t num_pages_out;
364 
365             atomic64_t num_pages_in;
366 
367             NvU64 num_replays;
368 
369             NvU64 num_replays_ack_all;
370         } stats;
371 
372         // Number of uTLBs in the chip
373         NvU32 utlb_count;
374 
375         // Context structure used to service a GPU fault batch
376         uvm_fault_service_batch_context_t batch_service_context;
377 
378         // Structure used to coalesce fault servicing in a VA block
379         uvm_service_block_context_t block_service_context;
380 
381         // Information required to invalidate stale ATS PTEs from the GPU TLBs
382         uvm_ats_fault_invalidate_t ats_invalidate;
383     } replayable;
384 
385     struct uvm_non_replayable_fault_buffer_info_struct
386     {
        // Maximum number of fault entries that can be stored in the buffer
388         NvU32 max_faults;
389 
390         // Tracker used to aggregate clear faulted operations, needed for GPU
391         // removal
392         uvm_tracker_t clear_faulted_tracker;
393 
394         // Buffer used to store elements popped out from the queue shared with
395         // RM for fault servicing.
396         void *shadow_buffer_copy;
397 
398         // Array of elements fetched from the GPU fault buffer. The number of
399         // elements in this array is exactly max_batch_size
400         uvm_fault_buffer_entry_t *fault_cache;
401 
402         // Fault statistics. See replayable fault stats for more details.
403         struct
404         {
405             NvU64 num_read_faults;
406 
407             NvU64 num_write_faults;
408 
409             NvU64 num_atomic_faults;
410 
411             NvU64 num_physical_faults;
412 
413             atomic64_t num_pages_out;
414 
415             atomic64_t num_pages_in;
416         } stats;
417 
418         // Tracker which temporarily holds the work pushed to service faults
419         uvm_tracker_t fault_service_tracker;
420 
421         // Structure used to coalesce fault servicing in a VA block
422         uvm_service_block_context_t block_service_context;
423 
424         // Unique id (per-GPU) generated for tools events recording
425         NvU32 batch_id;
426 
427         // Information required to service ATS faults.
428         uvm_ats_fault_context_t ats_context;
429 
430         // Information required to invalidate stale ATS PTEs from the GPU TLBs
431         uvm_ats_fault_invalidate_t ats_invalidate;
432     } non_replayable;
433 
434     // Flag that tells if prefetch faults are enabled in HW
435     bool prefetch_faults_enabled;
436 
    // Timestamp of the last time prefetch faults were disabled
438     NvU64 disable_prefetch_faults_timestamp;
439 } uvm_fault_buffer_info_t;
440 
441 struct uvm_access_counter_service_batch_context_struct
442 {
443     uvm_access_counter_buffer_entry_t *notification_cache;
444 
445     NvU32 num_cached_notifications;
446 
447     struct
448     {
449         uvm_access_counter_buffer_entry_t   **notifications;
450 
451         NvU32                             num_notifications;
452 
453         // Boolean used to avoid sorting the fault batch by instance_ptr if we
454         // determine at fetch time that all the access counter notifications in
455         // the batch report the same instance_ptr
456         bool is_single_instance_ptr;
457     } virt;
458 
459     struct
460     {
461         uvm_access_counter_buffer_entry_t    **notifications;
462         uvm_reverse_map_t                      *translations;
463 
464         NvU32                              num_notifications;
465 
466         // Boolean used to avoid sorting the fault batch by aperture if we
467         // determine at fetch time that all the access counter notifications in
468         // the batch report the same aperture
469         bool                              is_single_aperture;
470     } phys;
471 
472     // Helper page mask to compute the accessed pages within a VA block
473     uvm_page_mask_t accessed_pages;
474 
475     // Structure used to coalesce access counter servicing in a VA block
476     uvm_service_block_context_t block_service_context;
477 
478     // Structure used to service access counter migrations in an ATS block.
479     uvm_ats_fault_context_t ats_context;
480 
481     // Unique id (per-GPU) generated for tools events recording
482     NvU32 batch_id;
483 };
484 
485 typedef struct
486 {
487     // Values used to configure access counters in RM
488     struct
489     {
490         UVM_ACCESS_COUNTER_GRANULARITY  granularity;
491         UVM_ACCESS_COUNTER_USE_LIMIT    use_limit;
492     } rm;
493 
494     // The following values are precomputed by the access counter notification
495     // handling code. See comments for UVM_MAX_TRANSLATION_SIZE in
496     // uvm_gpu_access_counters.c for more details.
497     NvU64 translation_size;
498 
499     NvU64 translations_per_counter;
500 
501     NvU64 sub_granularity_region_size;
502 
503     NvU64 sub_granularity_regions_per_translation;
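
    // Assumed relationship between the fields above (illustrative only; see
    // uvm_gpu_access_counters.c for the authoritative computation):
    //
    //   granularity bytes ~= translation_size * translations_per_counter
    //   translation_size  ~= sub_granularity_region_size *
    //                        sub_granularity_regions_per_translation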
504 } uvm_gpu_access_counter_type_config_t;
505 
506 typedef struct
507 {
508     UvmGpuAccessCntrInfo rm_info;
509 
510     NvU32 max_notifications;
511 
512     NvU32 max_batch_size;
513 
514     // Cached value of the GPU GET register to minimize the round-trips
515     // over PCIe
516     NvU32 cached_get;
517 
518     // Cached value of the GPU PUT register to minimize the round-trips over
519     // PCIe
520     NvU32 cached_put;
521 
522     // Tracker used to aggregate access counters clear operations, needed for
523     // GPU removal
524     uvm_tracker_t clear_tracker;
525 
526     // Current access counter configuration. During normal operation this
527     // information is computed once during GPU initialization. However, tests
528     // may override it to try different configuration values.
529     struct
530     {
531         uvm_gpu_access_counter_type_config_t mimc;
532         uvm_gpu_access_counter_type_config_t momc;
533 
534         NvU32                                threshold;
535     } current_config;
536 
537     // Access counter statistics
538     struct
539     {
540         atomic64_t num_pages_out;
541 
542         atomic64_t num_pages_in;
543     } stats;
544 
545     // Ignoring access counters means that notifications are left in the HW
546     // buffer without being serviced.  Requests to ignore access counters
547     // are counted since the suspend path inhibits access counter interrupts,
548     // and the resume path needs to know whether to reenable them.
549     NvU32 notifications_ignored_count;
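
    // Minimal sketch (assumed flow, simplified) of how this counter pairs the
    // suspend and resume paths:
    //
    //   suspend: notifications_ignored_count++;   // interrupts stay inhibited
    //   resume:  if (--notifications_ignored_count == 0)
    //                // re-enable access counter interrupts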
550 
551     // Context structure used to service a GPU access counter batch
552     uvm_access_counter_service_batch_context_t batch_service_context;
553 
554     // VA space that reconfigured the access counters configuration, if any.
555     // Used in builtin tests only, to avoid reconfigurations from different
556     // processes
557     //
558     // Locking: both readers and writers must hold the access counters ISR lock
559     uvm_va_space_t *reconfiguration_owner;
560 } uvm_access_counter_buffer_info_t;
561 
562 typedef struct
563 {
564     // VA where the identity mapping should be mapped in the internal VA
565     // space managed by uvm_gpu_t.address_space_tree (see below).
566     NvU64 base;
567 
568     // Page tables with the mapping.
569     uvm_page_table_range_vec_t *range_vec;
570 
571     // Used during init to indicate whether the mapping has been fully
572     // initialized.
573     bool ready;
574 } uvm_gpu_identity_mapping_t;
575 
576 // Root chunk mapping
577 typedef struct
578 {
579     // Page table range representation of the mapping. Because a root chunk
580     // fits into a single 2MB page, in practice the range consists of a single
581     // 2MB PTE.
582     uvm_page_table_range_t *range;
583 
584     // Number of mapped pages of size PAGE_SIZE.
585     NvU32 num_mapped_pages;
586 } uvm_gpu_root_chunk_mapping_t;
587 
588 typedef enum
589 {
590     UVM_GPU_LINK_INVALID = 0,
591     UVM_GPU_LINK_PCIE,
592     UVM_GPU_LINK_NVLINK_1,
593     UVM_GPU_LINK_NVLINK_2,
594     UVM_GPU_LINK_NVLINK_3,
595     UVM_GPU_LINK_NVLINK_4,
596     UVM_GPU_LINK_C2C,
597     UVM_GPU_LINK_MAX
598 } uvm_gpu_link_type_t;
599 
600 typedef enum
601 {
602     // Peer copies can be disallowed for a variety of reasons. For example,
603     // P2P transfers are disabled in pre-Pascal GPUs because there is no
604     // compelling use case for direct peer migrations.
605     UVM_GPU_PEER_COPY_MODE_UNSUPPORTED,
606 
607     // Pascal+ GPUs support virtual addresses in P2P copies. Virtual peer copies
608     // require the creation of peer identity mappings.
609     UVM_GPU_PEER_COPY_MODE_VIRTUAL,
610 
611     // Ampere+ GPUs support virtual and physical peer copies. Physical peer
612     // copies do not depend on peer identity mappings.
613     UVM_GPU_PEER_COPY_MODE_PHYSICAL,
614 
615     UVM_GPU_PEER_COPY_MODE_COUNT
616 } uvm_gpu_peer_copy_mode_t;
617 
618 // In order to support SMC/MIG GPU partitions, we split UVM GPUs into two
619 // parts: parent GPUs (uvm_parent_gpu_t) which represent unique PCIe devices
620 // (including VFs), and sub/child GPUs (uvm_gpu_t) which represent individual
621 // partitions within the parent. The parent GPU and partition GPU have
622 // different "id" and "uuid".
623 struct uvm_gpu_struct
624 {
625     uvm_parent_gpu_t *parent;
626 
627     // The gpu's GI uuid if SMC is enabled; otherwise, a copy of parent->uuid.
628     NvProcessorUuid uuid;
629 
630     // Nice printable name in the format:
631     // ID: 999: GPU-<parent_uuid> UVM-GI-<gi_uuid>.
632     // UVM_GPU_UUID_TEXT_BUFFER_LENGTH includes the null character.
633     char name[9 + 2 * UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
634 
635     // Refcount of the gpu, i.e. how many times it has been retained. This is
636     // roughly a count of how many times it has been registered with a VA space,
637     // except that some paths retain the GPU temporarily without a VA space.
638     //
639     // While this is >0, the GPU can't be removed. This differs from gpu_kref,
640     // which merely prevents the uvm_gpu_t object from being freed.
641     //
642     // In most cases this count is protected by the global lock: retaining a GPU
643     // from a UUID and any release require the global lock to be taken. But it's
644     // also useful for a caller to retain a GPU they've already retained, in
645     // which case there's no need to take the global lock. This can happen when
646     // an operation needs to drop the VA space lock but continue operating on a
647     // GPU. This is an atomic variable to handle those cases.
648     //
649     // Security note: keep it as a 64-bit counter to prevent overflow cases (a
650     // user can create a lot of va spaces and register the gpu with them).
651     atomic64_t retained_count;
652 
653     // A unique uvm gpu id in range [1, UVM_ID_MAX_PROCESSORS).
654     uvm_gpu_id_t id;
655 
656     // Should be UVM_GPU_MAGIC_VALUE. Used for memory checking.
657     NvU64 magic;
658 
659     struct
660     {
661         // The amount of memory the GPU has in total, in bytes. If the GPU is in
662         // ZeroFB testing mode, this will be 0.
663         NvU64 size;
664 
665         // Max (inclusive) physical address of this GPU's memory that the driver
666         // can allocate through PMM (PMA).
667         NvU64 max_allocatable_address;
668 
669         // Max supported vidmem page size may be smaller than the max GMMU page
670         // size, because of the vMMU supported page sizes.
671         NvU64 max_vidmem_page_size;
672 
673         struct
674         {
675             // True if the platform supports HW coherence and the GPU's memory
676             // is exposed as a NUMA node to the kernel.
677             bool enabled;
678             unsigned int node_id;
679         } numa;
680     } mem_info;
681 
682     struct
683     {
684         // Big page size used by the internal UVM VA space
685         // Notably it may be different than the big page size used by a user's
686         // VA space in general.
687         NvU32 internal_size;
688     } big_page;
689 
690     // Mapped registers needed to obtain the current GPU timestamp
691     struct
692     {
693         volatile NvU32 *time0_register;
694         volatile NvU32 *time1_register;
695     } time;
696 
697     // Identity peer mappings are only defined when
698     // peer_copy_mode == UVM_GPU_PEER_COPY_MODE_VIRTUAL
699     uvm_gpu_identity_mapping_t peer_mappings[UVM_ID_MAX_GPUS];
700 
701     struct
702     {
703         // Mask of peer_gpus set
704         //
705         // We can use a regular processor id because P2P is not allowed between
706         // partitioned GPUs when SMC is enabled
707         uvm_processor_mask_t peer_gpu_mask;
708 
709         // lazily-populated array of peer GPUs, indexed by the peer's GPU index
710         uvm_gpu_t *peer_gpus[UVM_ID_MAX_GPUS];
711 
712         // Leaf spinlock used to synchronize access to the peer_gpus table so
713         // that it can be safely accessed from the access counters bottom half
714         uvm_spinlock_t peer_gpus_lock;
715     } peer_info;
716 
717     // Maximum number of subcontexts supported
718     NvU32 max_subcontexts;
719 
720     // RM address space handle used in many of the UVM/RM APIs
721     // Represents a GPU VA space within rm_device.
722     //
723     // In SR-IOV heavy, proxy channels are not associated with this address
724     // space.
725     uvmGpuAddressSpaceHandle rm_address_space;
726 
727     // Page tree used for the internal UVM VA space shared with RM
728     uvm_page_tree_t address_space_tree;
729 
730     // Set to true during add_gpu() as soon as the RM's address space is moved
731     // to the address_space_tree.
732     bool rm_address_space_moved_to_page_tree;
733 
734     uvm_gpu_semaphore_pool_t *semaphore_pool;
735 
736     uvm_gpu_semaphore_pool_t *secure_semaphore_pool;
737 
738     uvm_channel_manager_t *channel_manager;
739 
740     uvm_pmm_gpu_t pmm;
741 
742     // Flat linear mapping covering vidmem. This is a kernel mapping that is
743     // only created in certain configurations.
744     //
745     // There are two mutually exclusive versions of the mapping. The simplest
746     // version covers the entire GPU memory, and it is created during GPU
747     // initialization. The dynamic version is a partial vidmem mapping that
748     // creates and destroys mappings to GPU root chunks on demand.
749     union
750     {
751         // Static mapping covering the whole GPU memory.
752         uvm_gpu_identity_mapping_t static_flat_mapping;
753 
754         // Dynamic mapping of GPU memory.
755         struct
756         {
757             // Array of root chunk mappings.
758             uvm_gpu_root_chunk_mapping_t *array;
759 
760             // Number of elements in the array.
761             size_t count;
762 
763             // Each bit in the bitlock protects a single root chunk mapping.
764             uvm_bit_locks_t bitlocks;
765 
766         } root_chunk_mappings;
767     };
768 
769     // Linear sysmem mappings. Mappings are added on demand, and removed upon
770     // GPU deinitialization. The mappings are added to UVM's internal address
771     // space i.e. they are kernel mappings.
772     //
773     // Only used in SR-IOV heavy.
774     struct
775     {
776         // Size of each mapping, in bytes.
777         NvU64 mapping_size;
778 
779         // Array of sysmem mappings.
780         uvm_gpu_identity_mapping_t *array;
781 
782         // Number of elements in the array.
783         size_t count;
784 
785         // Each bit in the bitlock protects a sysmem mapping.
786         uvm_bit_locks_t bitlocks;
787     } sysmem_mappings;
788 
789     // Reverse lookup table used to query the user mapping associated with a
790     // sysmem (DMA) physical address.
791     //
792     // The system memory mapping information referred to by this field is
793     // different from that of sysmem_mappings, because it relates to user
794     // mappings (instead of kernel), and it is used in most configurations.
795     uvm_pmm_sysmem_mappings_t pmm_reverse_sysmem_mappings;
796 
797     struct
798     {
799         uvm_conf_computing_dma_buffer_pool_t dma_buffer_pool;
800 
        // Dummy memory used to store the IV contents during CE encryption.
        // This memory location is only available after CE channels are
        // created, because we use them to write PTEs for allocations such as
        // this one. This location is used when physical addressing for the IV
        // buffer is required. See uvm_hal_hopper_ce_encrypt().
806         uvm_mem_t *iv_mem;
807 
        // Dummy memory used to store the IV contents during CE encryption.
        // Because of the limitations of `iv_mem', and the need to have such a
        // buffer at channel initialization, we use an RM allocation.
        // This location is used when virtual addressing for the IV buffer is
        // required. See uvm_hal_hopper_ce_encrypt().
813         uvm_rm_mem_t *iv_rm_mem;
814     } conf_computing;
815 
816     // ECC handling
817     // In order to trap ECC errors as soon as possible the driver has the hw
818     // interrupt register mapped directly. If an ECC interrupt is ever noticed
819     // to be pending, then the UVM driver needs to:
820     //
821     //   1) ask RM to service interrupts, and then
822     //   2) inspect the ECC error notifier state.
823     //
824     // Notably, checking for channel errors is not enough, because ECC errors
825     // can be pending, even after a channel has become idle.
826     //
827     // See more details in uvm_gpu_check_ecc_error().
828     struct
829     {
830         // Does the GPU have ECC enabled?
831         bool enabled;
832 
833         // Direct mapping of the 32-bit part of the hw interrupt tree that has
834         // the ECC bits.
835         volatile NvU32 *hw_interrupt_tree_location;
836 
837         // Mask to get the ECC interrupt bits from the 32-bits above.
838         NvU32 mask;
839 
840         // Set to true by RM when a fatal ECC error is encountered (requires
841         // asking RM to service pending interrupts to be current).
842         NvBool *error_notifier;
843     } ecc;
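
    // Minimal sketch (assumed and simplified) of the ECC check described
    // above, roughly what uvm_gpu_check_ecc_error() is expected to boil down
    // to:
    //
    //   if (*gpu->ecc.hw_interrupt_tree_location & gpu->ecc.mask) {
    //       // 1) ask RM to service interrupts, then
    //       // 2) if (*gpu->ecc.error_notifier) -> report a fatal ECC error
    //   }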
844 
845     struct
846     {
847         NvU32 swizz_id;
848 
849         // RM device handle used in many of the UVM/RM APIs.
850         //
851         // Do not read this field directly, use uvm_gpu_device_handle instead.
852         uvmGpuDeviceHandle rm_device;
853     } smc;
854 
855     struct
856     {
857         struct proc_dir_entry *dir;
858 
859         struct proc_dir_entry *dir_symlink;
860 
861         // The GPU instance UUID symlink if SMC is enabled.
862         struct proc_dir_entry *gpu_instance_uuid_symlink;
863 
864         struct proc_dir_entry *info_file;
865 
866         struct proc_dir_entry *dir_peers;
867     } procfs;
868 
869     // Placeholder for per-GPU performance heuristics information
870     uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];
871 
872     // Force pushbuffer's GPU VA to be >= 1TB; used only for testing purposes.
873     bool uvm_test_force_upper_pushbuffer_segment;
874 };
875 
876 // In order to support SMC/MIG GPU partitions, we split UVM GPUs into two
877 // parts: parent GPUs (uvm_parent_gpu_t) which represent unique PCIe devices
878 // (including VFs), and sub/child GPUs (uvm_gpu_t) which represent individual
879 // partitions within the parent. The parent GPU and partition GPU have
880 // different "id" and "uuid".
881 struct uvm_parent_gpu_struct
882 {
883     // Reference count for how many places are holding on to a parent GPU
884     // (internal to the UVM driver).  This includes any GPUs we know about, not
885     // just GPUs that are registered with a VA space.  Most GPUs end up being
886     // registered, but there are brief periods when they are not registered,
887     // such as during interrupt handling, and in add_gpu() or remove_gpu().
888     nv_kref_t gpu_kref;
889 
890     // The number of uvm_gpu_ts referencing this uvm_parent_gpu_t.
891     NvU32 num_retained_gpus;
892 
893     uvm_gpu_t *gpus[UVM_PARENT_ID_MAX_SUB_PROCESSORS];
894 
895     // Bitmap of valid child entries in the gpus[] table.  Used to retrieve a
896     // usable child GPU in bottom-halves.
897     DECLARE_BITMAP(valid_gpus, UVM_PARENT_ID_MAX_SUB_PROCESSORS);
898 
899     // The gpu's uuid
900     NvProcessorUuid uuid;
901 
902     // Nice printable name including the uvm gpu id, ascii name from RM and uuid
903     char name[UVM_GPU_NICE_NAME_BUFFER_LENGTH];
904 
    // GPU information provided by RM (architecture, implementation,
    // hardware classes, etc.).
907     UvmGpuInfo rm_info;
908 
909     // A unique uvm gpu id in range [1, UVM_PARENT_ID_MAX_PROCESSORS)
910     uvm_parent_gpu_id_t id;
911 
912     // Reference to the Linux PCI device
913     //
914     // The reference to the PCI device remains valid as long as the GPU is
915     // registered with RM's Linux layer (between nvUvmInterfaceRegisterGpu() and
916     // nvUvmInterfaceUnregisterGpu()).
917     struct pci_dev *pci_dev;
918 
919     // NVLINK Processing Unit (NPU) on PowerPC platforms. The NPU is a
920     // collection of CPU-side PCI devices which bridge GPU NVLINKs and the CPU
921     // memory bus.
922     //
923     // There is one PCI device per NVLINK. A set of NVLINKs connects to a single
924     // GPU, and all NVLINKs for a given socket are collected logically under
925     // this UVM NPU because some resources (such as register mappings) are
926     // shared by all those NVLINKs. This means multiple GPUs may connect to the
927     // same UVM NPU.
928     uvm_ibm_npu_t *npu;
929 
930     // On kernels with NUMA support, this entry contains the closest CPU NUMA
931     // node to this GPU. Otherwise, the value will be -1.
932     int closest_cpu_numa_node;
933 
934     // RM device handle used in many of the UVM/RM APIs.
935     //
936     // Do not read this field directly, use uvm_gpu_device_handle instead.
937     uvmGpuDeviceHandle rm_device;
938 
939     // The physical address range addressable by the GPU
940     //
941     // The GPU has its NV_PFB_XV_UPPER_ADDR register set by RM to
942     // dma_addressable_start (in bifSetupDmaWindow_IMPL()) and hence when
943     // referencing sysmem from the GPU, dma_addressable_start should be
944     // subtracted from the physical address. The DMA mapping helpers like
945     // uvm_parent_gpu_map_cpu_pages() and uvm_parent_gpu_dma_alloc_page() take
946     // care of that.
947     NvU64 dma_addressable_start;
948     NvU64 dma_addressable_limit;
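
    // Illustrative example of the adjustment described above (handled
    // internally by the DMA mapping helpers; shown here only to make the
    // direction of the subtraction explicit):
    //
    //   gpu_dma_addr = cpu_phys_addr - parent_gpu->dma_addressable_start;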
949 
950     // Total size (in bytes) of physically mapped (with
951     // uvm_parent_gpu_map_cpu_pages) sysmem pages, used for leak detection.
952     atomic64_t mapped_cpu_pages_size;
953 
954     // Hardware Abstraction Layer
955     uvm_host_hal_t *host_hal;
956     uvm_ce_hal_t *ce_hal;
957     uvm_arch_hal_t *arch_hal;
958     uvm_fault_buffer_hal_t *fault_buffer_hal;
959     uvm_access_counter_buffer_hal_t *access_counter_buffer_hal;
960     uvm_sec2_hal_t *sec2_hal;
961 
962     // Whether CE supports physical addressing mode for writes to vidmem
963     bool ce_phys_vidmem_write_supported;
964 
965     uvm_gpu_peer_copy_mode_t peer_copy_mode;
966 
967     // Virtualization mode of the GPU.
968     UVM_VIRT_MODE virt_mode;
969 
970     // Pascal+ GPUs can trigger faults on prefetch instructions. If false, this
971     // feature must be disabled at all times in GPUs of the given architecture.
972     // If true, the feature can be toggled at will by SW.
973     //
974     // The field should not be used unless the GPU supports replayable faults.
975     bool prefetch_fault_supported;
976 
977     // Number of membars required to flush out HSHUB following a TLB invalidate
978     NvU32 num_hshub_tlb_invalidate_membars;
979 
980     // Whether the channels can configure GPFIFO in vidmem
981     bool gpfifo_in_vidmem_supported;
982 
983     bool replayable_faults_supported;
984 
985     bool non_replayable_faults_supported;
986 
987     bool access_counters_supported;
988 
989     // If this is true, physical address based access counter notifications are
990     // potentially generated. If false, only virtual address based notifications
991     // are generated (assuming access_counters_supported is true too).
992     bool access_counters_can_use_physical_addresses;
993 
994     bool fault_cancel_va_supported;
995 
996     // True if the GPU has hardware support for scoped atomics
997     bool scoped_atomics_supported;
998 
999     // If true, a HW method can be used to clear a faulted channel.
1000     // If false, then the GPU supports clearing faulted channels using registers
1001     // instead of a HW method.
1002     // This value is only defined for GPUs that support non-replayable faults.
1003     bool has_clear_faulted_channel_method;
1004 
1005     // If true, a SW method can be used to clear a faulted channel.
1006     // If false, the HW method or the registers (whichever is available
1007     // according to has_clear_faulted_channel_method) needs to be used.
1008     //
1009     // This value is only defined for GPUs that support non-replayable faults.
1010     bool has_clear_faulted_channel_sw_method;
1011 
1012     bool sparse_mappings_supported;
1013 
    // Ampere (GA100) requires map->invalidate->remap->invalidate for page size
    // promotion
1016     bool map_remap_larger_page_promotion;
1017 
1018     bool plc_supported;
1019 
1020     // If true, page_tree initialization pre-populates no_ats_ranges. It only
1021     // affects ATS systems.
1022     bool no_ats_range_required;
1023 
1024     // Parameters used by the TLB batching API
1025     struct
1026     {
1027         // Is the targeted (single page) VA invalidate supported at all?
1028         NvBool va_invalidate_supported;
1029 
1030         // Is the VA range invalidate supported?
1031         NvBool va_range_invalidate_supported;
1032 
1033         union
1034         {
1035             // Maximum (inclusive) number of single page invalidations before
1036             // falling back to invalidate all
1037             NvU32 max_pages;
1038 
1039             // Maximum (inclusive) number of range invalidations before falling
1040             // back to invalidate all
1041             NvU32 max_ranges;
1042         };
1043     } tlb_batch;
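
    // Minimal sketch (assumed and simplified) of how the limits above are
    // applied when flushing a batch of invalidates:
    //
    //   if (num_pages > parent_gpu->tlb_batch.max_pages)
    //       // fall back to a full "invalidate all" instead of single-page
    //       // (or per-range) invalidates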
1044 
1045     // Largest VA (exclusive) which can be used for channel buffer mappings
1046     NvU64 max_channel_va;
1047 
    // Largest VA (exclusive) on which Host can operate.
1049     NvU64 max_host_va;
1050 
1051     // Indicates whether the GPU can map sysmem with pages larger than 4k
1052     bool can_map_sysmem_with_large_pages;
1053 
1054     // VA base and size of the RM managed part of the internal UVM VA space.
1055     //
    // The internal UVM VA space is shared with RM: RM controls some of the top
    // level PDEs and leaves the rest for UVM to control.
    // On Pascal, a single top level PDE covers 128 TB of VA, and given that
    // semaphores and other allocations limited to 40 bits are currently
    // allocated through RM, RM needs to control the [0, 128TB) VA range at
    // least for now.
    // On Maxwell, RM's VA is limited to [0, 128GB), which should easily fit
    // all RM allocations and leave enough space for UVM.
1063     NvU64 rm_va_base;
1064     NvU64 rm_va_size;
1065 
1066     // Base and size of the GPU VA used for uvm_mem_t allocations mapped in the
1067     // internal address_space_tree.
1068     NvU64 uvm_mem_va_base;
1069     NvU64 uvm_mem_va_size;
1070 
1071     // Base of the GPU VAs used for the vidmem and sysmem flat mappings.
1072     NvU64 flat_vidmem_va_base;
1073     NvU64 flat_sysmem_va_base;
1074 
1075     // Bitmap of allocation sizes for user memory supported by a GPU. PAGE_SIZE
1076     // is guaranteed to be both present and the smallest size.
1077     uvm_chunk_sizes_mask_t mmu_user_chunk_sizes;
1078 
1079     // Bitmap of allocation sizes that could be requested by the page tree for
1080     // a GPU
1081     uvm_chunk_sizes_mask_t mmu_kernel_chunk_sizes;
1082 
1083     struct
1084     {
1085         struct proc_dir_entry *dir;
1086 
1087         struct proc_dir_entry *fault_stats_file;
1088 
1089         struct proc_dir_entry *access_counters_file;
1090     } procfs;
1091 
1092     // Interrupt handling state and locks
1093     uvm_isr_info_t isr;
1094 
    // Fault buffer info. This is only valid if replayable_faults_supported is
    // set to true.
1097     uvm_fault_buffer_info_t fault_buffer_info;
1098 
1099     // PMM lazy free processing queue.
1100     // TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
1101     nv_kthread_q_t lazy_free_q;
1102 
    // Access counter buffer info. This is only valid if
    // access_counters_supported is set to true.
1105     uvm_access_counter_buffer_info_t access_counter_buffer_info;
1106 
1107     // Number of uTLBs per GPC. This information is only valid on Pascal+ GPUs.
1108     NvU32 utlb_per_gpc_count;
1109 
1110     // In order to service GPU faults, UVM must be able to obtain the VA
1111     // space for each reported fault. The fault packet contains the
1112     // instance_ptr of the channel that was bound when the SMs triggered
1113     // the fault. On fault any instance pointer in the TSG may be
1114     // reported. This is a problem on Volta, which allow different channels
    // reported. This is a problem on Volta, which allows different channels
1116     // subcontexts. In order to be able to obtain the correct VA space, HW
1117     // provides the subcontext id (or VEID) in addition to the instance_ptr.
1118     //
1119     // Summary:
1120     //
1121     // 1) Channels in a TSG may be in different VA spaces, identified by their
1122     // subcontext ID.
1123     // 2) Different subcontext IDs may map to the same or different VA spaces.
1124     // 3) On fault, any instance pointer in the TSG may be reported. The
1125     // reported subcontext ID identifies which VA space within the TSG actually
1126     // encountered the fault.
1127     //
1128     // Thus, UVM needs to keep track of all the instance pointers that belong
1129     // to the same TSG. We use two tables:
1130     //
1131     // - instance_ptr_table (instance_ptr -> subctx_info) this table maps
1132     // instance pointers to the subcontext info descriptor for the channel. If
1133     // the channel belongs to a subcontext, this descriptor will contain all
1134     // the VA spaces for the subcontexts in the same TSG. If the channel does
1135     // not belong to a subcontext, it will only contain a pointer to its VA
1136     // space.
1137     // - tsg_table (tsg_id -> subctx_info): this table also stores the
1138     // subctx information, but in this case it is indexed by TSG ID. Thus,
1139     // when a new channel bound to a subcontext is registered, it will check
1140     // first in this table if the subcontext information descriptor for its TSG
1141     // already exists, otherwise it will create it. Channels not bound to
1142     // subcontexts will not use this table.
1143     //
1144     // The bottom half reads the tables under
1145     // isr.replayable_faults_handler.lock, but a separate lock is necessary
1146     // because entries are added and removed from the table under the va_space
1147     // lock, and we can't take isr.replayable_faults_handler.lock while holding
1148     // the va_space lock.
1149     uvm_rb_tree_t tsg_table;
1150 
1151     uvm_rb_tree_t instance_ptr_table;
1152     uvm_spinlock_t instance_ptr_table_lock;
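
    // Minimal sketch (assumed flow; the descriptor field names below are
    // hypothetical) of how the two tables above are used on a fault:
    //
    //   subctx_info = lookup(&instance_ptr_table, fault->instance_ptr);
    //   if (subctx_info->in_subctx)
    //       va_space = subctx_info->subctx_va_spaces[fault->ve_id];
    //   else
    //       va_space = subctx_info->va_space;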
1153 
1154     // This is set to true if the GPU belongs to an SLI group.
1155     bool sli_enabled;
1156 
1157     struct
1158     {
1159         bool supported;
1160 
1161         bool enabled;
1162     } smc;
1163 
    // Global statistics. These fields are per-GPU and most of them are only
    // updated during fault servicing, so they can be safely incremented
    // without atomics.
1166     struct
1167     {
1168         NvU64          num_replayable_faults;
1169 
1170         NvU64      num_non_replayable_faults;
1171 
1172         atomic64_t             num_pages_out;
1173 
1174         atomic64_t              num_pages_in;
1175     } stats;
1176 
1177     // Structure to hold nvswitch specific information. In an nvswitch
1178     // environment, rather than using the peer-id field of the PTE (which can
1179     // only address 8 gpus), all gpus are assigned a 47-bit physical address
    // space by the fabric manager. Any physical address access to these
    // physical address spaces is routed through the switch to the
    // corresponding peer.
1183     struct
1184     {
1185         bool is_nvswitch_connected;
1186 
1187         // 47-bit fabric memory physical offset that peer gpus need to access
1188         // to read a peer's memory
1189         NvU64 fabric_memory_window_start;
1190     } nvswitch_info;
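
    // Illustrative example of the addressing described above (assumed and
    // simplified): a peer access to physical offset 'pa' of this GPU's memory
    // targets the fabric address
    //
    //   fabric_addr = nvswitch_info.fabric_memory_window_start + pa;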
1191 
1192     struct
1193     {
1194         // Note that this represents the link to system memory, not the link the
1195         // system used to discover the GPU. There are some cases such as NVLINK2
1196         // where the GPU is still on the PCIe bus, but it accesses memory over
1197         // this link rather than PCIe.
1198         uvm_gpu_link_type_t link;
1199         NvU32 link_rate_mbyte_per_s;
1200 
1201         // Range in the system physical address space where the memory of this
1202         // GPU is exposed as coherent. memory_window_end is inclusive.
1203         // memory_window_start == memory_window_end indicates that no window is
1204         // present (coherence is not supported).
1205         NvU64 memory_window_start;
1206         NvU64 memory_window_end;
1207     } system_bus;
1208 
1209     // WAR to issue ATS TLB invalidation commands ourselves.
1210     struct
1211     {
1212         uvm_mutex_t smmu_lock;
1213         struct page *smmu_cmdq;
1214         void __iomem *smmu_cmdqv_base;
1215         unsigned long smmu_prod;
1216         unsigned long smmu_cons;
1217     } smmu_war;
1218 };
1219 
static const char *uvm_parent_gpu_name(uvm_parent_gpu_t *parent_gpu)
1221 {
1222     return parent_gpu->name;
1223 }
1224 
static const char *uvm_gpu_name(uvm_gpu_t *gpu)
1226 {
1227     return gpu->name;
1228 }
1229 
static uvmGpuDeviceHandle uvm_gpu_device_handle(uvm_gpu_t *gpu)
1231 {
1232     if (gpu->parent->smc.enabled)
1233         return gpu->smc.rm_device;
1234     return gpu->parent->rm_device;
1235 }
1236 
1237 struct uvm_gpu_peer_struct
1238 {
1239     // The fields in this global structure can only be inspected under one of
1240     // the following conditions:
1241     //
1242     // - The VA space lock is held for either read or write, both GPUs are
1243     //   registered in the VA space, and the corresponding bit in the
1244     //   va_space.enabled_peers bitmap is set.
1245     //
1246     // - The global lock is held.
1247     //
1248     // - While the global lock was held in the past, the two GPUs were detected
1249     //   to be SMC peers and were both retained.
1250     //
1251     // - While the global lock was held in the past, the two GPUs were detected
1252     //   to be NVLINK peers and were both retained.
1253     //
1254     // - While the global lock was held in the past, the two GPUs were detected
1255     //   to be PCIe peers and uvm_gpu_retain_pcie_peer_access() was called.
1256     //
1257     // - The peer_gpus_lock is held on one of the GPUs. In this case, the other
1258     //   GPU must be read from the original GPU's peer_gpus table. The fields
1259     //   will not change while the lock is held, but they may no longer be valid
1260     //   because the other GPU might be in teardown.
1261 
    // Peer Id associated with this device w.r.t. a peer GPU.
1263     // Note: peerId (A -> B) != peerId (B -> A)
1264     // peer_id[0] from min(gpu_id_1, gpu_id_2) -> max(gpu_id_1, gpu_id_2)
1265     // peer_id[1] from max(gpu_id_1, gpu_id_2) -> min(gpu_id_1, gpu_id_2)
1266     NvU8 peer_ids[2];
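
    // Illustrative example (assumed and simplified) of how the indexing above
    // is used to pick the peer id for accesses from local_gpu to remote_gpu:
    //
    //   idx = uvm_id_value(local_gpu->id) < uvm_id_value(remote_gpu->id) ? 0 : 1;
    //   peer_id = peer_caps->peer_ids[idx];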
1267 
1268     // Indirect peers are GPUs which can coherently access each others' memory
1269     // over NVLINK, but are routed through the CPU using the SYS aperture rather
1270     // than a PEER aperture
1271     NvU8 is_indirect_peer : 1;
1272 
1273     // The link type between the peer GPUs, currently either PCIe or NVLINK.
    // This field is used to determine when this peer struct has been
1275     // initialized (link_type != UVM_GPU_LINK_INVALID). NVLink peers are
1276     // initialized at GPU registration time. PCIe peers are initialized when
1277     // the refcount below goes from 0 to 1.
1278     uvm_gpu_link_type_t link_type;
1279 
1280     // Maximum unidirectional bandwidth between the peers in megabytes per
1281     // second, not taking into account the protocols' overhead. The reported
1282     // bandwidth for indirect peers is zero. See UvmGpuP2PCapsParams.
1283     NvU32 total_link_line_rate_mbyte_per_s;
1284 
1285     // For PCIe, the number of times that this has been retained by a VA space.
1286     // For NVLINK this will always be 1.
1287     NvU64 ref_count;
1288 
    // This handle gets populated when enable_peer_access successfully creates
    // an NV50_P2P object. disable_peer_access resets it when the object is
    // deleted.
1292     NvHandle p2p_handle;
1293 
1294     struct
1295     {
1296         struct proc_dir_entry *peer_file[2];
1297         struct proc_dir_entry *peer_symlink_file[2];
1298 
1299         // GPU-A <-> GPU-B link is bidirectional, pairs[x][0] is always the
1300         // local GPU, while pairs[x][1] is the remote GPU. The table shall be
1301         // filled like so: [[GPU-A, GPU-B], [GPU-B, GPU-A]].
1302         uvm_gpu_t *pairs[2][2];
1303     } procfs;
1304 };
1305 
1306 // Initialize global gpu state
1307 NV_STATUS uvm_gpu_init(void);
1308 
1309 // Deinitialize global state (called from module exit)
1310 void uvm_gpu_exit(void);
1311 
1312 NV_STATUS uvm_gpu_init_va_space(uvm_va_space_t *va_space);
1313 
1314 void uvm_gpu_exit_va_space(uvm_va_space_t *va_space);
1315 
static unsigned int uvm_gpu_numa_node(uvm_gpu_t *gpu)
1317 {
1318     UVM_ASSERT(gpu->mem_info.numa.enabled);
1319     return gpu->mem_info.numa.node_id;
1320 }
1321 
static uvm_gpu_phys_address_t uvm_gpu_page_to_phys_address(uvm_gpu_t *gpu, struct page *page)
1323 {
1324     unsigned long sys_addr = page_to_pfn(page) << PAGE_SHIFT;
1325     unsigned long gpu_offset = sys_addr - gpu->parent->system_bus.memory_window_start;
1326 
1327     UVM_ASSERT(page_to_nid(page) == uvm_gpu_numa_node(gpu));
1328     UVM_ASSERT(sys_addr >= gpu->parent->system_bus.memory_window_start);
1329     UVM_ASSERT(sys_addr + PAGE_SIZE - 1 <= gpu->parent->system_bus.memory_window_end);
1330 
1331     return uvm_gpu_phys_address(UVM_APERTURE_VID, gpu_offset);
1332 }
1333 
1334 // Note that there is a uvm_gpu_get() function defined in uvm_global.h to break
1335 // a circular dep between global and gpu modules.
1336 
1337 // Get a uvm_gpu_t by UUID (physical GPU UUID if SMC is not enabled, otherwise
1338 // GPU instance UUID).
1339 // This returns NULL if the GPU is not present.
1340 // This is the general purpose call that should be used normally.
1341 //
1342 // LOCKING: requires the global lock to be held
1343 uvm_gpu_t *uvm_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid);
1344 
1345 // Get a uvm_parent_gpu_t by UUID (physical GPU UUID).
1346 // Like uvm_gpu_get_by_uuid(), this function returns NULL if the GPU has not
1347 // been registered.
1348 //
1349 // LOCKING: requires the global lock to be held
1350 uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid);
1351 
1352 // Like uvm_parent_gpu_get_by_uuid(), but this variant does not assertion-check
1353 // that the caller is holding the global_lock.  This is a narrower-purpose
1354 // function, and is only intended for use by the top-half ISR, or other very
1355 // limited cases.
1356 uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid_locked(const NvProcessorUuid *gpu_uuid);
1357 
1358 // Retain a gpu by uuid
1359 // Returns the retained uvm_gpu_t in gpu_out on success
1360 //
1361 // LOCKING: Takes and releases the global lock for the caller.
1362 NV_STATUS uvm_gpu_retain_by_uuid(const NvProcessorUuid *gpu_uuid,
1363                                  const uvm_rm_user_object_t *user_rm_device,
1364                                  uvm_gpu_t **gpu_out);
1365 
1366 // Retain a gpu which is known to already be retained. Does NOT require the
1367 // global lock to be held.
1368 void uvm_gpu_retain(uvm_gpu_t *gpu);
1369 
1370 // Release a gpu
1371 // LOCKING: requires the global lock to be held
1372 void uvm_gpu_release_locked(uvm_gpu_t *gpu);
1373 
1374 // Like uvm_gpu_release_locked, but takes and releases the global lock for the
1375 // caller.
1376 void uvm_gpu_release(uvm_gpu_t *gpu);
1377 
static NvU64 uvm_gpu_retained_count(uvm_gpu_t *gpu)
1379 {
1380     return atomic64_read(&gpu->retained_count);
1381 }
1382 
1383 // Decrease the refcount on the parent GPU object, and actually delete the object
1384 // if the refcount hits zero.
1385 void uvm_parent_gpu_kref_put(uvm_parent_gpu_t *gpu);
1386 
1387 // Calculates peer table index using GPU ids.
1388 NvU32 uvm_gpu_peer_table_index(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
1389 
1390 // Either retains an existing PCIe peer entry or creates a new one. In both
1391 // cases the two GPUs are also each retained.
1392 // LOCKING: requires the global lock to be held
1393 NV_STATUS uvm_gpu_retain_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
1394 
1395 // Releases a PCIe peer entry and the two GPUs.
1396 // LOCKING: requires the global lock to be held
1397 void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
1398 
// Get the aperture for local_gpu to use to map memory resident on remote_gpu.
// They must not be the same gpu.
uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu);

// Get the processor id accessible by the given GPU for the given physical
// address.
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);

// Get the P2P capabilities between the gpus with the given indexes
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);

// Get the P2P capabilities between the given gpus
static uvm_gpu_peer_t *uvm_gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
    return uvm_gpu_index_peer_caps(gpu0->id, gpu1->id);
}

static bool uvm_gpus_are_nvswitch_connected(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
    if (gpu0->parent->nvswitch_info.is_nvswitch_connected && gpu1->parent->nvswitch_info.is_nvswitch_connected) {
        UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type >= UVM_GPU_LINK_NVLINK_2);
        return true;
    }

    return false;
}

static bool uvm_gpus_are_indirect_peers(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);

    if (peer_caps->link_type != UVM_GPU_LINK_INVALID && peer_caps->is_indirect_peer) {
        UVM_ASSERT(gpu0->mem_info.numa.enabled);
        UVM_ASSERT(gpu1->mem_info.numa.enabled);
        UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_PCIE);
        UVM_ASSERT(!uvm_gpus_are_nvswitch_connected(gpu0, gpu1));
        return true;
    }

    return false;
}

// Retrieve the virtual address corresponding to the given vidmem physical
// address, according to the linear vidmem mapping in the GPU kernel address
// space.
//
// The actual GPU mapping only exists if a full flat mapping, or a partial flat
// mapping covering the passed address, has been previously created.
static uvm_gpu_address_t uvm_gpu_address_virtual_from_vidmem_phys(uvm_gpu_t *gpu, NvU64 pa)
{
    UVM_ASSERT(uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent) ||
               uvm_mmu_parent_gpu_needs_dynamic_vidmem_mapping(gpu->parent));
    UVM_ASSERT(pa <= gpu->mem_info.max_allocatable_address);

    if (uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent))
        UVM_ASSERT(gpu->static_flat_mapping.ready);

    return uvm_gpu_address_virtual(gpu->parent->flat_vidmem_va_base + pa);
}

// Retrieve the virtual address corresponding to the given sysmem physical
// address, according to the linear sysmem mapping in the GPU kernel address
// space.
//
// The actual GPU mapping only exists if a linear mapping covering the passed
// address has been previously created.
static uvm_gpu_address_t uvm_parent_gpu_address_virtual_from_sysmem_phys(uvm_parent_gpu_t *parent_gpu, NvU64 pa)
{
    UVM_ASSERT(uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(parent_gpu));
    UVM_ASSERT(pa <= (parent_gpu->dma_addressable_limit - parent_gpu->dma_addressable_start));

    return uvm_gpu_address_virtual(parent_gpu->flat_sysmem_va_base + pa);
}

// Given a GPU or CPU physical address (not peer), retrieve an address suitable
// for CE access.
static uvm_gpu_address_t uvm_gpu_address_copy(uvm_gpu_t *gpu, uvm_gpu_phys_address_t phys_addr)
{
    UVM_ASSERT(phys_addr.aperture == UVM_APERTURE_VID || phys_addr.aperture == UVM_APERTURE_SYS);

    if (phys_addr.aperture == UVM_APERTURE_VID) {
        if (uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent) ||
            uvm_mmu_parent_gpu_needs_dynamic_vidmem_mapping(gpu->parent))
            return uvm_gpu_address_virtual_from_vidmem_phys(gpu, phys_addr.address);
    }
    else if (uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(gpu->parent)) {
        return uvm_parent_gpu_address_virtual_from_sysmem_phys(gpu->parent, phys_addr.address);
    }

    return uvm_gpu_address_from_phys(phys_addr);
}

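// Example (illustrative sketch): building a CE-friendly address for a sysmem
// page that was mapped with uvm_parent_gpu_map_cpu_page() (declared below).
// uvm_gpu_phys_address() here refers to the physical-address constructor from
// uvm_hal_types.h.
//
//     uvm_gpu_phys_address_t phys = uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
//     uvm_gpu_address_t copy_addr = uvm_gpu_address_copy(gpu, phys);
//
//     // copy_addr can now be used as a source or destination for a CE copy;
//     // it is virtual when a flat sysmem mapping exists, physical otherwise.
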
static uvm_gpu_identity_mapping_t *uvm_gpu_get_peer_mapping(uvm_gpu_t *gpu, uvm_gpu_id_t peer_id)
{
    return &gpu->peer_mappings[uvm_id_gpu_index(peer_id)];
}

// Check for ECC errors
//
// Notably this check cannot be performed where it's not safe to call into RM.
NV_STATUS uvm_gpu_check_ecc_error(uvm_gpu_t *gpu);

// Check for ECC errors without calling into RM
//
// Calling into RM is problematic in many places; this check, in contrast, is
// always safe to do. Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might
// be an ECC error, in which case uvm_gpu_check_ecc_error() must be called to
// be sure.
NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu);

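// Example (illustrative sketch): the two-step ECC check pattern described
// above, for a context where calling into RM only becomes safe later (any
// lock juggling needed to make the RM call safe is elided).
//
//     status = uvm_gpu_check_ecc_error_no_rm(gpu);
//     if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
//         // There might be an ECC error; confirm with the full check once it
//         // is safe to call into RM.
//         status = uvm_gpu_check_ecc_error(gpu);
//     }
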
// Map size bytes of contiguous sysmem on the GPU for physical access
//
// size has to be aligned to PAGE_SIZE.
//
// Returns, in dma_address_out, the physical address that the GPU can use to
// access the pages.
NV_STATUS uvm_parent_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu, struct page *page, size_t size, NvU64 *dma_address_out);

// Unmap size bytes previously mapped with uvm_parent_gpu_map_cpu_pages().
void uvm_parent_gpu_unmap_cpu_pages(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address, size_t size);

static NV_STATUS uvm_parent_gpu_map_cpu_page(uvm_parent_gpu_t *parent_gpu, struct page *page, NvU64 *dma_address_out)
{
    return uvm_parent_gpu_map_cpu_pages(parent_gpu, page, PAGE_SIZE, dma_address_out);
}

static void uvm_parent_gpu_unmap_cpu_page(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address)
{
    uvm_parent_gpu_unmap_cpu_pages(parent_gpu, dma_address, PAGE_SIZE);
}

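// Example (illustrative sketch): mapping a single CPU page for GPU access and
// tearing the mapping down again. The page is allocated with the regular
// kernel page allocator purely for illustration.
//
//     struct page *page = alloc_page(GFP_KERNEL);
//     NvU64 dma_addr;
//
//     if (!page)
//         return NV_ERR_NO_MEMORY;
//
//     status = uvm_parent_gpu_map_cpu_page(parent_gpu, page, &dma_addr);
//     if (status != NV_OK) {
//         __free_page(page);
//         return status;
//     }
//
//     // ... the GPU can now access the page physically at dma_addr ...
//
//     uvm_parent_gpu_unmap_cpu_page(parent_gpu, dma_addr);
//     __free_page(page);
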
// Allocate and map a page of system DMA memory on the GPU for physical access
//
// Returns
// - the address of the page that the GPU can use to access it, in the
//   dma_address_out parameter.
// - the CPU virtual address of the allocated memory, as the function's return
//   value.
void *uvm_parent_gpu_dma_alloc_page(uvm_parent_gpu_t *parent_gpu,
                                    gfp_t gfp_flags,
                                    NvU64 *dma_address_out);

// Unmap and free a page of sysmem DMA previously allocated with
// uvm_parent_gpu_dma_alloc_page().
void uvm_parent_gpu_dma_free_page(uvm_parent_gpu_t *parent_gpu, void *va, NvU64 dma_address);

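// Example (illustrative sketch): allocating a DMA page, using both addresses
// it produces, and freeing it again.
//
//     NvU64 dma_addr;
//     void *cpu_va = uvm_parent_gpu_dma_alloc_page(parent_gpu, GFP_KERNEL, &dma_addr);
//
//     if (!cpu_va)
//         return NV_ERR_NO_MEMORY;
//
//     memset(cpu_va, 0, PAGE_SIZE);   // CPU access through the returned VA
//     // ... the GPU can access the same page physically at dma_addr ...
//
//     uvm_parent_gpu_dma_free_page(parent_gpu, cpu_va, dma_addr);
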
// Returns whether the given range is within the GPU's addressable VA ranges.
// It requires the input 'addr' to be in canonical form on platforms that use
// canonical-form addresses, e.g., ARM64 and x86.
// Warning: This only checks whether the GPU's MMU can support the given
// address. Some HW units on that GPU might only support a smaller range.
//
// The GPU must be initialized before calling this function.
bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);

// Returns whether the given range is within the GPU's addressable VA ranges in
// the internal GPU VA "kernel" address space, which is a linear address space.
// Therefore, the input 'addr' must not be in canonical form, even on platforms
// that use canonical-form addresses, e.g., ARM64 and x86.
// Warning: This only checks whether the GPU's MMU can support the given
// address. Some HW units on that GPU might only support a smaller range.
//
// The GPU must be initialized before calling this function.
bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);

bool uvm_platform_uses_canonical_form_address(void);

// Returns addr's canonical form for host systems that use canonical form
// addresses.
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);

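// Example (illustrative sketch): checking whether a user VA range is GPU
// addressable, converting to canonical form first since uvm_gpu_can_address()
// expects canonical addresses on platforms that use them.
//
//     NvU64 canonical = uvm_parent_gpu_canonical_address(gpu->parent, addr);
//
//     if (!uvm_gpu_can_address(gpu, canonical, size))
//         return NV_ERR_OUT_OF_RANGE;
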
static bool uvm_parent_gpu_is_coherent(const uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->system_bus.memory_window_end > parent_gpu->system_bus.memory_window_start;
}

static bool uvm_parent_gpu_needs_pushbuffer_segments(uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->max_host_va > (1ull << 40);
}

static bool uvm_parent_gpu_supports_eviction(uvm_parent_gpu_t *parent_gpu)
{
    // Eviction is supported only if the GPU supports replayable faults
    return parent_gpu->replayable_faults_supported;
}

static bool uvm_parent_gpu_is_virt_mode_sriov_heavy(const uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->virt_mode == UVM_VIRT_MODE_SRIOV_HEAVY;
}

static bool uvm_parent_gpu_is_virt_mode_sriov_standard(const uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->virt_mode == UVM_VIRT_MODE_SRIOV_STANDARD;
}

// Returns true if the virtualization mode is SR-IOV heavy or SR-IOV standard.
static bool uvm_parent_gpu_is_virt_mode_sriov(const uvm_parent_gpu_t *parent_gpu)
{
    return uvm_parent_gpu_is_virt_mode_sriov_heavy(parent_gpu) ||
           uvm_parent_gpu_is_virt_mode_sriov_standard(parent_gpu);
}

static bool uvm_parent_gpu_needs_proxy_channel_pool(const uvm_parent_gpu_t *parent_gpu)
{
    return uvm_parent_gpu_is_virt_mode_sriov_heavy(parent_gpu);
}

uvm_aperture_t uvm_get_page_tree_location(const uvm_parent_gpu_t *parent_gpu);

// Debug print of GPU properties
void uvm_gpu_print(uvm_gpu_t *gpu);

// Add the given instance pointer -> user_channel mapping to this GPU. The
// bottom half GPU page fault handler uses this to look up the VA space for GPU
// faults.
NV_STATUS uvm_parent_gpu_add_user_channel(uvm_parent_gpu_t *parent_gpu, uvm_user_channel_t *user_channel);
void uvm_parent_gpu_remove_user_channel(uvm_parent_gpu_t *parent_gpu, uvm_user_channel_t *user_channel);

// Looks up an entry added by uvm_parent_gpu_add_user_channel(). Return codes:
//  NV_OK                        Translation successful
//  NV_ERR_INVALID_CHANNEL       Entry's instance pointer was not found
//  NV_ERR_PAGE_TABLE_NOT_AVAIL  Entry's instance pointer is valid but the entry
//                               targets an invalid subcontext
//
// out_va_space is valid if NV_OK is returned, otherwise it's NULL. The caller
// is responsible for ensuring that the returned va_space can't be destroyed,
// so these functions should only be called from the bottom half.
NV_STATUS uvm_parent_gpu_fault_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
                                                 uvm_fault_buffer_entry_t *fault,
                                                 uvm_va_space_t **out_va_space);

NV_STATUS uvm_parent_gpu_access_counter_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
                                                          uvm_access_counter_buffer_entry_t *entry,
                                                          uvm_va_space_t **out_va_space);

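// Example (illustrative sketch): translating a fetched fault entry to its VA
// space in the bottom half, handling the return codes documented above. How a
// failed translation is ultimately handled (e.g. marking the fault fatal) is
// elided here.
//
//     uvm_va_space_t *va_space;
//
//     status = uvm_parent_gpu_fault_entry_to_va_space(parent_gpu, fault, &va_space);
//     if (status != NV_OK) {
//         // NV_ERR_INVALID_CHANNEL or NV_ERR_PAGE_TABLE_NOT_AVAIL: the fault
//         // cannot be attributed to a VA space (va_space is NULL).
//         return status;
//     }
//
//     // ... service the fault against va_space ...
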
typedef enum
{
    UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT,
    UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
    UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
} uvm_gpu_buffer_flush_mode_t;

#endif // __UVM_GPU_H__