1 /*******************************************************************************
2 Copyright (c) 2015-2023 NVIDIA Corporation
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to
6 deal in the Software without restriction, including without limitation the
7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8 sell copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be
12 included in all copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 DEALINGS IN THE SOFTWARE.
21
22 *******************************************************************************/
23
24 #ifndef __UVM_GPU_H__
25 #define __UVM_GPU_H__
26
27 #include "nvtypes.h"
28 #include "nvmisc.h"
29 #include "uvm_types.h"
30 #include "nv_uvm_types.h"
31 #include "uvm_linux.h"
32 #include "nv-kref.h"
33 #include "uvm_common.h"
34 #include "ctrl2080mc.h"
35 #include "uvm_forward_decl.h"
36 #include "uvm_processors.h"
37 #include "uvm_pmm_gpu.h"
38 #include "uvm_pmm_sysmem.h"
39 #include "uvm_mmu.h"
40 #include "uvm_gpu_replayable_faults.h"
41 #include "uvm_gpu_isr.h"
42 #include "uvm_hal_types.h"
43 #include "uvm_hmm.h"
44 #include "uvm_va_block_types.h"
45 #include "uvm_perf_module.h"
46 #include "uvm_rb_tree.h"
47 #include "uvm_perf_prefetch.h"
48 #include "nv-kthread-q.h"
49 #include <linux/mmu_notifier.h>
50 #include "uvm_conf_computing.h"
51
// Buffer length to store uvm gpu id, RM device name and gpu uuid.
// Sized for the "ID 999: <name>: <uuid>" format plus terminators.
#define UVM_GPU_NICE_NAME_BUFFER_LENGTH (sizeof("ID 999: : ") + \
    UVM_GPU_NAME_LENGTH + UVM_GPU_UUID_TEXT_BUFFER_LENGTH)

// Sentinel stored in uvm_gpu_t::magic, used for memory corruption checking.
#define UVM_GPU_MAGIC_VALUE 0xc001d00d12341993ULL
57
// Per-uTLB bookkeeping used while servicing replayable GPU faults.
typedef struct
{
    // Number of faults from this uTLB that have been fetched but have not been
    // serviced yet.
    NvU32 num_pending_faults;

    // Whether the uTLB contains fatal faults
    bool has_fatal_faults;

    // We have issued a replay of type START_ACK_ALL while containing fatal
    // faults. This puts the uTLB in lockdown mode and no new translations are
    // accepted.
    bool in_lockdown;

    // We have issued a cancel on this uTLB
    bool cancelled;

    // NOTE(review): by its name, the previously-recorded fatal fault entry for
    // this uTLB (stored by value, unlike last_fault) — confirm against the
    // fault cancellation path.
    uvm_fault_buffer_entry_t prev_fatal_fault;

    // Last fetched fault that was originated from this uTLB. Used for fault
    // filtering.
    uvm_fault_buffer_entry_t *last_fault;
} uvm_fault_utlb_info_t;
81
// Context used to coalesce and service fault/access-counter operations within
// a single VA block. Embedded in both the replayable and non-replayable fault
// buffer state as well as the access counter batch context.
struct uvm_service_block_context_struct
{
    //
    // Fields initialized by CPU/GPU fault handling and access counter routines
    //

    // Whether the information refers to replayable/non-replayable faults or
    // access counters
    uvm_service_operation_t operation;

    // Processors that will be the residency of pages after the operation has
    // been serviced
    uvm_processor_mask_t resident_processors;

    // VA block region that contains all the pages affected by the operation
    uvm_va_block_region_t region;

    // Array of type uvm_fault_access_type_t that contains the type of the
    // access that caused the fault/access_counter notification to be serviced
    // for each page.
    NvU8 access_type[PAGES_PER_UVM_VA_BLOCK];

    // Number of times the service operation has been retried
    unsigned num_retries;

    // Pages that need to be pinned due to thrashing
    uvm_page_mask_t thrashing_pin_mask;

    // Number of pages that need to be pinned due to thrashing. This is the same
    // value as the result of bitmap_weight(thrashing_pin_mask)
    unsigned thrashing_pin_count;

    // Pages that can be read-duplicated
    uvm_page_mask_t read_duplicate_mask;

    // Number of pages that can be read-duplicated. This is the same value as
    // the result of bitmap_weight(read_duplicate_mask)
    unsigned read_duplicate_count;

    //
    // Fields used by the CPU fault handling routine
    //

    struct
    {
        // Node of the list of fault service contexts used by the CPU
        struct list_head service_context_list;

        // A mask of GPUs that need to be checked for ECC errors before the CPU
        // fault handler returns, but after the VA space lock has been unlocked
        // to avoid the RM/UVM VA space lock deadlocks.
        uvm_processor_mask_t gpus_to_check_for_ecc;

        // This is set to throttle page fault thrashing.
        NvU64 wakeup_time_stamp;

        // This is set if the page migrated to/from the GPU and CPU.
        bool did_migrate;

        // Sequence number used to start a mmu notifier read side critical
        // section.
        unsigned long notifier_seq;

        // Kernel fault descriptor for the CPU fault being serviced.
        struct vm_fault *vmf;
    } cpu_fault;

    //
    // Fields managed by the common operation servicing routine
    //

    // Pages to be mapped, bucketed by the protection level to map with.
    uvm_prot_page_mask_array_t mappings_by_prot;

    // Mask with the pages that did not migrate to the processor (they were
    // already resident) in the last call to uvm_va_block_make_resident.
    // This is used to compute the pages that need to revoke mapping permissions
    // from other processors.
    uvm_page_mask_t did_not_migrate_mask;

    // Pages whose permissions need to be revoked from other processors
    uvm_page_mask_t revocation_mask;

    // Temporary mask used in service_va_block_locked() in
    // uvm_gpu_access_counters.c.
    uvm_processor_mask_t update_processors;

    struct
    {
        // Per-processor mask with the pages that will be resident after
        // servicing. We need one mask per processor because we may coalesce
        // faults that trigger migrations to different processors.
        uvm_page_mask_t new_residency;
    } per_processor_masks[UVM_ID_MAX_PROCESSORS];

    // State used by the VA block routines called by the servicing routine
    uvm_va_block_context_t *block_context;

    // Prefetch state hint
    uvm_perf_prefetch_hint_t prefetch_hint;

    // Prefetch temporary state.
    uvm_perf_prefetch_bitmap_tree_t prefetch_bitmap_tree;
};
184
// Context used to service ATS (Address Translation Services) faults and
// access counter notifications on system-allocated (SAM) VMAs.
typedef struct
{
    // Mask of read faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a SAM
    // VMA. Used for batching ATS faults in a vma. This is unused for access
    // counter service requests.
    uvm_page_mask_t read_fault_mask;

    // Mask of write faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a
    // SAM VMA. Used for batching ATS faults in a vma. This is unused for access
    // counter service requests.
    uvm_page_mask_t write_fault_mask;

    // Mask of successfully serviced pages in a UVM_VA_BLOCK_SIZE aligned region
    // of a SAM VMA. Used to return ATS fault status. This is unused for access
    // counter service requests.
    uvm_page_mask_t faults_serviced_mask;

    // Mask of successfully serviced read faults on pages in write_fault_mask.
    // This is unused for access counter service requests.
    uvm_page_mask_t reads_serviced_mask;

    // Mask of all accessed pages in a UVM_VA_BLOCK_SIZE aligned region of a SAM
    // VMA. This is used as input for access counter service requests and output
    // of fault service requests.
    uvm_page_mask_t accessed_mask;

    // Client type of the service requestor.
    uvm_fault_client_type_t client_type;

    // New residency ID of the faulting region.
    uvm_processor_id_t residency_id;

    // New residency NUMA node ID of the faulting region.
    int residency_node;

    struct
    {
        // True if preferred_location was set on this faulting region.
        // The UVM_VA_BLOCK_SIZE sized region in the faulting region bound by
        // the VMA is prefetched if preferred_location was set and if
        // first_touch is true.
        bool has_preferred_location;

        // True if the UVM_VA_BLOCK_SIZE sized region isn't resident on any
        // node. False if any page in the region is resident somewhere.
        bool first_touch;

        // Mask of prefetched pages in a UVM_VA_BLOCK_SIZE aligned region of a
        // SAM VMA.
        uvm_page_mask_t prefetch_pages_mask;

        // PFN info of the faulting region
        unsigned long pfns[PAGES_PER_UVM_VA_BLOCK];

        // Faulting/preferred processor residency mask of the faulting region.
        uvm_page_mask_t residency_mask;

#if defined(NV_MMU_INTERVAL_NOTIFIER)
        // MMU notifier used to compute residency of this faulting region.
        struct mmu_interval_notifier notifier;
#endif

        // VA space the faulting region belongs to.
        uvm_va_space_t *va_space;

        // Prefetch temporary state.
        uvm_perf_prefetch_bitmap_tree_t bitmap_tree;
    } prefetch_state;

} uvm_ats_fault_context_t;
254
// Context used to fetch and service a batch of replayable GPU faults.
struct uvm_fault_service_batch_context_struct
{
    // Array of elements fetched from the GPU fault buffer. The number of
    // elements in this array is exactly max_batch_size
    uvm_fault_buffer_entry_t *fault_cache;

    // Array of pointers to elements in fault cache used for fault
    // preprocessing. The number of elements in this array is exactly
    // max_batch_size
    uvm_fault_buffer_entry_t **ordered_fault_cache;

    // Per uTLB fault information. Used for replay policies and fault
    // cancellation on Pascal
    uvm_fault_utlb_info_t *utlbs;

    // Largest uTLB id seen in a GPU fault
    NvU32 max_utlb_id;

    // Number of fault entries currently held in fault_cache.
    NvU32 num_cached_faults;

    // NOTE(review): presumably the number of faults remaining after duplicate
    // coalescing (see num_duplicate_faults below) — confirm against the fetch
    // and preprocessing code.
    NvU32 num_coalesced_faults;

    // One of the VA spaces in this batch which had fatal faults. If NULL, no
    // faults were fatal. More than one VA space could have fatal faults, but we
    // pick one to be the target of the cancel sequence.
    uvm_va_space_t *fatal_va_space;

    // Whether this batch contained any throttled faults.
    bool has_throttled_faults;

    // Counters accumulated while servicing this batch.
    NvU32 num_invalid_prefetch_faults;

    NvU32 num_duplicate_faults;

    NvU32 num_replays;

    // Context used to service the ATS faults in this batch.
    uvm_ats_fault_context_t ats_context;

    // Unique id (per-GPU) generated for tools events recording
    NvU32 batch_id;

    // Tracker for the work pushed while servicing this batch.
    uvm_tracker_t tracker;

    // Boolean used to avoid sorting the fault batch by instance_ptr if we
    // determine at fetch time that all the faults in the batch report the same
    // instance_ptr
    bool is_single_instance_ptr;

    // Last fetched fault. Used for fault filtering.
    uvm_fault_buffer_entry_t *last_fault;
};
305
// Pending TLB invalidate work used to flush stale ATS PTEs from the GPU TLBs.
struct uvm_ats_fault_invalidate_struct
{
    // True when tlb_batch holds invalidates that have not been issued yet.
    bool tlb_batch_pending;

    // Batched TLB invalidate operations.
    uvm_tlb_batch_t tlb_batch;
};
311
// Per-GPU state of the HW fault buffers shared with RM, covering both the
// replayable and non-replayable fault paths, plus their servicing contexts
// and statistics.
typedef struct
{
    // Fault buffer information and structures provided by RM
    UvmGpuFaultInfo rm_info;

    // Maximum number of faults to be processed in batch before fetching new
    // entries from the GPU buffer
    NvU32 max_batch_size;

    struct uvm_replayable_fault_buffer_info_struct
    {
        // Maximum number of fault entries that can be stored in the buffer
        NvU32 max_faults;

        // Cached value of the GPU GET register to minimize the round-trips
        // over PCIe
        NvU32 cached_get;

        // Cached value of the GPU PUT register to minimize the round-trips over
        // PCIe
        NvU32 cached_put;

        // Policy that determines when GPU replays are issued during normal
        // fault servicing
        uvm_perf_fault_replay_policy_t replay_policy;

        // Tracker used to aggregate replay operations, needed for fault cancel
        // and GPU removal
        uvm_tracker_t replay_tracker;

        // If there is a ratio larger than replay_update_put_ratio of duplicate
        // faults in a batch, PUT pointer is updated before flushing the buffer
        // that comes before the replay method.
        NvU32 replay_update_put_ratio;

        // Fault statistics. These fields are per-GPU and most of them are only
        // updated during fault servicing, and can be safely incremented.
        // Migrations may be triggered by different GPUs and need to be
        // incremented using atomics
        struct
        {
            NvU64 num_prefetch_faults;

            NvU64 num_read_faults;

            NvU64 num_write_faults;

            NvU64 num_atomic_faults;

            NvU64 num_duplicate_faults;

            atomic64_t num_pages_out;

            atomic64_t num_pages_in;

            NvU64 num_replays;

            NvU64 num_replays_ack_all;
        } stats;

        // Number of uTLBs in the chip
        NvU32 utlb_count;

        // Context structure used to service a GPU fault batch
        uvm_fault_service_batch_context_t batch_service_context;

        // Structure used to coalesce fault servicing in a VA block
        uvm_service_block_context_t block_service_context;

        // Information required to invalidate stale ATS PTEs from the GPU TLBs
        uvm_ats_fault_invalidate_t ats_invalidate;
    } replayable;

    struct uvm_non_replayable_fault_buffer_info_struct
    {
        // Maximum number of fault entries that can be stored in the buffer
        NvU32 max_faults;

        // Tracker used to aggregate clear faulted operations, needed for GPU
        // removal
        uvm_tracker_t clear_faulted_tracker;

        // Buffer used to store elements popped out from the queue shared with
        // RM for fault servicing.
        void *shadow_buffer_copy;

        // Array of elements fetched from the GPU fault buffer. The number of
        // elements in this array is exactly max_batch_size
        uvm_fault_buffer_entry_t *fault_cache;

        // Fault statistics. See replayable fault stats for more details.
        struct
        {
            NvU64 num_read_faults;

            NvU64 num_write_faults;

            NvU64 num_atomic_faults;

            NvU64 num_physical_faults;

            atomic64_t num_pages_out;

            atomic64_t num_pages_in;
        } stats;

        // Tracker which temporarily holds the work pushed to service faults
        uvm_tracker_t fault_service_tracker;

        // Structure used to coalesce fault servicing in a VA block
        uvm_service_block_context_t block_service_context;

        // Unique id (per-GPU) generated for tools events recording
        NvU32 batch_id;

        // Information required to service ATS faults.
        uvm_ats_fault_context_t ats_context;

        // Information required to invalidate stale ATS PTEs from the GPU TLBs
        uvm_ats_fault_invalidate_t ats_invalidate;
    } non_replayable;

    // Flag that tells if prefetch faults are enabled in HW
    bool prefetch_faults_enabled;

    // Timestamp when prefetch faults were disabled last time
    NvU64 disable_prefetch_faults_timestamp;
} uvm_fault_buffer_info_t;
440
// Context used to fetch and service a batch of access counter notifications.
struct uvm_access_counter_service_batch_context_struct
{
    // Array of notifications fetched from the GPU access counter buffer.
    uvm_access_counter_buffer_entry_t *notification_cache;

    // Number of entries currently held in notification_cache.
    NvU32 num_cached_notifications;

    // Virtual-address notifications in the batch.
    struct
    {
        uvm_access_counter_buffer_entry_t **notifications;

        NvU32 num_notifications;

        // Boolean used to avoid sorting the notification batch by instance_ptr
        // if we determine at fetch time that all the access counter
        // notifications in the batch report the same instance_ptr
        bool is_single_instance_ptr;
    } virt;

    // Physical-address notifications in the batch.
    struct
    {
        uvm_access_counter_buffer_entry_t **notifications;
        uvm_reverse_map_t *translations;

        NvU32 num_notifications;

        // Boolean used to avoid sorting the notification batch by aperture if
        // we determine at fetch time that all the access counter notifications
        // in the batch report the same aperture
        bool is_single_aperture;
    } phys;

    // Helper page mask to compute the accessed pages within a VA block
    uvm_page_mask_t accessed_pages;

    // Structure used to coalesce access counter servicing in a VA block
    uvm_service_block_context_t block_service_context;

    // Structure used to service access counter migrations in an ATS block.
    uvm_ats_fault_context_t ats_context;

    // Unique id (per-GPU) generated for tools events recording
    NvU32 batch_id;
};
484
// Per-counter-type (MIMC/MOMC) access counter configuration.
typedef struct
{
    // Values used to configure access counters in RM
    struct
    {
        UVM_ACCESS_COUNTER_GRANULARITY granularity;
        UVM_ACCESS_COUNTER_USE_LIMIT use_limit;
    } rm;

    // The following values are precomputed by the access counter notification
    // handling code. See comments for UVM_MAX_TRANSLATION_SIZE in
    // uvm_gpu_access_counters.c for more details.
    NvU64 translation_size;

    NvU64 translations_per_counter;

    NvU64 sub_granularity_region_size;

    NvU64 sub_granularity_regions_per_translation;
} uvm_gpu_access_counter_type_config_t;
505
// Per-GPU state of the HW access counter notification buffer shared with RM.
typedef struct
{
    // Access counter buffer information and structures provided by RM.
    UvmGpuAccessCntrInfo rm_info;

    // Maximum number of notification entries the HW buffer can hold.
    NvU32 max_notifications;

    // Maximum number of notifications to be processed per batch before
    // fetching new entries from the GPU buffer.
    NvU32 max_batch_size;

    // Cached value of the GPU GET register to minimize the round-trips
    // over PCIe
    NvU32 cached_get;

    // Cached value of the GPU PUT register to minimize the round-trips over
    // PCIe
    NvU32 cached_put;

    // Tracker used to aggregate access counters clear operations, needed for
    // GPU removal
    uvm_tracker_t clear_tracker;

    // Current access counter configuration. During normal operation this
    // information is computed once during GPU initialization. However, tests
    // may override it to try different configuration values.
    struct
    {
        uvm_gpu_access_counter_type_config_t mimc;
        uvm_gpu_access_counter_type_config_t momc;

        NvU32 threshold;
    } current_config;

    // Access counter statistics
    struct
    {
        atomic64_t num_pages_out;

        atomic64_t num_pages_in;
    } stats;

    // Ignoring access counters means that notifications are left in the HW
    // buffer without being serviced. Requests to ignore access counters
    // are counted since the suspend path inhibits access counter interrupts,
    // and the resume path needs to know whether to reenable them.
    NvU32 notifications_ignored_count;

    // Context structure used to service a GPU access counter batch
    uvm_access_counter_service_batch_context_t batch_service_context;

    // VA space that reconfigured the access counters configuration, if any.
    // Used in builtin tests only, to avoid reconfigurations from different
    // processes
    //
    // Locking: both readers and writers must hold the access counters ISR lock
    uvm_va_space_t *reconfiguration_owner;
} uvm_access_counter_buffer_info_t;
561
// Kernel identity mapping in the GPU's internal VA space (e.g. flat vidmem,
// sysmem or peer mappings).
typedef struct
{
    // VA where the identity mapping should be mapped in the internal VA
    // space managed by uvm_gpu_t.address_space_tree (see below).
    NvU64 base;

    // Page tables with the mapping.
    uvm_page_table_range_vec_t *range_vec;

    // Used during init to indicate whether the mapping has been fully
    // initialized.
    bool ready;
} uvm_gpu_identity_mapping_t;
575
// Root chunk mapping: kernel mapping of a single vidmem root chunk, used by
// the dynamic flat vidmem mapping (see uvm_gpu_t::root_chunk_mappings).
typedef struct
{
    // Page table range representation of the mapping. Because a root chunk
    // fits into a single 2MB page, in practice the range consists of a single
    // 2MB PTE.
    uvm_page_table_range_t *range;

    // Number of mapped pages of size PAGE_SIZE.
    NvU32 num_mapped_pages;
} uvm_gpu_root_chunk_mapping_t;
587
// Type of the interconnect link between a GPU and another processor
// (CPU or peer GPU).
typedef enum
{
    UVM_GPU_LINK_INVALID = 0,
    UVM_GPU_LINK_PCIE,
    UVM_GPU_LINK_NVLINK_1,
    UVM_GPU_LINK_NVLINK_2,
    UVM_GPU_LINK_NVLINK_3,
    UVM_GPU_LINK_NVLINK_4,
    UVM_GPU_LINK_C2C,
    UVM_GPU_LINK_MAX
} uvm_gpu_link_type_t;
599
// Addressing mode supported by the Copy Engine for peer-to-peer copies.
typedef enum
{
    // Peer copies can be disallowed for a variety of reasons. For example,
    // P2P transfers are disabled in pre-Pascal GPUs because there is no
    // compelling use case for direct peer migrations.
    UVM_GPU_PEER_COPY_MODE_UNSUPPORTED,

    // Pascal+ GPUs support virtual addresses in P2P copies. Virtual peer copies
    // require the creation of peer identity mappings.
    UVM_GPU_PEER_COPY_MODE_VIRTUAL,

    // Ampere+ GPUs support virtual and physical peer copies. Physical peer
    // copies do not depend on peer identity mappings.
    UVM_GPU_PEER_COPY_MODE_PHYSICAL,

    UVM_GPU_PEER_COPY_MODE_COUNT
} uvm_gpu_peer_copy_mode_t;
617
// In order to support SMC/MIG GPU partitions, we split UVM GPUs into two
// parts: parent GPUs (uvm_parent_gpu_t) which represent unique PCIe devices
// (including VFs), and sub/child GPUs (uvm_gpu_t) which represent individual
// partitions within the parent. The parent GPU and partition GPU have
// different "id" and "uuid".
struct uvm_gpu_struct
{
    // Parent device this (possibly partitioned) GPU belongs to.
    uvm_parent_gpu_t *parent;

    // The gpu's GI uuid if SMC is enabled; otherwise, a copy of parent->uuid.
    NvProcessorUuid uuid;

    // Nice printable name in the format:
    // ID: 999: GPU-<parent_uuid> UVM-GI-<gi_uuid>.
    // UVM_GPU_UUID_TEXT_BUFFER_LENGTH includes the null character.
    char name[9 + 2 * UVM_GPU_UUID_TEXT_BUFFER_LENGTH];

    // Refcount of the gpu, i.e. how many times it has been retained. This is
    // roughly a count of how many times it has been registered with a VA space,
    // except that some paths retain the GPU temporarily without a VA space.
    //
    // While this is >0, the GPU can't be removed. This differs from gpu_kref,
    // which merely prevents the uvm_gpu_t object from being freed.
    //
    // In most cases this count is protected by the global lock: retaining a GPU
    // from a UUID and any release require the global lock to be taken. But it's
    // also useful for a caller to retain a GPU they've already retained, in
    // which case there's no need to take the global lock. This can happen when
    // an operation needs to drop the VA space lock but continue operating on a
    // GPU. This is an atomic variable to handle those cases.
    //
    // Security note: keep it as a 64-bit counter to prevent overflow cases (a
    // user can create a lot of va spaces and register the gpu with them).
    atomic64_t retained_count;

    // A unique uvm gpu id in range [1, UVM_ID_MAX_PROCESSORS).
    uvm_gpu_id_t id;

    // Should be UVM_GPU_MAGIC_VALUE. Used for memory checking.
    NvU64 magic;

    struct
    {
        // The amount of memory the GPU has in total, in bytes. If the GPU is in
        // ZeroFB testing mode, this will be 0.
        NvU64 size;

        // Max (inclusive) physical address of this GPU's memory that the driver
        // can allocate through PMM (PMA).
        NvU64 max_allocatable_address;

        // Max supported vidmem page size may be smaller than the max GMMU page
        // size, because of the vMMU supported page sizes.
        NvU64 max_vidmem_page_size;

        struct
        {
            // True if the platform supports HW coherence and the GPU's memory
            // is exposed as a NUMA node to the kernel.
            bool enabled;
            unsigned int node_id;
        } numa;
    } mem_info;

    struct
    {
        // Big page size used by the internal UVM VA space
        // Notably it may be different than the big page size used by a user's
        // VA space in general.
        NvU32 internal_size;
    } big_page;

    // Mapped registers needed to obtain the current GPU timestamp
    struct
    {
        volatile NvU32 *time0_register;
        volatile NvU32 *time1_register;
    } time;

    // Identity peer mappings are only defined when
    // peer_copy_mode == UVM_GPU_PEER_COPY_MODE_VIRTUAL
    uvm_gpu_identity_mapping_t peer_mappings[UVM_ID_MAX_GPUS];

    struct
    {
        // Mask of peer_gpus set
        //
        // We can use a regular processor id because P2P is not allowed between
        // partitioned GPUs when SMC is enabled
        uvm_processor_mask_t peer_gpu_mask;

        // lazily-populated array of peer GPUs, indexed by the peer's GPU index
        uvm_gpu_t *peer_gpus[UVM_ID_MAX_GPUS];

        // Leaf spinlock used to synchronize access to the peer_gpus table so
        // that it can be safely accessed from the access counters bottom half
        uvm_spinlock_t peer_gpus_lock;
    } peer_info;

    // Maximum number of subcontexts supported
    NvU32 max_subcontexts;

    // RM address space handle used in many of the UVM/RM APIs
    // Represents a GPU VA space within rm_device.
    //
    // In SR-IOV heavy, proxy channels are not associated with this address
    // space.
    uvmGpuAddressSpaceHandle rm_address_space;

    // Page tree used for the internal UVM VA space shared with RM
    uvm_page_tree_t address_space_tree;

    // Set to true during add_gpu() as soon as the RM's address space is moved
    // to the address_space_tree.
    bool rm_address_space_moved_to_page_tree;

    // Pool of GPU semaphores used for tracking GPU work.
    uvm_gpu_semaphore_pool_t *semaphore_pool;

    // Pool of GPU semaphores used when Confidential Computing is enabled.
    uvm_gpu_semaphore_pool_t *secure_semaphore_pool;

    // Manager of the GPU's channels.
    uvm_channel_manager_t *channel_manager;

    // Physical memory manager for this GPU's vidmem.
    uvm_pmm_gpu_t pmm;

    // Flat linear mapping covering vidmem. This is a kernel mapping that is
    // only created in certain configurations.
    //
    // There are two mutually exclusive versions of the mapping. The simplest
    // version covers the entire GPU memory, and it is created during GPU
    // initialization. The dynamic version is a partial vidmem mapping that
    // creates and destroys mappings to GPU root chunks on demand.
    union
    {
        // Static mapping covering the whole GPU memory.
        uvm_gpu_identity_mapping_t static_flat_mapping;

        // Dynamic mapping of GPU memory.
        struct
        {
            // Array of root chunk mappings.
            uvm_gpu_root_chunk_mapping_t *array;

            // Number of elements in the array.
            size_t count;

            // Each bit in the bitlock protects a single root chunk mapping.
            uvm_bit_locks_t bitlocks;

        } root_chunk_mappings;
    };

    // Linear sysmem mappings. Mappings are added on demand, and removed upon
    // GPU deinitialization. The mappings are added to UVM's internal address
    // space i.e. they are kernel mappings.
    //
    // Only used in SR-IOV heavy.
    struct
    {
        // Size of each mapping, in bytes.
        NvU64 mapping_size;

        // Array of sysmem mappings.
        uvm_gpu_identity_mapping_t *array;

        // Number of elements in the array.
        size_t count;

        // Each bit in the bitlock protects a sysmem mapping.
        uvm_bit_locks_t bitlocks;
    } sysmem_mappings;

    // Reverse lookup table used to query the user mapping associated with a
    // sysmem (DMA) physical address.
    //
    // The system memory mapping information referred to by this field is
    // different from that of sysmem_mappings, because it relates to user
    // mappings (instead of kernel), and it is used in most configurations.
    uvm_pmm_sysmem_mappings_t pmm_reverse_sysmem_mappings;

    struct
    {
        // Pool of DMA buffers used by Confidential Computing operations.
        uvm_conf_computing_dma_buffer_pool_t dma_buffer_pool;

        // Dummy memory used to store the IV contents during CE encryption.
        // This memory location is also only available after CE channels
        // because we use them to write PTEs for allocations such as this one.
        // This location is used when a physical addressing for the IV buffer
        // is required. See uvm_hal_hopper_ce_encrypt().
        uvm_mem_t *iv_mem;

        // Dummy memory used to store the IV contents during CE encryption.
        // Because of the limitations of `iv_mem', and the need to have such
        // buffer at channel initialization, we use an RM allocation.
        // This location is used when a virtual addressing for the IV buffer
        // is required. See uvm_hal_hopper_ce_encrypt().
        uvm_rm_mem_t *iv_rm_mem;
    } conf_computing;

    // ECC handling
    // In order to trap ECC errors as soon as possible the driver has the hw
    // interrupt register mapped directly. If an ECC interrupt is ever noticed
    // to be pending, then the UVM driver needs to:
    //
    // 1) ask RM to service interrupts, and then
    // 2) inspect the ECC error notifier state.
    //
    // Notably, checking for channel errors is not enough, because ECC errors
    // can be pending, even after a channel has become idle.
    //
    // See more details in uvm_gpu_check_ecc_error().
    struct
    {
        // Does the GPU have ECC enabled?
        bool enabled;

        // Direct mapping of the 32-bit part of the hw interrupt tree that has
        // the ECC bits.
        volatile NvU32 *hw_interrupt_tree_location;

        // Mask to get the ECC interrupt bits from the 32-bits above.
        NvU32 mask;

        // Set to true by RM when a fatal ECC error is encountered (requires
        // asking RM to service pending interrupts to be current).
        NvBool *error_notifier;
    } ecc;

    struct
    {
        // SMC swizzle id of this GPU partition.
        NvU32 swizz_id;

        // RM device handle used in many of the UVM/RM APIs.
        //
        // Do not read this field directly, use uvm_gpu_device_handle instead.
        uvmGpuDeviceHandle rm_device;
    } smc;

    struct
    {
        struct proc_dir_entry *dir;

        struct proc_dir_entry *dir_symlink;

        // The GPU instance UUID symlink if SMC is enabled.
        struct proc_dir_entry *gpu_instance_uuid_symlink;

        struct proc_dir_entry *info_file;

        struct proc_dir_entry *dir_peers;
    } procfs;

    // Placeholder for per-GPU performance heuristics information
    uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];

    // Force pushbuffer's GPU VA to be >= 1TB; used only for testing purposes.
    bool uvm_test_force_upper_pushbuffer_segment;
};
875
876 // In order to support SMC/MIG GPU partitions, we split UVM GPUs into two
877 // parts: parent GPUs (uvm_parent_gpu_t) which represent unique PCIe devices
878 // (including VFs), and sub/child GPUs (uvm_gpu_t) which represent individual
879 // partitions within the parent. The parent GPU and partition GPU have
880 // different "id" and "uuid".
881 struct uvm_parent_gpu_struct
882 {
883 // Reference count for how many places are holding on to a parent GPU
884 // (internal to the UVM driver). This includes any GPUs we know about, not
885 // just GPUs that are registered with a VA space. Most GPUs end up being
886 // registered, but there are brief periods when they are not registered,
887 // such as during interrupt handling, and in add_gpu() or remove_gpu().
888 nv_kref_t gpu_kref;
889
890 // The number of uvm_gpu_ts referencing this uvm_parent_gpu_t.
891 NvU32 num_retained_gpus;
892
893 uvm_gpu_t *gpus[UVM_PARENT_ID_MAX_SUB_PROCESSORS];
894
895 // Bitmap of valid child entries in the gpus[] table. Used to retrieve a
896 // usable child GPU in bottom-halves.
897 DECLARE_BITMAP(valid_gpus, UVM_PARENT_ID_MAX_SUB_PROCESSORS);
898
899 // The gpu's uuid
900 NvProcessorUuid uuid;
901
902 // Nice printable name including the uvm gpu id, ascii name from RM and uuid
903 char name[UVM_GPU_NICE_NAME_BUFFER_LENGTH];
904
905 // GPU information and provided by RM (architecture, implementation,
906 // hardware classes, etc.).
907 UvmGpuInfo rm_info;
908
909 // A unique uvm gpu id in range [1, UVM_PARENT_ID_MAX_PROCESSORS)
910 uvm_parent_gpu_id_t id;
911
912 // Reference to the Linux PCI device
913 //
914 // The reference to the PCI device remains valid as long as the GPU is
915 // registered with RM's Linux layer (between nvUvmInterfaceRegisterGpu() and
916 // nvUvmInterfaceUnregisterGpu()).
917 struct pci_dev *pci_dev;
918
919 // NVLINK Processing Unit (NPU) on PowerPC platforms. The NPU is a
920 // collection of CPU-side PCI devices which bridge GPU NVLINKs and the CPU
921 // memory bus.
922 //
923 // There is one PCI device per NVLINK. A set of NVLINKs connects to a single
924 // GPU, and all NVLINKs for a given socket are collected logically under
925 // this UVM NPU because some resources (such as register mappings) are
926 // shared by all those NVLINKs. This means multiple GPUs may connect to the
927 // same UVM NPU.
928 uvm_ibm_npu_t *npu;
929
930 // On kernels with NUMA support, this entry contains the closest CPU NUMA
931 // node to this GPU. Otherwise, the value will be -1.
932 int closest_cpu_numa_node;
933
934 // RM device handle used in many of the UVM/RM APIs.
935 //
936 // Do not read this field directly, use uvm_gpu_device_handle instead.
937 uvmGpuDeviceHandle rm_device;
938
939 // The physical address range addressable by the GPU
940 //
941 // The GPU has its NV_PFB_XV_UPPER_ADDR register set by RM to
942 // dma_addressable_start (in bifSetupDmaWindow_IMPL()) and hence when
943 // referencing sysmem from the GPU, dma_addressable_start should be
944 // subtracted from the physical address. The DMA mapping helpers like
945 // uvm_parent_gpu_map_cpu_pages() and uvm_parent_gpu_dma_alloc_page() take
946 // care of that.
947 NvU64 dma_addressable_start;
948 NvU64 dma_addressable_limit;
949
950 // Total size (in bytes) of physically mapped (with
951 // uvm_parent_gpu_map_cpu_pages) sysmem pages, used for leak detection.
952 atomic64_t mapped_cpu_pages_size;
953
954 // Hardware Abstraction Layer
955 uvm_host_hal_t *host_hal;
956 uvm_ce_hal_t *ce_hal;
957 uvm_arch_hal_t *arch_hal;
958 uvm_fault_buffer_hal_t *fault_buffer_hal;
959 uvm_access_counter_buffer_hal_t *access_counter_buffer_hal;
960 uvm_sec2_hal_t *sec2_hal;
961
962 // Whether CE supports physical addressing mode for writes to vidmem
963 bool ce_phys_vidmem_write_supported;
964
965 uvm_gpu_peer_copy_mode_t peer_copy_mode;
966
967 // Virtualization mode of the GPU.
968 UVM_VIRT_MODE virt_mode;
969
970 // Pascal+ GPUs can trigger faults on prefetch instructions. If false, this
971 // feature must be disabled at all times in GPUs of the given architecture.
972 // If true, the feature can be toggled at will by SW.
973 //
974 // The field should not be used unless the GPU supports replayable faults.
975 bool prefetch_fault_supported;
976
977 // Number of membars required to flush out HSHUB following a TLB invalidate
978 NvU32 num_hshub_tlb_invalidate_membars;
979
980 // Whether the channels can configure GPFIFO in vidmem
981 bool gpfifo_in_vidmem_supported;
982
983 bool replayable_faults_supported;
984
985 bool non_replayable_faults_supported;
986
987 bool access_counters_supported;
988
989 // If this is true, physical address based access counter notifications are
990 // potentially generated. If false, only virtual address based notifications
991 // are generated (assuming access_counters_supported is true too).
992 bool access_counters_can_use_physical_addresses;
993
994 bool fault_cancel_va_supported;
995
996 // True if the GPU has hardware support for scoped atomics
997 bool scoped_atomics_supported;
998
999 // If true, a HW method can be used to clear a faulted channel.
1000 // If false, then the GPU supports clearing faulted channels using registers
1001 // instead of a HW method.
1002 // This value is only defined for GPUs that support non-replayable faults.
1003 bool has_clear_faulted_channel_method;
1004
1005 // If true, a SW method can be used to clear a faulted channel.
1006 // If false, the HW method or the registers (whichever is available
1007 // according to has_clear_faulted_channel_method) needs to be used.
1008 //
1009 // This value is only defined for GPUs that support non-replayable faults.
1010 bool has_clear_faulted_channel_sw_method;
1011
1012 bool sparse_mappings_supported;
1013
1014 // Ampere(GA100) requires map->invalidate->remap->invalidate for page size
1015 // promotion
1016 bool map_remap_larger_page_promotion;
1017
1018 bool plc_supported;
1019
1020 // If true, page_tree initialization pre-populates no_ats_ranges. It only
1021 // affects ATS systems.
1022 bool no_ats_range_required;
1023
1024 // Parameters used by the TLB batching API
1025 struct
1026 {
1027 // Is the targeted (single page) VA invalidate supported at all?
1028 NvBool va_invalidate_supported;
1029
1030 // Is the VA range invalidate supported?
1031 NvBool va_range_invalidate_supported;
1032
1033 union
1034 {
1035 // Maximum (inclusive) number of single page invalidations before
1036 // falling back to invalidate all
1037 NvU32 max_pages;
1038
1039 // Maximum (inclusive) number of range invalidations before falling
1040 // back to invalidate all
1041 NvU32 max_ranges;
1042 };
1043 } tlb_batch;
1044
1045 // Largest VA (exclusive) which can be used for channel buffer mappings
1046 NvU64 max_channel_va;
1047
1048 // Largest VA (exclusive) which Host can operate.
1049 NvU64 max_host_va;
1050
1051 // Indicates whether the GPU can map sysmem with pages larger than 4k
1052 bool can_map_sysmem_with_large_pages;
1053
1054 // VA base and size of the RM managed part of the internal UVM VA space.
1055 //
1056 // The internal UVM VA is shared with RM by RM controlling some of the top
1057 // level PDEs and leaving the rest for UVM to control.
1058 // On Pascal a single top level PDE covers 128 TB of VA and given that
1059 // semaphores and other allocations limited to 40bit are currently allocated
1060 // through RM, RM needs to control the [0, 128TB) VA range at least for now.
1061 // On Maxwell, limit RMs VA to [0, 128GB) that should easily fit
1062 // all RM allocations and leave enough space for UVM.
1063 NvU64 rm_va_base;
1064 NvU64 rm_va_size;
1065
1066 // Base and size of the GPU VA used for uvm_mem_t allocations mapped in the
1067 // internal address_space_tree.
1068 NvU64 uvm_mem_va_base;
1069 NvU64 uvm_mem_va_size;
1070
1071 // Base of the GPU VAs used for the vidmem and sysmem flat mappings.
1072 NvU64 flat_vidmem_va_base;
1073 NvU64 flat_sysmem_va_base;
1074
1075 // Bitmap of allocation sizes for user memory supported by a GPU. PAGE_SIZE
1076 // is guaranteed to be both present and the smallest size.
1077 uvm_chunk_sizes_mask_t mmu_user_chunk_sizes;
1078
1079 // Bitmap of allocation sizes that could be requested by the page tree for
1080 // a GPU
1081 uvm_chunk_sizes_mask_t mmu_kernel_chunk_sizes;
1082
1083 struct
1084 {
1085 struct proc_dir_entry *dir;
1086
1087 struct proc_dir_entry *fault_stats_file;
1088
1089 struct proc_dir_entry *access_counters_file;
1090 } procfs;
1091
1092 // Interrupt handling state and locks
1093 uvm_isr_info_t isr;
1094
    // Fault buffer info. This is only valid if replayable_faults_supported is
    // set to true.
1097 uvm_fault_buffer_info_t fault_buffer_info;
1098
1099 // PMM lazy free processing queue.
1100 // TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
1101 nv_kthread_q_t lazy_free_q;
1102
    // Access counter buffer info. This is only valid if
    // access_counters_supported is set to true.
1105 uvm_access_counter_buffer_info_t access_counter_buffer_info;
1106
1107 // Number of uTLBs per GPC. This information is only valid on Pascal+ GPUs.
1108 NvU32 utlb_per_gpc_count;
1109
1110 // In order to service GPU faults, UVM must be able to obtain the VA
1111 // space for each reported fault. The fault packet contains the
1112 // instance_ptr of the channel that was bound when the SMs triggered
1113 // the fault. On fault any instance pointer in the TSG may be
    // reported. This is a problem on Volta, which allows different channels
1115 // in the TSG to be bound to different VA spaces in order to support
1116 // subcontexts. In order to be able to obtain the correct VA space, HW
1117 // provides the subcontext id (or VEID) in addition to the instance_ptr.
1118 //
1119 // Summary:
1120 //
1121 // 1) Channels in a TSG may be in different VA spaces, identified by their
1122 // subcontext ID.
1123 // 2) Different subcontext IDs may map to the same or different VA spaces.
1124 // 3) On fault, any instance pointer in the TSG may be reported. The
1125 // reported subcontext ID identifies which VA space within the TSG actually
1126 // encountered the fault.
1127 //
1128 // Thus, UVM needs to keep track of all the instance pointers that belong
1129 // to the same TSG. We use two tables:
1130 //
1131 // - instance_ptr_table (instance_ptr -> subctx_info) this table maps
1132 // instance pointers to the subcontext info descriptor for the channel. If
1133 // the channel belongs to a subcontext, this descriptor will contain all
1134 // the VA spaces for the subcontexts in the same TSG. If the channel does
1135 // not belong to a subcontext, it will only contain a pointer to its VA
1136 // space.
1137 // - tsg_table (tsg_id -> subctx_info): this table also stores the
1138 // subctx information, but in this case it is indexed by TSG ID. Thus,
1139 // when a new channel bound to a subcontext is registered, it will check
1140 // first in this table if the subcontext information descriptor for its TSG
1141 // already exists, otherwise it will create it. Channels not bound to
1142 // subcontexts will not use this table.
1143 //
1144 // The bottom half reads the tables under
1145 // isr.replayable_faults_handler.lock, but a separate lock is necessary
1146 // because entries are added and removed from the table under the va_space
1147 // lock, and we can't take isr.replayable_faults_handler.lock while holding
1148 // the va_space lock.
1149 uvm_rb_tree_t tsg_table;
1150
1151 uvm_rb_tree_t instance_ptr_table;
1152 uvm_spinlock_t instance_ptr_table_lock;
1153
1154 // This is set to true if the GPU belongs to an SLI group.
1155 bool sli_enabled;
1156
1157 struct
1158 {
1159 bool supported;
1160
1161 bool enabled;
1162 } smc;
1163
1164 // Global statistics. These fields are per-GPU and most of them are only
1165 // updated during fault servicing, and can be safely incremented.
1166 struct
1167 {
1168 NvU64 num_replayable_faults;
1169
1170 NvU64 num_non_replayable_faults;
1171
1172 atomic64_t num_pages_out;
1173
1174 atomic64_t num_pages_in;
1175 } stats;
1176
1177 // Structure to hold nvswitch specific information. In an nvswitch
1178 // environment, rather than using the peer-id field of the PTE (which can
1179 // only address 8 gpus), all gpus are assigned a 47-bit physical address
1180 // space by the fabric manager. Any physical address access to these
1181 // physical address spaces are routed through the switch to the
1182 // corresponding peer.
1183 struct
1184 {
1185 bool is_nvswitch_connected;
1186
1187 // 47-bit fabric memory physical offset that peer gpus need to access
1188 // to read a peer's memory
1189 NvU64 fabric_memory_window_start;
1190 } nvswitch_info;
1191
1192 struct
1193 {
1194 // Note that this represents the link to system memory, not the link the
1195 // system used to discover the GPU. There are some cases such as NVLINK2
1196 // where the GPU is still on the PCIe bus, but it accesses memory over
1197 // this link rather than PCIe.
1198 uvm_gpu_link_type_t link;
1199 NvU32 link_rate_mbyte_per_s;
1200
1201 // Range in the system physical address space where the memory of this
1202 // GPU is exposed as coherent. memory_window_end is inclusive.
1203 // memory_window_start == memory_window_end indicates that no window is
1204 // present (coherence is not supported).
1205 NvU64 memory_window_start;
1206 NvU64 memory_window_end;
1207 } system_bus;
1208
1209 // WAR to issue ATS TLB invalidation commands ourselves.
1210 struct
1211 {
1212 uvm_mutex_t smmu_lock;
1213 struct page *smmu_cmdq;
1214 void __iomem *smmu_cmdqv_base;
1215 unsigned long smmu_prod;
1216 unsigned long smmu_cons;
1217 } smmu_war;
1218 };
1219
// Printable name of the parent GPU (includes the uvm gpu id, the ASCII name
// from RM, and the uuid; see the name field of uvm_parent_gpu_t).
static const char *uvm_parent_gpu_name(uvm_parent_gpu_t *parent_gpu)
{
    const char *printable_name = parent_gpu->name;

    return printable_name;
}
1224
// Printable name of the GPU. NOTE(review): presumably formatted like the
// parent GPU's name field — confirm where gpu->name is filled in.
static const char *uvm_gpu_name(uvm_gpu_t *gpu)
{
    const char *printable_name = gpu->name;

    return printable_name;
}
1229
uvm_gpu_device_handle(uvm_gpu_t * gpu)1230 static uvmGpuDeviceHandle uvm_gpu_device_handle(uvm_gpu_t *gpu)
1231 {
1232 if (gpu->parent->smc.enabled)
1233 return gpu->smc.rm_device;
1234 return gpu->parent->rm_device;
1235 }
1236
// Peer access state shared between a pair of GPUs.
struct uvm_gpu_peer_struct
{
    // The fields in this global structure can only be inspected under one of
    // the following conditions:
    //
    // - The VA space lock is held for either read or write, both GPUs are
    // registered in the VA space, and the corresponding bit in the
    // va_space.enabled_peers bitmap is set.
    //
    // - The global lock is held.
    //
    // - While the global lock was held in the past, the two GPUs were detected
    // to be SMC peers and were both retained.
    //
    // - While the global lock was held in the past, the two GPUs were detected
    // to be NVLINK peers and were both retained.
    //
    // - While the global lock was held in the past, the two GPUs were detected
    // to be PCIe peers and uvm_gpu_retain_pcie_peer_access() was called.
    //
    // - The peer_gpus_lock is held on one of the GPUs. In this case, the other
    // GPU must be read from the original GPU's peer_gpus table. The fields
    // will not change while the lock is held, but they may no longer be valid
    // because the other GPU might be in teardown.

    // Peer Id associated with this device w.r.t. a peer GPU.
    // Note: peerId (A -> B) != peerId (B -> A)
    // peer_id[0] from min(gpu_id_1, gpu_id_2) -> max(gpu_id_1, gpu_id_2)
    // peer_id[1] from max(gpu_id_1, gpu_id_2) -> min(gpu_id_1, gpu_id_2)
    NvU8 peer_ids[2];

    // Indirect peers are GPUs which can coherently access each others' memory
    // over NVLINK, but are routed through the CPU using the SYS aperture rather
    // than a PEER aperture
    NvU8 is_indirect_peer : 1;

    // The link type between the peer GPUs, currently either PCIe or NVLINK.
    // This field is used to determine when this peer struct has been
    // initialized (link_type != UVM_GPU_LINK_INVALID). NVLink peers are
    // initialized at GPU registration time. PCIe peers are initialized when
    // the refcount below goes from 0 to 1.
    uvm_gpu_link_type_t link_type;

    // Maximum unidirectional bandwidth between the peers in megabytes per
    // second, not taking into account the protocols' overhead. The reported
    // bandwidth for indirect peers is zero. See UvmGpuP2PCapsParams.
    NvU32 total_link_line_rate_mbyte_per_s;

    // For PCIe, the number of times that this has been retained by a VA space.
    // For NVLINK this will always be 1.
    NvU64 ref_count;

    // This handle gets populated when enable_peer_access successfully creates
    // an NV50_P2P object. disable_peer_access resets the same on the object
    // deletion.
    NvHandle p2p_handle;

    struct
    {
        struct proc_dir_entry *peer_file[2];
        struct proc_dir_entry *peer_symlink_file[2];

        // GPU-A <-> GPU-B link is bidirectional, pairs[x][0] is always the
        // local GPU, while pairs[x][1] is the remote GPU. The table shall be
        // filled like so: [[GPU-A, GPU-B], [GPU-B, GPU-A]].
        uvm_gpu_t *pairs[2][2];
    } procfs;
};
1305
1306 // Initialize global gpu state
1307 NV_STATUS uvm_gpu_init(void);
1308
1309 // Deinitialize global state (called from module exit)
1310 void uvm_gpu_exit(void);
1311
1312 NV_STATUS uvm_gpu_init_va_space(uvm_va_space_t *va_space);
1313
1314 void uvm_gpu_exit_va_space(uvm_va_space_t *va_space);
1315
// NUMA node id of this GPU's memory. Only valid when the GPU's memory is
// exposed as a NUMA node (mem_info.numa.enabled).
static unsigned int uvm_gpu_numa_node(uvm_gpu_t *gpu)
{
    unsigned int node_id = gpu->mem_info.numa.node_id;

    UVM_ASSERT(gpu->mem_info.numa.enabled);

    return node_id;
}
1321
uvm_gpu_page_to_phys_address(uvm_gpu_t * gpu,struct page * page)1322 static uvm_gpu_phys_address_t uvm_gpu_page_to_phys_address(uvm_gpu_t *gpu, struct page *page)
1323 {
1324 unsigned long sys_addr = page_to_pfn(page) << PAGE_SHIFT;
1325 unsigned long gpu_offset = sys_addr - gpu->parent->system_bus.memory_window_start;
1326
1327 UVM_ASSERT(page_to_nid(page) == uvm_gpu_numa_node(gpu));
1328 UVM_ASSERT(sys_addr >= gpu->parent->system_bus.memory_window_start);
1329 UVM_ASSERT(sys_addr + PAGE_SIZE - 1 <= gpu->parent->system_bus.memory_window_end);
1330
1331 return uvm_gpu_phys_address(UVM_APERTURE_VID, gpu_offset);
1332 }
1333
1334 // Note that there is a uvm_gpu_get() function defined in uvm_global.h to break
1335 // a circular dep between global and gpu modules.
1336
1337 // Get a uvm_gpu_t by UUID (physical GPU UUID if SMC is not enabled, otherwise
1338 // GPU instance UUID).
1339 // This returns NULL if the GPU is not present.
1340 // This is the general purpose call that should be used normally.
1341 //
1342 // LOCKING: requires the global lock to be held
1343 uvm_gpu_t *uvm_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid);
1344
1345 // Get a uvm_parent_gpu_t by UUID (physical GPU UUID).
1346 // Like uvm_gpu_get_by_uuid(), this function returns NULL if the GPU has not
1347 // been registered.
1348 //
1349 // LOCKING: requires the global lock to be held
1350 uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid);
1351
1352 // Like uvm_parent_gpu_get_by_uuid(), but this variant does not assertion-check
1353 // that the caller is holding the global_lock. This is a narrower-purpose
1354 // function, and is only intended for use by the top-half ISR, or other very
1355 // limited cases.
1356 uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid_locked(const NvProcessorUuid *gpu_uuid);
1357
1358 // Retain a gpu by uuid
1359 // Returns the retained uvm_gpu_t in gpu_out on success
1360 //
1361 // LOCKING: Takes and releases the global lock for the caller.
1362 NV_STATUS uvm_gpu_retain_by_uuid(const NvProcessorUuid *gpu_uuid,
1363 const uvm_rm_user_object_t *user_rm_device,
1364 uvm_gpu_t **gpu_out);
1365
1366 // Retain a gpu which is known to already be retained. Does NOT require the
1367 // global lock to be held.
1368 void uvm_gpu_retain(uvm_gpu_t *gpu);
1369
1370 // Release a gpu
1371 // LOCKING: requires the global lock to be held
1372 void uvm_gpu_release_locked(uvm_gpu_t *gpu);
1373
1374 // Like uvm_gpu_release_locked, but takes and releases the global lock for the
1375 // caller.
1376 void uvm_gpu_release(uvm_gpu_t *gpu);
1377
uvm_gpu_retained_count(uvm_gpu_t * gpu)1378 static NvU64 uvm_gpu_retained_count(uvm_gpu_t *gpu)
1379 {
1380 return atomic64_read(&gpu->retained_count);
1381 }
1382
1383 // Decrease the refcount on the parent GPU object, and actually delete the object
1384 // if the refcount hits zero.
1385 void uvm_parent_gpu_kref_put(uvm_parent_gpu_t *gpu);
1386
1387 // Calculates peer table index using GPU ids.
1388 NvU32 uvm_gpu_peer_table_index(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
1389
1390 // Either retains an existing PCIe peer entry or creates a new one. In both
1391 // cases the two GPUs are also each retained.
1392 // LOCKING: requires the global lock to be held
1393 NV_STATUS uvm_gpu_retain_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
1394
1395 // Releases a PCIe peer entry and the two GPUs.
1396 // LOCKING: requires the global lock to be held
1397 void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
1398
1399 // Get the aperture for local_gpu to use to map memory resident on remote_gpu.
1400 // They must not be the same gpu.
1401 uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu);
1402
1403 // Get the processor id accessible by the given GPU for the given physical
1404 // address.
1405 uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);
1406
1407 // Get the P2P capabilities between the gpus with the given indexes
1408 uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
1409
1410 // Get the P2P capabilities between the given gpus
uvm_gpu_peer_caps(const uvm_gpu_t * gpu0,const uvm_gpu_t * gpu1)1411 static uvm_gpu_peer_t *uvm_gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
1412 {
1413 return uvm_gpu_index_peer_caps(gpu0->id, gpu1->id);
1414 }
1415
uvm_gpus_are_nvswitch_connected(const uvm_gpu_t * gpu0,const uvm_gpu_t * gpu1)1416 static bool uvm_gpus_are_nvswitch_connected(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
1417 {
1418 if (gpu0->parent->nvswitch_info.is_nvswitch_connected && gpu1->parent->nvswitch_info.is_nvswitch_connected) {
1419 UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type >= UVM_GPU_LINK_NVLINK_2);
1420 return true;
1421 }
1422
1423 return false;
1424 }
1425
uvm_gpus_are_indirect_peers(uvm_gpu_t * gpu0,uvm_gpu_t * gpu1)1426 static bool uvm_gpus_are_indirect_peers(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
1427 {
1428 uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
1429
1430 if (peer_caps->link_type != UVM_GPU_LINK_INVALID && peer_caps->is_indirect_peer) {
1431 UVM_ASSERT(gpu0->mem_info.numa.enabled);
1432 UVM_ASSERT(gpu1->mem_info.numa.enabled);
1433 UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_PCIE);
1434 UVM_ASSERT(!uvm_gpus_are_nvswitch_connected(gpu0, gpu1));
1435 return true;
1436 }
1437
1438 return false;
1439 }
1440
1441 // Retrieve the virtual address corresponding to the given vidmem physical
1442 // address, according to the linear vidmem mapping in the GPU kernel address
1443 // space.
1444 //
1445 // The actual GPU mapping only exists if a full flat mapping, or a partial flat
1446 // mapping covering the passed address, has been previously created.
uvm_gpu_address_virtual_from_vidmem_phys(uvm_gpu_t * gpu,NvU64 pa)1447 static uvm_gpu_address_t uvm_gpu_address_virtual_from_vidmem_phys(uvm_gpu_t *gpu, NvU64 pa)
1448 {
1449 UVM_ASSERT(uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent) ||
1450 uvm_mmu_parent_gpu_needs_dynamic_vidmem_mapping(gpu->parent));
1451 UVM_ASSERT(pa <= gpu->mem_info.max_allocatable_address);
1452
1453 if (uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent))
1454 UVM_ASSERT(gpu->static_flat_mapping.ready);
1455
1456 return uvm_gpu_address_virtual(gpu->parent->flat_vidmem_va_base + pa);
1457 }
1458
1459 // Retrieve the virtual address corresponding to the given sysmem physical
1460 // address, according to the linear sysmem mapping in the GPU kernel address
1461 // space.
1462 //
1463 // The actual GPU mapping only exists if a linear mapping covering the passed
1464 // address has been previously created.
uvm_parent_gpu_address_virtual_from_sysmem_phys(uvm_parent_gpu_t * parent_gpu,NvU64 pa)1465 static uvm_gpu_address_t uvm_parent_gpu_address_virtual_from_sysmem_phys(uvm_parent_gpu_t *parent_gpu, NvU64 pa)
1466 {
1467 UVM_ASSERT(uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(parent_gpu));
1468 UVM_ASSERT(pa <= (parent_gpu->dma_addressable_limit - parent_gpu->dma_addressable_start));
1469
1470 return uvm_gpu_address_virtual(parent_gpu->flat_sysmem_va_base + pa);
1471 }
1472
1473 // Given a GPU or CPU physical address (not peer), retrieve an address suitable
1474 // for CE access.
uvm_gpu_address_copy(uvm_gpu_t * gpu,uvm_gpu_phys_address_t phys_addr)1475 static uvm_gpu_address_t uvm_gpu_address_copy(uvm_gpu_t *gpu, uvm_gpu_phys_address_t phys_addr)
1476 {
1477 UVM_ASSERT(phys_addr.aperture == UVM_APERTURE_VID || phys_addr.aperture == UVM_APERTURE_SYS);
1478
1479 if (phys_addr.aperture == UVM_APERTURE_VID) {
1480 if (uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent) ||
1481 uvm_mmu_parent_gpu_needs_dynamic_vidmem_mapping(gpu->parent))
1482 return uvm_gpu_address_virtual_from_vidmem_phys(gpu, phys_addr.address);
1483 }
1484 else if (uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(gpu->parent)) {
1485 return uvm_parent_gpu_address_virtual_from_sysmem_phys(gpu->parent, phys_addr.address);
1486 }
1487
1488 return uvm_gpu_address_from_phys(phys_addr);
1489 }
1490
uvm_gpu_get_peer_mapping(uvm_gpu_t * gpu,uvm_gpu_id_t peer_id)1491 static uvm_gpu_identity_mapping_t *uvm_gpu_get_peer_mapping(uvm_gpu_t *gpu, uvm_gpu_id_t peer_id)
1492 {
1493 return &gpu->peer_mappings[uvm_id_gpu_index(peer_id)];
1494 }
1495
1496 // Check for ECC errors
1497 //
1498 // Notably this check cannot be performed where it's not safe to call into RM.
1499 NV_STATUS uvm_gpu_check_ecc_error(uvm_gpu_t *gpu);
1500
1501 // Check for ECC errors without calling into RM
1502 //
1503 // Calling into RM is problematic in many places, this check is always safe to
1504 // do. Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an ECC error
1505 // and it's required to call uvm_gpu_check_ecc_error() to be sure.
1506 NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu);
1507
1508 // Map size bytes of contiguous sysmem on the GPU for physical access
1509 //
1510 // size has to be aligned to PAGE_SIZE.
1511 //
1512 // Returns the physical address of the pages that can be used to access them on
1513 // the GPU.
1514 NV_STATUS uvm_parent_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu, struct page *page, size_t size, NvU64 *dma_address_out);
1515
1516 // Unmap num_pages pages previously mapped with uvm_parent_gpu_map_cpu_pages().
1517 void uvm_parent_gpu_unmap_cpu_pages(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address, size_t size);
1518
uvm_parent_gpu_map_cpu_page(uvm_parent_gpu_t * parent_gpu,struct page * page,NvU64 * dma_address_out)1519 static NV_STATUS uvm_parent_gpu_map_cpu_page(uvm_parent_gpu_t *parent_gpu, struct page *page, NvU64 *dma_address_out)
1520 {
1521 return uvm_parent_gpu_map_cpu_pages(parent_gpu, page, PAGE_SIZE, dma_address_out);
1522 }
1523
// Single-page convenience wrapper around uvm_parent_gpu_unmap_cpu_pages().
static void uvm_parent_gpu_unmap_cpu_page(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address)
{
    uvm_parent_gpu_unmap_cpu_pages(parent_gpu, dma_address, PAGE_SIZE);
}
1528
1529 // Allocate and map a page of system DMA memory on the GPU for physical access
1530 //
1531 // Returns
1532 // - the address of the page that can be used to access them on
1533 // the GPU in the dma_address_out parameter.
1534 // - the address of allocated memory in CPU virtual address space.
1535 void *uvm_parent_gpu_dma_alloc_page(uvm_parent_gpu_t *parent_gpu,
1536 gfp_t gfp_flags,
1537 NvU64 *dma_address_out);
1538
// Unmap and free a page of sysmem DMA previously allocated with
// uvm_parent_gpu_dma_alloc_page().
1541 void uvm_parent_gpu_dma_free_page(uvm_parent_gpu_t *parent_gpu, void *va, NvU64 dma_address);
1542
1543 // Returns whether the given range is within the GPU's addressable VA ranges.
1544 // It requires the input 'addr' to be in canonical form for platforms compliant
1545 // to canonical form addresses, i.e., ARM64, and x86.
1546 // Warning: This only checks whether the GPU's MMU can support the given
1547 // address. Some HW units on that GPU might only support a smaller range.
1548 //
1549 // The GPU must be initialized before calling this function.
1550 bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
1551
1552 // Returns whether the given range is within the GPU's addressable VA ranges in
1553 // the internal GPU VA "kernel" address space, which is a linear address space.
// Therefore, the input 'addr' must not be in canonical form, even on
// platforms that use canonical form addresses, i.e., ARM64, and x86.
1556 // Warning: This only checks whether the GPU's MMU can support the given
1557 // address. Some HW units on that GPU might only support a smaller range.
1558 //
1559 // The GPU must be initialized before calling this function.
1560 bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
1561
1562 bool uvm_platform_uses_canonical_form_address(void);
1563
1564 // Returns addr's canonical form for host systems that use canonical form
1565 // addresses.
1566 NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);
1567
uvm_parent_gpu_is_coherent(const uvm_parent_gpu_t * parent_gpu)1568 static bool uvm_parent_gpu_is_coherent(const uvm_parent_gpu_t *parent_gpu)
1569 {
1570 return parent_gpu->system_bus.memory_window_end > parent_gpu->system_bus.memory_window_start;
1571 }
1572
uvm_parent_gpu_needs_pushbuffer_segments(uvm_parent_gpu_t * parent_gpu)1573 static bool uvm_parent_gpu_needs_pushbuffer_segments(uvm_parent_gpu_t *parent_gpu)
1574 {
1575 return parent_gpu->max_host_va > (1ull << 40);
1576 }
1577
uvm_parent_gpu_supports_eviction(uvm_parent_gpu_t * parent_gpu)1578 static bool uvm_parent_gpu_supports_eviction(uvm_parent_gpu_t *parent_gpu)
1579 {
1580 // Eviction is supported only if the GPU supports replayable faults
1581 return parent_gpu->replayable_faults_supported;
1582 }
1583
uvm_parent_gpu_is_virt_mode_sriov_heavy(const uvm_parent_gpu_t * parent_gpu)1584 static bool uvm_parent_gpu_is_virt_mode_sriov_heavy(const uvm_parent_gpu_t *parent_gpu)
1585 {
1586 return parent_gpu->virt_mode == UVM_VIRT_MODE_SRIOV_HEAVY;
1587 }
1588
uvm_parent_gpu_is_virt_mode_sriov_standard(const uvm_parent_gpu_t * parent_gpu)1589 static bool uvm_parent_gpu_is_virt_mode_sriov_standard(const uvm_parent_gpu_t *parent_gpu)
1590 {
1591 return parent_gpu->virt_mode == UVM_VIRT_MODE_SRIOV_STANDARD;
1592 }
1593
1594 // Returns true if the virtualization mode is SR-IOV heavy or SR-IOV standard.
uvm_parent_gpu_is_virt_mode_sriov(const uvm_parent_gpu_t * parent_gpu)1595 static bool uvm_parent_gpu_is_virt_mode_sriov(const uvm_parent_gpu_t *parent_gpu)
1596 {
1597 return uvm_parent_gpu_is_virt_mode_sriov_heavy(parent_gpu) ||
1598 uvm_parent_gpu_is_virt_mode_sriov_standard(parent_gpu);
1599 }
1600
uvm_parent_gpu_needs_proxy_channel_pool(const uvm_parent_gpu_t * parent_gpu)1601 static bool uvm_parent_gpu_needs_proxy_channel_pool(const uvm_parent_gpu_t *parent_gpu)
1602 {
1603 return uvm_parent_gpu_is_virt_mode_sriov_heavy(parent_gpu);
1604 }
1605
1606 uvm_aperture_t uvm_get_page_tree_location(const uvm_parent_gpu_t *parent_gpu);
1607
1608 // Debug print of GPU properties
1609 void uvm_gpu_print(uvm_gpu_t *gpu);
1610
1611 // Add the given instance pointer -> user_channel mapping to this GPU. The
1612 // bottom half GPU page fault handler uses this to look up the VA space for GPU
1613 // faults.
1614 NV_STATUS uvm_parent_gpu_add_user_channel(uvm_parent_gpu_t *parent_gpu, uvm_user_channel_t *user_channel);
1615 void uvm_parent_gpu_remove_user_channel(uvm_parent_gpu_t *parent_gpu, uvm_user_channel_t *user_channel);
1616
1617 // Looks up an entry added by uvm_gpu_add_user_channel. Return codes:
1618 // NV_OK Translation successful
1619 // NV_ERR_INVALID_CHANNEL Entry's instance pointer was not found
1620 // NV_ERR_PAGE_TABLE_NOT_AVAIL Entry's instance pointer is valid but the entry
1621 // targets an invalid subcontext
1622 //
1623 // out_va_space is valid if NV_OK is returned, otherwise it's NULL. The caller
// is responsible for ensuring that the returned va_space can't be destroyed,
1625 // so these functions should only be called from the bottom half.
1626 NV_STATUS uvm_parent_gpu_fault_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
1627 uvm_fault_buffer_entry_t *fault,
1628 uvm_va_space_t **out_va_space);
1629
1630 NV_STATUS uvm_parent_gpu_access_counter_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
1631 uvm_access_counter_buffer_entry_t *entry,
1632 uvm_va_space_t **out_va_space);
1633
// Flush modes for GPU notification buffers.
//
// NOTE(review): per-value semantics inferred from the names — CACHED_PUT uses
// the cached PUT pointer as-is, UPDATE_PUT re-reads PUT from HW, and
// WAIT_UPDATE_PUT additionally waits on it. Confirm against the buffer flush
// implementations.
typedef enum
{
    UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT,
    UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
    UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
} uvm_gpu_buffer_flush_mode_t;
1640
1641 #endif // __UVM_GPU_H__
1642