/*******************************************************************************
    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#ifndef __UVM_GPU_H__
#define __UVM_GPU_H__

#include "nvtypes.h"
#include "nvmisc.h"
#include "uvm_types.h"
#include "nv_uvm_types.h"
#include "uvm_linux.h"
#include "nv-kref.h"
#include "uvm_common.h"
#include "ctrl2080mc.h"
#include "uvm_forward_decl.h"
#include "uvm_processors.h"
#include "uvm_pmm_gpu.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_mmu.h"
#include "uvm_gpu_replayable_faults.h"
#include "uvm_gpu_isr.h"
#include "uvm_hal_types.h"
#include "uvm_hmm.h"
#include "uvm_va_block_types.h"
#include "uvm_perf_module.h"
#include "uvm_rb_tree.h"
#include "uvm_perf_prefetch.h"
#include "nv-kthread-q.h"
#include <linux/mmu_notifier.h>
#include "uvm_conf_computing.h"

// Buffer length to store uvm gpu id, RM device name and gpu uuid.
#define UVM_GPU_NICE_NAME_BUFFER_LENGTH (sizeof("ID 999: : ") + \
            UVM_GPU_NAME_LENGTH + UVM_GPU_UUID_TEXT_BUFFER_LENGTH)

#define UVM_GPU_MAGIC_VALUE 0xc001d00d12341993ULL
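
// Illustrative use of the magic value for memory checking (a sketch; the
// actual assignment and assertions live in the .c files):
//
//     gpu->magic = UVM_GPU_MAGIC_VALUE;                // on allocation
//     UVM_ASSERT(gpu->magic == UVM_GPU_MAGIC_VALUE);   // on later accesses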

typedef struct
{
    // Number of faults from this uTLB that have been fetched but have not been
    // serviced yet.
    NvU32 num_pending_faults;

    // Whether the uTLB contains fatal faults
    bool has_fatal_faults;

    // We have issued a replay of type START_ACK_ALL while containing fatal
    // faults. This puts the uTLB in lockdown mode and no new translations are
    // accepted.
    bool in_lockdown;

    // We have issued a cancel on this uTLB
    bool cancelled;

    uvm_fault_buffer_entry_t prev_fatal_fault;

    // Last fetched fault that originated from this uTLB. Used for fault
    // filtering.
    uvm_fault_buffer_entry_t *last_fault;
} uvm_fault_utlb_info_t;

struct uvm_service_block_context_struct
{
    //
    // Fields initialized by CPU/GPU fault handling and access counter routines
    //

    // Whether the information refers to replayable/non-replayable faults or
    // access counters
    uvm_service_operation_t operation;

    // Processors that will be the residency of pages after the operation has
    // been serviced
    uvm_processor_mask_t resident_processors;

    // VA block region that contains all the pages affected by the operation
    uvm_va_block_region_t region;

    // Array of type uvm_fault_access_type_t that contains the type of the
    // access that caused the fault/access_counter notification to be serviced
    // for each page.
    NvU8 access_type[PAGES_PER_UVM_VA_BLOCK];

    // Number of times the service operation has been retried
    unsigned num_retries;

    // Pages that need to be pinned due to thrashing
    uvm_page_mask_t thrashing_pin_mask;

    // Number of pages that need to be pinned due to thrashing. This is the
    // same value as the result of bitmap_weight(thrashing_pin_mask)
    unsigned thrashing_pin_count;

    // Pages that can be read-duplicated
    uvm_page_mask_t read_duplicate_mask;

    // Number of pages that can be read-duplicated. This is the same value as
    // the result of bitmap_weight(read_duplicate_mask)
    unsigned read_duplicate_count;

    //
    // Fields used by the CPU fault handling routine
    //

    struct
    {
        // Node of the list of fault service contexts used by the CPU
        struct list_head service_context_list;

        // A mask of GPUs that need to be checked for ECC errors before the CPU
        // fault handler returns, but after the VA space lock has been unlocked
        // to avoid the RM/UVM VA space lock deadlocks.
        uvm_processor_mask_t gpus_to_check_for_ecc;

        // This is set to throttle page fault thrashing.
        NvU64 wakeup_time_stamp;

        // This is set if the page migrated to/from the GPU and CPU.
        bool did_migrate;

        // Sequence number used to start a mmu notifier read side critical
        // section.
        unsigned long notifier_seq;

        struct vm_fault *vmf;
    } cpu_fault;

    //
    // Fields managed by the common operation servicing routine
    //

    uvm_prot_page_mask_array_t mappings_by_prot;

    // Mask with the pages that did not migrate to the processor (they were
    // already resident) in the last call to uvm_va_block_make_resident.
    // This is used to compute the pages whose mapping permissions need to be
    // revoked from other processors.
    uvm_page_mask_t did_not_migrate_mask;

    // Pages whose permissions need to be revoked from other processors
    uvm_page_mask_t revocation_mask;

    struct
    {
        // Per-processor mask with the pages that will be resident after
        // servicing. We need one mask per processor because we may coalesce
        // faults that trigger migrations to different processors.
        uvm_page_mask_t new_residency;
    } per_processor_masks[UVM_ID_MAX_PROCESSORS];

    // State used by the VA block routines called by the servicing routine
    uvm_va_block_context_t *block_context;

    // Prefetch state hint
    uvm_perf_prefetch_hint_t prefetch_hint;

    // Prefetch temporary state.
    uvm_perf_prefetch_bitmap_tree_t prefetch_bitmap_tree;
};
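
// Sketch of the invariant between the masks and their counts above
// (illustrative; assumes uvm_page_mask_weight() as the page-mask wrapper over
// bitmap_weight()):
//
//     context->thrashing_pin_count =
//         uvm_page_mask_weight(&context->thrashing_pin_mask);
//     context->read_duplicate_count =
//         uvm_page_mask_weight(&context->read_duplicate_mask);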

typedef struct
{
    // Mask of read faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a
    // SAM VMA. Used for batching ATS faults in a VMA. This is unused for
    // access counter service requests.
    uvm_page_mask_t read_fault_mask;

    // Mask of write faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a
    // SAM VMA. Used for batching ATS faults in a VMA. This is unused for
    // access counter service requests.
    uvm_page_mask_t write_fault_mask;

    // Mask of successfully serviced pages in a UVM_VA_BLOCK_SIZE aligned
    // region of a SAM VMA. Used to return ATS fault status. This is unused for
    // access counter service requests.
    uvm_page_mask_t faults_serviced_mask;

    // Mask of successfully serviced read faults on pages in write_fault_mask.
    // This is unused for access counter service requests.
    uvm_page_mask_t reads_serviced_mask;

    // Mask of all accessed pages in a UVM_VA_BLOCK_SIZE aligned region of a
    // SAM VMA. This is used as input for access counter service requests and
    // output of fault service requests.
    uvm_page_mask_t accessed_mask;

    // Client type of the service requestor.
    uvm_fault_client_type_t client_type;

    // New residency ID of the faulting region.
    uvm_processor_id_t residency_id;

    // New residency NUMA node ID of the faulting region.
    int residency_node;

    struct
    {
        // True if preferred_location was set on this faulting region.
        // The UVM_VA_BLOCK_SIZE sized region in the faulting region bound by
        // the VMA is prefetched if preferred_location was set and first_touch
        // is true.
        bool has_preferred_location;

        // True if the UVM_VA_BLOCK_SIZE sized region isn't resident on any
        // node. False if any page in the region is resident somewhere.
        bool first_touch;

        // Mask of prefetched pages in a UVM_VA_BLOCK_SIZE aligned region of a
        // SAM VMA.
        uvm_page_mask_t prefetch_pages_mask;

        // PFN info of the faulting region
        unsigned long pfns[PAGES_PER_UVM_VA_BLOCK];

        // Faulting/preferred processor residency mask of the faulting region.
        uvm_page_mask_t residency_mask;

#if defined(NV_MMU_INTERVAL_NOTIFIER)
        // MMU notifier used to compute residency of this faulting region.
        struct mmu_interval_notifier notifier;
#endif

        uvm_va_space_t *va_space;

        // Prefetch temporary state.
        uvm_perf_prefetch_bitmap_tree_t bitmap_tree;
    } prefetch_state;

} uvm_ats_fault_context_t;
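
// Illustrative batching of ATS faults into the masks above (a sketch with
// hypothetical locals page_index and access_type, not the driver's code):
//
//     uvm_page_mask_set(&ats_context->accessed_mask, page_index);
//     if (access_type >= UVM_FAULT_ACCESS_TYPE_WRITE)
//         uvm_page_mask_set(&ats_context->write_fault_mask, page_index);
//     else
//         uvm_page_mask_set(&ats_context->read_fault_mask, page_index);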

struct uvm_fault_service_batch_context_struct
{
    // Array of elements fetched from the GPU fault buffer. The number of
    // elements in this array is exactly max_batch_size
    uvm_fault_buffer_entry_t *fault_cache;

    // Array of pointers to elements in fault cache used for fault
    // preprocessing. The number of elements in this array is exactly
    // max_batch_size
    uvm_fault_buffer_entry_t **ordered_fault_cache;

    // Per uTLB fault information. Used for replay policies and fault
    // cancellation on Pascal
    uvm_fault_utlb_info_t *utlbs;

    // Largest uTLB id seen in a GPU fault
    NvU32 max_utlb_id;

    NvU32 num_cached_faults;

    NvU32 num_coalesced_faults;

    // One of the VA spaces in this batch which had fatal faults. If NULL, no
    // faults were fatal. More than one VA space could have fatal faults, but
    // we pick one to be the target of the cancel sequence.
    uvm_va_space_t *fatal_va_space;

    bool has_throttled_faults;

    NvU32 num_invalid_prefetch_faults;

    NvU32 num_duplicate_faults;

    NvU32 num_replays;

    uvm_ats_fault_context_t ats_context;

    // Unique id (per-GPU) generated for tools events recording
    NvU32 batch_id;

    uvm_tracker_t tracker;

    // Boolean used to avoid sorting the fault batch by instance_ptr if we
    // determine at fetch time that all the faults in the batch report the same
    // instance_ptr
    bool is_single_instance_ptr;

    // Last fetched fault. Used for fault filtering.
    uvm_fault_buffer_entry_t *last_fault;
};

struct uvm_ats_fault_invalidate_struct
{
    bool tlb_batch_pending;
    uvm_tlb_batch_t tlb_batch;
};

typedef struct
{
    // Fault buffer information and structures provided by RM
    UvmGpuFaultInfo rm_info;

    // Maximum number of faults to be processed in batch before fetching new
    // entries from the GPU buffer
    NvU32 max_batch_size;

    struct uvm_replayable_fault_buffer_info_struct
    {
        // Maximum number of fault entries that can be stored in the buffer
        NvU32 max_faults;

        // Cached value of the GPU GET register to minimize the round-trips
        // over PCIe
        NvU32 cached_get;

        // Cached value of the GPU PUT register to minimize the round-trips
        // over PCIe
        NvU32 cached_put;

        // Policy that determines when GPU replays are issued during normal
        // fault servicing
        uvm_perf_fault_replay_policy_t replay_policy;

        // Tracker used to aggregate replay operations, needed for fault cancel
        // and GPU removal
        uvm_tracker_t replay_tracker;

        // If there is a ratio larger than replay_update_put_ratio of duplicate
        // faults in a batch, the PUT pointer is updated before flushing the
        // buffer that comes before the replay method.
        NvU32 replay_update_put_ratio;

        // Fault statistics. These fields are per-GPU and most of them are only
        // updated during fault servicing, so they can be safely incremented
        // without atomics. Migrations may be triggered by different GPUs and
        // need to be incremented using atomics
        struct
        {
            NvU64 num_prefetch_faults;

            NvU64 num_read_faults;

            NvU64 num_write_faults;

            NvU64 num_atomic_faults;

            NvU64 num_duplicate_faults;

            atomic64_t num_pages_out;

            atomic64_t num_pages_in;

            NvU64 num_replays;

            NvU64 num_replays_ack_all;
        } stats;

        // Number of uTLBs in the chip
        NvU32 utlb_count;

        // Context structure used to service a GPU fault batch
        uvm_fault_service_batch_context_t batch_service_context;

        // Structure used to coalesce fault servicing in a VA block
        uvm_service_block_context_t block_service_context;

        // Information required to invalidate stale ATS PTEs from the GPU TLBs
        uvm_ats_fault_invalidate_t ats_invalidate;
    } replayable;

    struct uvm_non_replayable_fault_buffer_info_struct
    {
        // Maximum number of fault entries that can be stored in the buffer
        NvU32 max_faults;

        // Tracker used to aggregate clear faulted operations, needed for GPU
        // removal
        uvm_tracker_t clear_faulted_tracker;

        // Buffer used to store elements popped out from the queue shared with
        // RM for fault servicing.
        void *shadow_buffer_copy;

        // Array of elements fetched from the GPU fault buffer. The number of
        // elements in this array is exactly max_batch_size
        uvm_fault_buffer_entry_t *fault_cache;

        // Fault statistics. See replayable fault stats for more details.
        struct
        {
            NvU64 num_read_faults;

            NvU64 num_write_faults;

            NvU64 num_atomic_faults;

            NvU64 num_physical_faults;

            atomic64_t num_pages_out;

            atomic64_t num_pages_in;
        } stats;

        // Tracker which temporarily holds the work pushed to service faults
        uvm_tracker_t fault_service_tracker;

        // Structure used to coalesce fault servicing in a VA block
        uvm_service_block_context_t block_service_context;

        // Unique id (per-GPU) generated for tools events recording
        NvU32 batch_id;

        // Information required to service ATS faults.
        uvm_ats_fault_context_t ats_context;

        // Information required to invalidate stale ATS PTEs from the GPU TLBs
        uvm_ats_fault_invalidate_t ats_invalidate;
    } non_replayable;

    // Flag that tells if prefetch faults are enabled in HW
    bool prefetch_faults_enabled;

    // Timestamp when prefetch faults were disabled last time
    NvU64 disable_prefetch_faults_timestamp;
} uvm_fault_buffer_info_t;
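
// Illustrative fetch loop over the circular fault buffer (a sketch with a
// hypothetical parse_fault_entry() helper; the real logic lives in
// uvm_gpu_replayable_faults.c):
//
//     NvU32 get = replayable->cached_get;
//
//     while (get != replayable->cached_put && i < max_batch_size) {
//         parse_fault_entry(buffer, get, &batch->fault_cache[i++]);
//         if (++get == replayable->max_faults)
//             get = 0;
//     }
//
//     replayable->cached_get = get;  // written back to the GET register later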

struct uvm_access_counter_service_batch_context_struct
{
    uvm_access_counter_buffer_entry_t *notification_cache;

    NvU32 num_cached_notifications;

    struct
    {
        uvm_access_counter_buffer_entry_t **notifications;

        NvU32 num_notifications;

        // Boolean used to avoid sorting the notification batch by
        // instance_ptr if we determine at fetch time that all the access
        // counter notifications in the batch report the same instance_ptr
        bool is_single_instance_ptr;
    } virt;

    struct
    {
        uvm_access_counter_buffer_entry_t **notifications;
        uvm_reverse_map_t *translations;

        NvU32 num_notifications;

        // Boolean used to avoid sorting the notification batch by aperture if
        // we determine at fetch time that all the access counter
        // notifications in the batch report the same aperture
        bool is_single_aperture;
    } phys;

    // Helper page mask to compute the accessed pages within a VA block
    uvm_page_mask_t accessed_pages;

    // Structure used to coalesce access counter servicing in a VA block
    uvm_service_block_context_t block_service_context;

    // Structure used to service access counter migrations in an ATS block.
    uvm_ats_fault_context_t ats_context;

    // Unique id (per-GPU) generated for tools events recording
    NvU32 batch_id;
};

typedef struct
{
    // Values used to configure access counters in RM
    struct
    {
        UVM_ACCESS_COUNTER_GRANULARITY granularity;
        UVM_ACCESS_COUNTER_USE_LIMIT use_limit;
    } rm;

    // The following values are precomputed by the access counter notification
    // handling code. See comments for UVM_MAX_TRANSLATION_SIZE in
    // uvm_gpu_access_counters.c for more details.
    NvU64 translation_size;

    NvU64 translations_per_counter;

    NvU64 sub_granularity_region_size;

    NvU64 sub_granularity_regions_per_translation;
} uvm_gpu_access_counter_type_config_t;
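
// Sketch of how the precomputed values above relate to each other (an
// assumption based on the field names; the authoritative derivation is in
// uvm_gpu_access_counters.c):
//
//     config->sub_granularity_regions_per_translation =
//         config->translation_size / config->sub_granularity_region_size;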

typedef struct
{
    UvmGpuAccessCntrInfo rm_info;

    NvU32 max_notifications;

    NvU32 max_batch_size;

    // Cached value of the GPU GET register to minimize the round-trips
    // over PCIe
    NvU32 cached_get;

    // Cached value of the GPU PUT register to minimize the round-trips
    // over PCIe
    NvU32 cached_put;

    // Tracker used to aggregate access counters clear operations, needed for
    // GPU removal
    uvm_tracker_t clear_tracker;

    // Current access counter configuration. During normal operation this
    // information is computed once during GPU initialization. However, tests
    // may override it to try different configuration values.
    struct
    {
        uvm_gpu_access_counter_type_config_t mimc;
        uvm_gpu_access_counter_type_config_t momc;

        NvU32 threshold;
    } current_config;

    // Access counter statistics
    struct
    {
        atomic64_t num_pages_out;

        atomic64_t num_pages_in;
    } stats;

    // Ignoring access counters means that notifications are left in the HW
    // buffer without being serviced. Requests to ignore access counters
    // are counted since the suspend path inhibits access counter interrupts,
    // and the resume path needs to know whether to reenable them.
    NvU32 notifications_ignored_count;

    // Context structure used to service a GPU access counter batch
    uvm_access_counter_service_batch_context_t batch_service_context;

    // VA space that reconfigured the access counters configuration, if any.
    // Used in builtin tests only, to avoid reconfigurations from different
    // processes.
    //
    // Locking: both readers and writers must hold the access counters ISR lock
    uvm_va_space_t *reconfiguration_owner;
} uvm_access_counter_buffer_info_t;

typedef struct
{
    // VA where the identity mapping should be mapped in the internal VA
    // space managed by uvm_gpu_t.address_space_tree (see below).
    NvU64 base;

    // Page tables with the mapping.
    uvm_page_table_range_vec_t *range_vec;

    // Used during init to indicate whether the mapping has been fully
    // initialized.
    bool ready;
} uvm_gpu_identity_mapping_t;

// Root chunk mapping
typedef struct
{
    // Page table range representation of the mapping. Because a root chunk
    // fits into a single 2MB page, in practice the range consists of a single
    // 2MB PTE.
    uvm_page_table_range_t *range;

    // Number of mapped pages of size PAGE_SIZE.
    NvU32 num_mapped_pages;
} uvm_gpu_root_chunk_mapping_t;

typedef enum
{
    UVM_GPU_LINK_INVALID = 0,
    UVM_GPU_LINK_PCIE,
    UVM_GPU_LINK_NVLINK_1,
    UVM_GPU_LINK_NVLINK_2,
    UVM_GPU_LINK_NVLINK_3,
    UVM_GPU_LINK_NVLINK_4,
    UVM_GPU_LINK_C2C,
    UVM_GPU_LINK_MAX
} uvm_gpu_link_type_t;

// UVM does not support P2P copies on pre-Pascal GPUs. Pascal+ GPUs only
// support virtual addresses in P2P copies. Therefore, a peer identity mapping
// needs to be created.
// Ampere+ GPUs support physical peer copies, too, so identity mappings are
// not needed.
typedef enum
{
    UVM_GPU_PEER_COPY_MODE_UNSUPPORTED,
    UVM_GPU_PEER_COPY_MODE_VIRTUAL,
    UVM_GPU_PEER_COPY_MODE_PHYSICAL,
    UVM_GPU_PEER_COPY_MODE_COUNT
} uvm_gpu_peer_copy_mode_t;
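
// Illustrative branch on the peer copy mode when building a CE source address
// (a sketch with hypothetical locals; the real code is in the copy engine
// paths):
//
//     if (peer_copy_mode == UVM_GPU_PEER_COPY_MODE_VIRTUAL)
//         src = uvm_gpu_address_virtual(peer_mapping->base + phys.address);
//     else  // UVM_GPU_PEER_COPY_MODE_PHYSICAL
//         src = uvm_gpu_address_from_phys(phys);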

// In order to support SMC/MIG GPU partitions, we split UVM GPUs into two
// parts: parent GPUs (uvm_parent_gpu_t) which represent unique PCIe devices
// (including VFs), and sub/child GPUs (uvm_gpu_t) which represent individual
// partitions within the parent. The parent GPU and partition GPU have
// different "id" and "uuid".
struct uvm_gpu_struct
{
    uvm_parent_gpu_t *parent;

    // The gpu's GI uuid if SMC is enabled; otherwise, a copy of parent->uuid.
    NvProcessorUuid uuid;

    // Nice printable name in the format:
    // ID: 999: GPU-<parent_uuid> UVM-GI-<gi_uuid>.
    // UVM_GPU_UUID_TEXT_BUFFER_LENGTH includes the null character.
    char name[9 + 2 * UVM_GPU_UUID_TEXT_BUFFER_LENGTH];

    // Refcount of the gpu, i.e. how many times it has been retained. This is
    // roughly a count of how many times it has been registered with a VA
    // space, except that some paths retain the GPU temporarily without a VA
    // space.
    //
    // While this is >0, the GPU can't be removed. This differs from gpu_kref,
    // which merely prevents the uvm_gpu_t object from being freed.
    //
    // In most cases this count is protected by the global lock: retaining a
    // GPU from a UUID and any release require the global lock to be taken. But
    // it's also useful for a caller to retain a GPU they've already retained,
    // in which case there's no need to take the global lock. This can happen
    // when an operation needs to drop the VA space lock but continue operating
    // on a GPU. This is an atomic variable to handle those cases.
    //
    // Security note: keep it as a 64-bit counter to prevent overflow cases (a
    // user can create a lot of va spaces and register the gpu with them).
    atomic64_t retained_count;

    // A unique uvm gpu id in range [1, UVM_ID_MAX_PROCESSORS).
    uvm_gpu_id_t id;

    // Should be UVM_GPU_MAGIC_VALUE. Used for memory checking.
    NvU64 magic;

    struct
    {
        // The amount of memory the GPU has in total, in bytes. If the GPU is
        // in ZeroFB testing mode, this will be 0.
        NvU64 size;

        // Max (inclusive) physical address of this GPU's memory that the
        // driver can allocate through PMM (PMA).
        NvU64 max_allocatable_address;

        // Max supported vidmem page size may be smaller than the max GMMU page
        // size, because of the vMMU supported page sizes.
        NvU64 max_vidmem_page_size;

        struct
        {
            // True if the platform supports HW coherence and the GPU's memory
            // is exposed as a NUMA node to the kernel.
            bool enabled;
            unsigned int node_id;
        } numa;
    } mem_info;

    struct
    {
        // Big page size used by the internal UVM VA space.
        // Notably it may be different from the big page size used by a user's
        // VA space in general.
        NvU32 internal_size;
    } big_page;

    // Mapped registers needed to obtain the current GPU timestamp
    struct
    {
        volatile NvU32 *time0_register;
        volatile NvU32 *time1_register;
    } time;
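
    // Illustrative 64-bit timestamp read built from the two 32-bit registers
    // above, using the usual hi/lo/hi retry pattern (a sketch; the actual
    // read is implemented in the host HAL):
    //
    //     NvU32 hi, lo;
    //     do {
    //         hi = *gpu->time.time1_register;
    //         lo = *gpu->time.time0_register;
    //     } while (hi != *gpu->time.time1_register);
    //     timestamp = ((NvU64)hi << 32) | lo;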

    // Identity peer mappings are only defined when
    // peer_copy_mode == UVM_GPU_PEER_COPY_MODE_VIRTUAL
    uvm_gpu_identity_mapping_t peer_mappings[UVM_ID_MAX_GPUS];

    struct
    {
        // Mask of peer_gpus set
        //
        // We can use a regular processor id because P2P is not allowed between
        // partitioned GPUs when SMC is enabled
        uvm_processor_mask_t peer_gpu_mask;

        // Lazily-populated array of peer GPUs, indexed by the peer's GPU index
        uvm_gpu_t *peer_gpus[UVM_ID_MAX_GPUS];

        // Leaf spinlock used to synchronize access to the peer_gpus table so
        // that it can be safely accessed from the access counters bottom half
        uvm_spinlock_t peer_gpus_lock;
    } peer_info;

    // Maximum number of subcontexts supported
    NvU32 max_subcontexts;

    // RM address space handle used in many of the UVM/RM APIs.
    // Represents a GPU VA space within rm_device.
    //
    // In SR-IOV heavy, proxy channels are not associated with this address
    // space.
    uvmGpuAddressSpaceHandle rm_address_space;

    // Page tree used for the internal UVM VA space shared with RM
    uvm_page_tree_t address_space_tree;

    // Set to true during add_gpu() as soon as the RM's address space is moved
    // to the address_space_tree.
    bool rm_address_space_moved_to_page_tree;

    uvm_gpu_semaphore_pool_t *semaphore_pool;

    uvm_gpu_semaphore_pool_t *secure_semaphore_pool;

    uvm_channel_manager_t *channel_manager;

    uvm_pmm_gpu_t pmm;

    // Flat linear mapping covering vidmem. This is a kernel mapping that is
    // only created in certain configurations.
    //
    // There are two mutually exclusive versions of the mapping. The simplest
    // version covers the entire GPU memory, and it is created during GPU
    // initialization. The dynamic version is a partial vidmem mapping that
    // creates and destroys mappings to GPU root chunks on demand.
    union
    {
        // Static mapping covering the whole GPU memory.
        uvm_gpu_identity_mapping_t static_flat_mapping;

        // Dynamic mapping of GPU memory.
        struct
        {
            // Array of root chunk mappings.
            uvm_gpu_root_chunk_mapping_t *array;

            // Number of elements in the array.
            size_t count;

            // Each bit in the bitlock protects a single root chunk mapping.
            uvm_bit_locks_t bitlocks;

        } root_chunk_mappings;
    };

    // Linear sysmem mappings. Mappings are added on demand, and removed upon
    // GPU deinitialization. The mappings are added to UVM's internal address
    // space, i.e., they are kernel mappings.
    //
    // Only used in SR-IOV heavy.
    struct
    {
        // Size of each mapping, in bytes.
        NvU64 mapping_size;

        // Array of sysmem mappings.
        uvm_gpu_identity_mapping_t *array;

        // Number of elements in the array.
        size_t count;

        // Each bit in the bitlock protects a sysmem mapping.
        uvm_bit_locks_t bitlocks;
    } sysmem_mappings;

    // Reverse lookup table used to query the user mapping associated with a
    // sysmem (DMA) physical address.
    //
    // The system memory mapping information referred to by this field is
    // different from that of sysmem_mappings, because it relates to user
    // mappings (instead of kernel), and it is used in most configurations.
    uvm_pmm_sysmem_mappings_t pmm_reverse_sysmem_mappings;

    struct
    {
        uvm_conf_computing_dma_buffer_pool_t dma_buffer_pool;

        // Dummy memory used to store the IV contents during CE encryption.
        // This memory location is only available after CE channels exist,
        // because we use them to write PTEs for allocations such as this one.
        // This location is used when physical addressing for the IV buffer is
        // required. See uvm_hal_hopper_ce_encrypt().
        uvm_mem_t *iv_mem;

        // Dummy memory used to store the IV contents during CE encryption.
        // Because of the limitations of `iv_mem', and the need to have such a
        // buffer at channel initialization, we use an RM allocation.
        // This location is used when virtual addressing for the IV buffer is
        // required. See uvm_hal_hopper_ce_encrypt().
        uvm_rm_mem_t *iv_rm_mem;
    } conf_computing;

    // ECC handling
    // In order to trap ECC errors as soon as possible, the driver has the HW
    // interrupt register mapped directly. If an ECC interrupt is ever noticed
    // to be pending, then the UVM driver needs to:
    //
    //   1) ask RM to service interrupts, and then
    //   2) inspect the ECC error notifier state.
    //
    // Notably, checking for channel errors is not enough, because ECC errors
    // can be pending, even after a channel has become idle.
    //
    // See more details in uvm_gpu_check_ecc_error().
    struct
    {
        // Does the GPU have ECC enabled?
        bool enabled;

        // Direct mapping of the 32-bit part of the hw interrupt tree that has
        // the ECC bits.
        volatile NvU32 *hw_interrupt_tree_location;

        // Mask to get the ECC interrupt bits from the 32-bits above.
        NvU32 mask;

        // Set to true by RM when a fatal ECC error is encountered (requires
        // asking RM to service pending interrupts to be current).
        NvBool *error_notifier;
    } ecc;
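
    // Illustrative fast-path check behind uvm_gpu_check_ecc_error_no_rm()
    // (a sketch; see the .c files for the real logic):
    //
    //     if (gpu->ecc.enabled &&
    //         (*gpu->ecc.hw_interrupt_tree_location & gpu->ecc.mask) != 0)
    //         return NV_WARN_MORE_PROCESSING_REQUIRED;  // confirm via RM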

    struct
    {
        NvU32 swizz_id;

        // RM device handle used in many of the UVM/RM APIs.
        //
        // Do not read this field directly, use uvm_gpu_device_handle instead.
        uvmGpuDeviceHandle rm_device;
    } smc;

    struct
    {
        struct proc_dir_entry *dir;

        struct proc_dir_entry *dir_symlink;

        // The GPU instance UUID symlink if SMC is enabled.
        struct proc_dir_entry *gpu_instance_uuid_symlink;

        struct proc_dir_entry *info_file;

        struct proc_dir_entry *dir_peers;
    } procfs;

    // Placeholder for per-GPU performance heuristics information
    uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];

    // Force pushbuffer's GPU VA to be >= 1TB; used only for testing purposes.
    bool uvm_test_force_upper_pushbuffer_segment;
};

// In order to support SMC/MIG GPU partitions, we split UVM GPUs into two
// parts: parent GPUs (uvm_parent_gpu_t) which represent unique PCIe devices
// (including VFs), and sub/child GPUs (uvm_gpu_t) which represent individual
// partitions within the parent. The parent GPU and partition GPU have
// different "id" and "uuid".
struct uvm_parent_gpu_struct
{
    // Reference count for how many places are holding on to a parent GPU
    // (internal to the UVM driver). This includes any GPUs we know about, not
    // just GPUs that are registered with a VA space. Most GPUs end up being
    // registered, but there are brief periods when they are not registered,
    // such as during interrupt handling, and in add_gpu() or remove_gpu().
    nv_kref_t gpu_kref;

    // The number of uvm_gpu_ts referencing this uvm_parent_gpu_t.
    NvU32 num_retained_gpus;

    uvm_gpu_t *gpus[UVM_PARENT_ID_MAX_SUB_PROCESSORS];

    // Bitmap of valid child entries in the gpus[] table. Used to retrieve a
    // usable child GPU in bottom-halves.
    DECLARE_BITMAP(valid_gpus, UVM_PARENT_ID_MAX_SUB_PROCESSORS);

    // The gpu's uuid
    NvProcessorUuid uuid;

    // Nice printable name including the uvm gpu id, ascii name from RM and
    // uuid
    char name[UVM_GPU_NICE_NAME_BUFFER_LENGTH];

    // GPU information provided by RM (architecture, implementation, hardware
    // classes, etc.).
    UvmGpuInfo rm_info;

    // A unique uvm gpu id in range [1, UVM_PARENT_ID_MAX_PROCESSORS)
    uvm_parent_gpu_id_t id;

    // Reference to the Linux PCI device
    //
    // The reference to the PCI device remains valid as long as the GPU is
    // registered with RM's Linux layer (between nvUvmInterfaceRegisterGpu()
    // and nvUvmInterfaceUnregisterGpu()).
    struct pci_dev *pci_dev;

    // NVLINK Processing Unit (NPU) on PowerPC platforms. The NPU is a
    // collection of CPU-side PCI devices which bridge GPU NVLINKs and the CPU
    // memory bus.
    //
    // There is one PCI device per NVLINK. A set of NVLINKs connects to a
    // single GPU, and all NVLINKs for a given socket are collected logically
    // under this UVM NPU because some resources (such as register mappings)
    // are shared by all those NVLINKs. This means multiple GPUs may connect
    // to the same UVM NPU.
    uvm_ibm_npu_t *npu;

    // On kernels with NUMA support, this entry contains the closest CPU NUMA
    // node to this GPU. Otherwise, the value will be -1.
    int closest_cpu_numa_node;

    // RM device handle used in many of the UVM/RM APIs.
    //
    // Do not read this field directly, use uvm_gpu_device_handle instead.
    uvmGpuDeviceHandle rm_device;

    // The physical address range addressable by the GPU
    //
    // The GPU has its NV_PFB_XV_UPPER_ADDR register set by RM to
    // dma_addressable_start (in bifSetupDmaWindow_IMPL()) and hence when
    // referencing sysmem from the GPU, dma_addressable_start should be
    // subtracted from the physical address. The DMA mapping helpers like
    // uvm_parent_gpu_map_cpu_pages() and uvm_parent_gpu_dma_alloc_page() take
    // care of that.
    NvU64 dma_addressable_start;
    NvU64 dma_addressable_limit;

    // Total size (in bytes) of physically mapped (with
    // uvm_parent_gpu_map_cpu_pages) sysmem pages, used for leak detection.
    atomic64_t mapped_cpu_pages_size;
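
    // Illustrative adjustment implied by the comment above (a sketch; the DMA
    // mapping helpers perform this internally):
    //
    //     NvU64 gpu_dma_addr = sys_phys_addr - parent_gpu->dma_addressable_start;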

    // Hardware Abstraction Layer
    uvm_host_hal_t *host_hal;
    uvm_ce_hal_t *ce_hal;
    uvm_arch_hal_t *arch_hal;
    uvm_fault_buffer_hal_t *fault_buffer_hal;
    uvm_access_counter_buffer_hal_t *access_counter_buffer_hal;
    uvm_sec2_hal_t *sec2_hal;

    // Whether CE supports physical addressing mode for writes to vidmem
    bool ce_phys_vidmem_write_supported;

    uvm_gpu_peer_copy_mode_t peer_copy_mode;

    // Virtualization mode of the GPU.
    UVM_VIRT_MODE virt_mode;

    // Pascal+ GPUs can trigger faults on prefetch instructions. If false, this
    // feature must be disabled at all times in GPUs of the given architecture.
    // If true, the feature can be toggled at will by SW.
    //
    // The field should not be used unless the GPU supports replayable faults.
    bool prefetch_fault_supported;

    // Number of membars required to flush out HSHUB following a TLB invalidate
    NvU32 num_hshub_tlb_invalidate_membars;

    // Whether the channels can configure GPFIFO in vidmem
    bool gpfifo_in_vidmem_supported;

    bool replayable_faults_supported;

    bool non_replayable_faults_supported;

    bool access_counters_supported;

    // If this is true, physical address based access counter notifications are
    // potentially generated. If false, only virtual address based
    // notifications are generated (assuming access_counters_supported is true
    // too).
    bool access_counters_can_use_physical_addresses;

    bool fault_cancel_va_supported;

    // True if the GPU has hardware support for scoped atomics
    bool scoped_atomics_supported;

    // If true, a HW method can be used to clear a faulted channel.
    // If false, then the GPU supports clearing faulted channels using
    // registers instead of a HW method.
    // This value is only defined for GPUs that support non-replayable faults.
    bool has_clear_faulted_channel_method;

    // If true, a SW method can be used to clear a faulted channel.
    // If false, the HW method or the registers (whichever is available
    // according to has_clear_faulted_channel_method) needs to be used.
    //
    // This value is only defined for GPUs that support non-replayable faults.
    bool has_clear_faulted_channel_sw_method;

    bool sparse_mappings_supported;

    // Ampere(GA100) requires map->invalidate->remap->invalidate for page size
    // promotion
    bool map_remap_larger_page_promotion;

    bool plc_supported;

    // If true, page_tree initialization pre-populates no_ats_ranges. It only
    // affects ATS systems.
    bool no_ats_range_required;

    // Parameters used by the TLB batching API
    struct
    {
        // Is the targeted (single page) VA invalidate supported at all?
        NvBool va_invalidate_supported;

        // Is the VA range invalidate supported?
        NvBool va_range_invalidate_supported;

        union
        {
            // Maximum (inclusive) number of single page invalidations before
            // falling back to invalidate all
            NvU32 max_pages;

            // Maximum (inclusive) number of range invalidations before falling
            // back to invalidate all
            NvU32 max_ranges;
        };
    } tlb_batch;
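
    // Illustrative fallback decision made by the TLB batching code (a sketch
    // with a hypothetical invalidate_all() helper; see uvm_tlb_batch.c for
    // the real policy):
    //
    //     if (batch_page_count > parent_gpu->tlb_batch.max_pages)
    //         invalidate_all();  // one invalidate-all beats many single-page
    //                            // invalidates past this threshold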

    // Largest VA (exclusive) which can be used for channel buffer mappings
    NvU64 max_channel_va;

    // Largest VA (exclusive) on which Host can operate.
    NvU64 max_host_va;

    // Indicates whether the GPU can map sysmem with pages larger than 4k
    bool can_map_sysmem_with_large_pages;

    // VA base and size of the RM managed part of the internal UVM VA space.
    //
    // The internal UVM VA is shared with RM by RM controlling some of the top
    // level PDEs and leaving the rest for UVM to control.
    // On Pascal a single top level PDE covers 128 TB of VA and given that
    // semaphores and other allocations limited to 40bit are currently
    // allocated through RM, RM needs to control the [0, 128TB) VA range at
    // least for now.
    // On Maxwell, limit RM's VA to [0, 128GB), which should easily fit all RM
    // allocations and leave enough space for UVM.
    NvU64 rm_va_base;
    NvU64 rm_va_size;

    // Base and size of the GPU VA used for uvm_mem_t allocations mapped in
    // the internal address_space_tree.
    NvU64 uvm_mem_va_base;
    NvU64 uvm_mem_va_size;

    // Base of the GPU VAs used for the vidmem and sysmem flat mappings.
    NvU64 flat_vidmem_va_base;
    NvU64 flat_sysmem_va_base;

    // Bitmap of allocation sizes for user memory supported by a GPU. PAGE_SIZE
    // is guaranteed to be both present and the smallest size.
    uvm_chunk_sizes_mask_t mmu_user_chunk_sizes;

    // Bitmap of allocation sizes that could be requested by the page tree for
    // a GPU
    uvm_chunk_sizes_mask_t mmu_kernel_chunk_sizes;

    struct
    {
        struct proc_dir_entry *dir;

        struct proc_dir_entry *fault_stats_file;

        struct proc_dir_entry *access_counters_file;
    } procfs;

    // Interrupt handling state and locks
    uvm_isr_info_t isr;

    // Fault buffer info. This is only valid if supports_replayable_faults is
    // set to true.
    uvm_fault_buffer_info_t fault_buffer_info;

    // PMM lazy free processing queue.
    // TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
    nv_kthread_q_t lazy_free_q;

    // Access counter buffer info. This is only valid if
    // supports_access_counters is set to true.
    uvm_access_counter_buffer_info_t access_counter_buffer_info;

    // Number of uTLBs per GPC. This information is only valid on Pascal+ GPUs.
    NvU32 utlb_per_gpc_count;

    // In order to service GPU faults, UVM must be able to obtain the VA
    // space for each reported fault. The fault packet contains the
    // instance_ptr of the channel that was bound when the SMs triggered
    // the fault. On fault, any instance pointer in the TSG may be
    // reported. This is a problem on Volta, which allows different channels
    // in the TSG to be bound to different VA spaces in order to support
    // subcontexts. In order to be able to obtain the correct VA space, HW
    // provides the subcontext id (or VEID) in addition to the instance_ptr.
    //
    // Summary:
    //
    // 1) Channels in a TSG may be in different VA spaces, identified by their
    //    subcontext ID.
    // 2) Different subcontext IDs may map to the same or different VA spaces.
    // 3) On fault, any instance pointer in the TSG may be reported. The
    //    reported subcontext ID identifies which VA space within the TSG
    //    actually encountered the fault.
    //
    // Thus, UVM needs to keep track of all the instance pointers that belong
    // to the same TSG. We use two tables:
    //
    // - instance_ptr_table (instance_ptr -> subctx_info): this table maps
    //   instance pointers to the subcontext info descriptor for the channel.
    //   If the channel belongs to a subcontext, this descriptor will contain
    //   all the VA spaces for the subcontexts in the same TSG. If the channel
    //   does not belong to a subcontext, it will only contain a pointer to
    //   its VA space.
    // - tsg_table (tsg_id -> subctx_info): this table also stores the subctx
    //   information, but in this case it is indexed by TSG ID. Thus, when a
    //   new channel bound to a subcontext is registered, it will check first
    //   in this table if the subcontext information descriptor for its TSG
    //   already exists, otherwise it will create it. Channels not bound to
    //   subcontexts will not use this table.
    //
    // The bottom half reads the tables under
    // isr.replayable_faults_handler.lock, but a separate lock is necessary
    // because entries are added and removed from the table under the va_space
    // lock, and we can't take isr.replayable_faults_handler.lock while holding
    // the va_space lock.
    uvm_rb_tree_t tsg_table;

    uvm_rb_tree_t instance_ptr_table;
    uvm_spinlock_t instance_ptr_table_lock;
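
    // Illustrative bottom-half lookup against these tables (a sketch with a
    // hypothetical key; uvm_rb_tree_find() is the generic tree lookup):
    //
    //     uvm_spin_lock(&parent_gpu->instance_ptr_table_lock);
    //     node = uvm_rb_tree_find(&parent_gpu->instance_ptr_table, key);
    //     uvm_spin_unlock(&parent_gpu->instance_ptr_table_lock);
    //     // node, if found, leads to the subctx_info and thus the VA space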

    // This is set to true if the GPU belongs to an SLI group.
    bool sli_enabled;

    struct
    {
        bool supported;

        bool enabled;
    } smc;

    // Global statistics. These fields are per-GPU and most of them are only
    // updated during fault servicing, so they can be safely incremented
    // without atomics.
    struct
    {
        NvU64 num_replayable_faults;

        NvU64 num_non_replayable_faults;

        atomic64_t num_pages_out;

        atomic64_t num_pages_in;
    } stats;

    // Structure to hold nvswitch specific information. In an nvswitch
    // environment, rather than using the peer-id field of the PTE (which can
    // only address 8 gpus), all gpus are assigned a 47-bit physical address
    // space by the fabric manager. Any physical address access to these
    // physical address spaces is routed through the switch to the
    // corresponding peer.
    struct
    {
        bool is_nvswitch_connected;

        // 47-bit fabric memory physical offset that peer gpus need to access
        // to read a peer's memory
        NvU64 fabric_memory_window_start;
    } nvswitch_info;

    struct
    {
        // Note that this represents the link to system memory, not the link
        // the system used to discover the GPU. There are some cases such as
        // NVLINK2 where the GPU is still on the PCIe bus, but it accesses
        // memory over this link rather than PCIe.
        uvm_gpu_link_type_t link;
        NvU32 link_rate_mbyte_per_s;

        // Range in the system physical address space where the memory of this
        // GPU is exposed as coherent. memory_window_end is inclusive.
        // memory_window_start == memory_window_end indicates that no window is
        // present (coherence is not supported).
        NvU64 memory_window_start;
        NvU64 memory_window_end;
    } system_bus;

    // WAR to issue ATS TLB invalidation commands ourselves.
    struct
    {
        uvm_mutex_t smmu_lock;
        struct page *smmu_cmdq;
        void __iomem *smmu_cmdqv_base;
        unsigned long smmu_prod;
        unsigned long smmu_cons;
    } smmu_war;
};

static const char *uvm_parent_gpu_name(uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->name;
}

static const char *uvm_gpu_name(uvm_gpu_t *gpu)
{
    return gpu->name;
}

static uvmGpuDeviceHandle uvm_gpu_device_handle(uvm_gpu_t *gpu)
{
    if (gpu->parent->smc.enabled)
        return gpu->smc.rm_device;
    return gpu->parent->rm_device;
}

struct uvm_gpu_peer_struct
{
    // The fields in this global structure can only be inspected under one of
    // the following conditions:
    //
    // - The VA space lock is held for either read or write, both GPUs are
    //   registered in the VA space, and the corresponding bit in the
    //   va_space.enabled_peers bitmap is set.
    //
    // - The global lock is held.
    //
    // - While the global lock was held in the past, the two GPUs were detected
    //   to be SMC peers and were both retained.
    //
    // - While the global lock was held in the past, the two GPUs were detected
    //   to be NVLINK peers and were both retained.
    //
    // - While the global lock was held in the past, the two GPUs were detected
    //   to be PCIe peers and uvm_gpu_retain_pcie_peer_access() was called.
    //
    // - The peer_gpus_lock is held on one of the GPUs. In this case, the other
    //   GPU must be read from the original GPU's peer_gpus table. The fields
    //   will not change while the lock is held, but they may no longer be
    //   valid because the other GPU might be in teardown.

    // Peer Id associated with this device w.r.t. a peer GPU.
    // Note: peerId (A -> B) != peerId (B -> A)
    // peer_id[0] from min(gpu_id_1, gpu_id_2) -> max(gpu_id_1, gpu_id_2)
    // peer_id[1] from max(gpu_id_1, gpu_id_2) -> min(gpu_id_1, gpu_id_2)
    NvU8 peer_ids[2];
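
    // Illustrative selection of the local peer id from the table above (a
    // sketch matching the min/max convention in the comment):
    //
    //     bool local_is_max = uvm_id_value(local_gpu->id) >
    //                         uvm_id_value(remote_gpu->id);
    //     NvU8 peer_id = peer_caps->peer_ids[local_is_max ? 1 : 0];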

    // Indirect peers are GPUs which can coherently access each others' memory
    // over NVLINK, but are routed through the CPU using the SYS aperture
    // rather than a PEER aperture
    NvU8 is_indirect_peer : 1;

    // The link type between the peer GPUs, currently either PCIe or NVLINK.
    // This field is used to determine when this peer struct has been
    // initialized (link_type != UVM_GPU_LINK_INVALID). NVLink peers are
    // initialized at GPU registration time. PCIe peers are initialized when
    // the refcount below goes from 0 to 1.
    uvm_gpu_link_type_t link_type;

    // Maximum unidirectional bandwidth between the peers in megabytes per
    // second, not taking into account the protocols' overhead. The reported
    // bandwidth for indirect peers is zero. See UvmGpuP2PCapsParams.
    NvU32 total_link_line_rate_mbyte_per_s;

    // For PCIe, the number of times that this has been retained by a VA
    // space. For NVLINK this will always be 1.
    NvU64 ref_count;

    // This handle gets populated when enable_peer_access successfully creates
    // an NV50_P2P object. disable_peer_access resets the same on the object
    // deletion.
    NvHandle p2p_handle;

    struct
    {
        struct proc_dir_entry *peer_file[2];
        struct proc_dir_entry *peer_symlink_file[2];

        // GPU-A <-> GPU-B link is bidirectional, pairs[x][0] is always the
        // local GPU, while pairs[x][1] is the remote GPU. The table shall be
        // filled like so: [[GPU-A, GPU-B], [GPU-B, GPU-A]].
        uvm_gpu_t *pairs[2][2];
    } procfs;
};

// Initialize global gpu state
NV_STATUS uvm_gpu_init(void);

// Deinitialize global state (called from module exit)
void uvm_gpu_exit(void);

NV_STATUS uvm_gpu_init_va_space(uvm_va_space_t *va_space);

void uvm_gpu_exit_va_space(uvm_va_space_t *va_space);

static unsigned int uvm_gpu_numa_node(uvm_gpu_t *gpu)
{
    UVM_ASSERT(gpu->mem_info.numa.enabled);
    return gpu->mem_info.numa.node_id;
}

static uvm_gpu_phys_address_t uvm_gpu_page_to_phys_address(uvm_gpu_t *gpu, struct page *page)
{
    unsigned long sys_addr = page_to_pfn(page) << PAGE_SHIFT;
    unsigned long gpu_offset = sys_addr - gpu->parent->system_bus.memory_window_start;

    UVM_ASSERT(page_to_nid(page) == uvm_gpu_numa_node(gpu));
    UVM_ASSERT(sys_addr >= gpu->parent->system_bus.memory_window_start);
    UVM_ASSERT(sys_addr + PAGE_SIZE - 1 <= gpu->parent->system_bus.memory_window_end);

    return uvm_gpu_phys_address(UVM_APERTURE_VID, gpu_offset);
}

// Note that there is a uvm_gpu_get() function defined in uvm_global.h to break
// a circular dep between global and gpu modules.

// Get a uvm_gpu_t by UUID (physical GPU UUID if SMC is not enabled, otherwise
// GPU instance UUID).
// This returns NULL if the GPU is not present.
// This is the general purpose call that should be used normally.
//
// LOCKING: requires the global lock to be held
uvm_gpu_t *uvm_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid);

// Get a uvm_parent_gpu_t by UUID (physical GPU UUID).
// Like uvm_gpu_get_by_uuid(), this function returns NULL if the GPU has not
// been registered.
//
// LOCKING: requires the global lock to be held
uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid);

// Like uvm_parent_gpu_get_by_uuid(), but this variant does not assertion-check
// that the caller is holding the global_lock. This is a narrower-purpose
// function, and is only intended for use by the top-half ISR, or other very
// limited cases.
uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid_locked(const NvProcessorUuid *gpu_uuid);

// Retain a gpu by uuid
// Returns the retained uvm_gpu_t in gpu_out on success
//
// LOCKING: Takes and releases the global lock for the caller.
NV_STATUS uvm_gpu_retain_by_uuid(const NvProcessorUuid *gpu_uuid,
                                 const uvm_rm_user_object_t *user_rm_device,
                                 uvm_gpu_t **gpu_out);

// Retain a gpu which is known to already be retained. Does NOT require the
// global lock to be held.
void uvm_gpu_retain(uvm_gpu_t *gpu);

// Release a gpu
// LOCKING: requires the global lock to be held
void uvm_gpu_release_locked(uvm_gpu_t *gpu);

// Like uvm_gpu_release_locked, but takes and releases the global lock for the
// caller.
void uvm_gpu_release(uvm_gpu_t *gpu);

static NvU64 uvm_gpu_retained_count(uvm_gpu_t *gpu)
{
    return atomic64_read(&gpu->retained_count);
}

// Decrease the refcount on the parent GPU object, and actually delete the
// object if the refcount hits zero.
void uvm_parent_gpu_kref_put(uvm_parent_gpu_t *gpu);

// Calculates peer table index using GPU ids.
NvU32 uvm_gpu_peer_table_index(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
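
// One common way to flatten an unordered GPU pair into a triangular table
// index (illustrative only; the authoritative formula lives in uvm_gpu.c):
//
//     NvU32 small = min(uvm_id_gpu_index(gpu_id0), uvm_id_gpu_index(gpu_id1));
//     NvU32 big   = max(uvm_id_gpu_index(gpu_id0), uvm_id_gpu_index(gpu_id1));
//     return big * (big - 1) / 2 + small;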

// Either retains an existing PCIe peer entry or creates a new one. In both
// cases the two GPUs are also each retained.
// LOCKING: requires the global lock to be held
NV_STATUS uvm_gpu_retain_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);

// Releases a PCIe peer entry and the two GPUs.
// LOCKING: requires the global lock to be held
void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);

// Get the aperture for local_gpu to use to map memory resident on remote_gpu.
// They must not be the same gpu.
uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu);

// Get the processor id accessible by the given GPU for the given physical
// address.
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);

// Get the P2P capabilities between the gpus with the given indexes
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);

// Get the P2P capabilities between the given gpus
static uvm_gpu_peer_t *uvm_gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
    return uvm_gpu_index_peer_caps(gpu0->id, gpu1->id);
}

static bool uvm_gpus_are_nvswitch_connected(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
    if (gpu0->parent->nvswitch_info.is_nvswitch_connected && gpu1->parent->nvswitch_info.is_nvswitch_connected) {
        UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type >= UVM_GPU_LINK_NVLINK_2);
        return true;
    }

    return false;
}

static bool uvm_gpus_are_indirect_peers(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);

    if (peer_caps->link_type != UVM_GPU_LINK_INVALID && peer_caps->is_indirect_peer) {
        UVM_ASSERT(gpu0->mem_info.numa.enabled);
        UVM_ASSERT(gpu1->mem_info.numa.enabled);
        UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_PCIE);
        UVM_ASSERT(!uvm_gpus_are_nvswitch_connected(gpu0, gpu1));
        return true;
    }

    return false;
}

// Retrieve the virtual address corresponding to the given vidmem physical
// address, according to the linear vidmem mapping in the GPU kernel address
// space.
//
// The actual GPU mapping only exists if a full flat mapping, or a partial flat
// mapping covering the passed address, has been previously created.
static uvm_gpu_address_t uvm_gpu_address_virtual_from_vidmem_phys(uvm_gpu_t *gpu, NvU64 pa)
{
    UVM_ASSERT(uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent) ||
               uvm_mmu_parent_gpu_needs_dynamic_vidmem_mapping(gpu->parent));
    UVM_ASSERT(pa <= gpu->mem_info.max_allocatable_address);

    if (uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent))
        UVM_ASSERT(gpu->static_flat_mapping.ready);

    return uvm_gpu_address_virtual(gpu->parent->flat_vidmem_va_base + pa);
}

// Retrieve the virtual address corresponding to the given sysmem physical
// address, according to the linear sysmem mapping in the GPU kernel address
// space.
//
// The actual GPU mapping only exists if a linear mapping covering the passed
// address has been previously created.
static uvm_gpu_address_t uvm_parent_gpu_address_virtual_from_sysmem_phys(uvm_parent_gpu_t *parent_gpu, NvU64 pa)
{
    UVM_ASSERT(uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(parent_gpu));
    UVM_ASSERT(pa <= (parent_gpu->dma_addressable_limit - parent_gpu->dma_addressable_start));

    return uvm_gpu_address_virtual(parent_gpu->flat_sysmem_va_base + pa);
}

// Given a GPU or CPU physical address (not peer), retrieve an address suitable
// for CE access.
static uvm_gpu_address_t uvm_gpu_address_copy(uvm_gpu_t *gpu, uvm_gpu_phys_address_t phys_addr)
{
    UVM_ASSERT(phys_addr.aperture == UVM_APERTURE_VID || phys_addr.aperture == UVM_APERTURE_SYS);

    if (phys_addr.aperture == UVM_APERTURE_VID) {
        if (uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent) ||
            uvm_mmu_parent_gpu_needs_dynamic_vidmem_mapping(gpu->parent))
            return uvm_gpu_address_virtual_from_vidmem_phys(gpu, phys_addr.address);
    }
    else if (uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(gpu->parent)) {
        return uvm_parent_gpu_address_virtual_from_sysmem_phys(gpu->parent, phys_addr.address);
    }

    return uvm_gpu_address_from_phys(phys_addr);
}

static uvm_gpu_identity_mapping_t *uvm_gpu_get_peer_mapping(uvm_gpu_t *gpu, uvm_gpu_id_t peer_id)
{
    return &gpu->peer_mappings[uvm_id_gpu_index(peer_id)];
}

// Check for ECC errors
//
// Notably this check cannot be performed where it's not safe to call into RM.
NV_STATUS uvm_gpu_check_ecc_error(uvm_gpu_t *gpu);

// Check for ECC errors without calling into RM
//
// Calling into RM is problematic in many places; this check is always safe to
// do. Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an ECC error
// and it's required to call uvm_gpu_check_ecc_error() to be sure.
NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu);

// Map size bytes of contiguous sysmem on the GPU for physical access
//
// size has to be aligned to PAGE_SIZE.
//
// Returns, in dma_address_out, the physical address that the GPU can use to
// access the pages.
NV_STATUS uvm_parent_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu, struct page *page, size_t size, NvU64 *dma_address_out);

// Unmap size bytes of sysmem previously mapped with
// uvm_parent_gpu_map_cpu_pages().
void uvm_parent_gpu_unmap_cpu_pages(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address, size_t size);

static NV_STATUS uvm_parent_gpu_map_cpu_page(uvm_parent_gpu_t *parent_gpu, struct page *page, NvU64 *dma_address_out)
{
    return uvm_parent_gpu_map_cpu_pages(parent_gpu, page, PAGE_SIZE, dma_address_out);
}

static void uvm_parent_gpu_unmap_cpu_page(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address)
{
    uvm_parent_gpu_unmap_cpu_pages(parent_gpu, dma_address, PAGE_SIZE);
}
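
// Typical map/unmap pairing (an illustrative sketch, not prescribed usage):
//
//     NvU64 dma_addr;
//     NV_STATUS status = uvm_parent_gpu_map_cpu_page(parent_gpu, page, &dma_addr);
//     if (status == NV_OK) {
//         // Access from the GPU via
//         // uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr), then:
//         uvm_parent_gpu_unmap_cpu_page(parent_gpu, dma_addr);
//     }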

// Allocate and map a page of system DMA memory on the GPU for physical access
//
// Returns:
// - in the dma_address_out parameter, the address of the page that can be
//   used to access it on the GPU.
// - as the return value, the CPU virtual address of the allocated memory.
void *uvm_parent_gpu_dma_alloc_page(uvm_parent_gpu_t *parent_gpu,
                                    gfp_t gfp_flags,
                                    NvU64 *dma_address_out);

// Unmap and free a page of sysmem DMA previously allocated with
// uvm_parent_gpu_dma_alloc_page().
void uvm_parent_gpu_dma_free_page(uvm_parent_gpu_t *parent_gpu, void *va, NvU64 dma_address);

// Returns whether the given range is within the GPU's addressable VA ranges.
// It requires the input 'addr' to be in canonical form on platforms that use
// canonical form addresses, i.e., ARM64 and x86.
// Warning: This only checks whether the GPU's MMU can support the given
// address. Some HW units on that GPU might only support a smaller range.
//
// The GPU must be initialized before calling this function.
bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);

// Returns whether the given range is within the GPU's addressable VA ranges in
// the internal GPU VA "kernel" address space, which is a linear address space.
// Therefore, the input 'addr' must not be in canonical form, even on platforms
// that use canonical form addresses, i.e., ARM64 and x86.
// Warning: This only checks whether the GPU's MMU can support the given
// address. Some HW units on that GPU might only support a smaller range.
//
// The GPU must be initialized before calling this function.
bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);

bool uvm_platform_uses_canonical_form_address(void);

// Returns addr's canonical form for host systems that use canonical form
// addresses.
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);
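
// Canonicalization is the usual sign extension of the VA's top implemented
// bit (an illustrative sketch; num_va_bits is an assumed stand-in for the
// GPU's VA width):
//
//     NvU32 shift = 64 - num_va_bits;
//     addr = (NvU64)(((NvS64)addr << shift) >> shift);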

static bool uvm_parent_gpu_is_coherent(const uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->system_bus.memory_window_end > parent_gpu->system_bus.memory_window_start;
}

static bool uvm_parent_gpu_needs_pushbuffer_segments(uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->max_host_va > (1ull << 40);
}

static bool uvm_parent_gpu_supports_eviction(uvm_parent_gpu_t *parent_gpu)
{
    // Eviction is supported only if the GPU supports replayable faults
    return parent_gpu->replayable_faults_supported;
}

static bool uvm_parent_gpu_is_virt_mode_sriov_heavy(const uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->virt_mode == UVM_VIRT_MODE_SRIOV_HEAVY;
}

static bool uvm_parent_gpu_is_virt_mode_sriov_standard(const uvm_parent_gpu_t *parent_gpu)
{
    return parent_gpu->virt_mode == UVM_VIRT_MODE_SRIOV_STANDARD;
}

// Returns true if the virtualization mode is SR-IOV heavy or SR-IOV standard.
static bool uvm_parent_gpu_is_virt_mode_sriov(const uvm_parent_gpu_t *parent_gpu)
{
    return uvm_parent_gpu_is_virt_mode_sriov_heavy(parent_gpu) ||
           uvm_parent_gpu_is_virt_mode_sriov_standard(parent_gpu);
}

static bool uvm_parent_gpu_needs_proxy_channel_pool(const uvm_parent_gpu_t *parent_gpu)
{
    return uvm_parent_gpu_is_virt_mode_sriov_heavy(parent_gpu);
}

uvm_aperture_t uvm_get_page_tree_location(const uvm_parent_gpu_t *parent_gpu);

// Debug print of GPU properties
void uvm_gpu_print(uvm_gpu_t *gpu);

// Add the given instance pointer -> user_channel mapping to this GPU. The
// bottom half GPU page fault handler uses this to look up the VA space for GPU
// faults.
NV_STATUS uvm_parent_gpu_add_user_channel(uvm_parent_gpu_t *parent_gpu, uvm_user_channel_t *user_channel);
void uvm_parent_gpu_remove_user_channel(uvm_parent_gpu_t *parent_gpu, uvm_user_channel_t *user_channel);

// Looks up an entry added by uvm_parent_gpu_add_user_channel(). Return codes:
//  NV_OK                        Translation successful
//  NV_ERR_INVALID_CHANNEL       Entry's instance pointer was not found
//  NV_ERR_PAGE_TABLE_NOT_AVAIL  Entry's instance pointer is valid but the
//                               entry targets an invalid subcontext
//
// out_va_space is valid if NV_OK is returned, otherwise it's NULL. The caller
// is responsible for ensuring that the returned va_space can't be destroyed,
// so these functions should only be called from the bottom half.
NV_STATUS uvm_parent_gpu_fault_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
                                                 uvm_fault_buffer_entry_t *fault,
                                                 uvm_va_space_t **out_va_space);

NV_STATUS uvm_parent_gpu_access_counter_entry_to_va_space(uvm_parent_gpu_t *parent_gpu,
                                                          uvm_access_counter_buffer_entry_t *entry,
                                                          uvm_va_space_t **out_va_space);

typedef enum
{
    UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT,
    UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
    UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
} uvm_gpu_buffer_flush_mode_t;

#endif // __UVM_GPU_H__