1 /*******************************************************************************
2     Copyright (c) 2015-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "nv_uvm_interface.h"
25 #include "uvm_api.h"
26 #include "uvm_channel.h"
27 #include "uvm_global.h"
28 #include "uvm_gpu.h"
29 #include "uvm_gpu_semaphore.h"
30 #include "uvm_hal.h"
31 #include "uvm_procfs.h"
32 #include "uvm_pmm_gpu.h"
33 #include "uvm_pmm_sysmem.h"
34 #include "uvm_va_space.h"
35 #include "uvm_user_channel.h"
36 #include "uvm_perf_events.h"
37 #include "uvm_perf_heuristics.h"
38 #include "uvm_common.h"
39 #include "ctrl2080mc.h"
40 #include "nv-kthread-q.h"
41 #include "uvm_gpu_access_counters.h"
42 #include "uvm_ats.h"
43 #include "uvm_test.h"
44 #include "uvm_conf_computing.h"
45 
46 #include "uvm_linux.h"
47 
48 #define UVM_PROC_GPUS_PEER_DIR_NAME "peers"
49 
// The uvm_peer_copy module parameter selects the addressing mode used for
// peer-to-peer (P2P) copies: "phys" (physical) or "virt" (virtual).
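// For example, the mode can be chosen at load time, e.g. via
// "modprobe nvidia-uvm uvm_peer_copy=virt" to select virtual addressing.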
52 #define UVM_PARAM_PEER_COPY_VIRTUAL "virt"
53 #define UVM_PARAM_PEER_COPY_PHYSICAL "phys"
54 static char *uvm_peer_copy = UVM_PARAM_PEER_COPY_PHYSICAL;
55 module_param(uvm_peer_copy, charp, S_IRUGO);
56 MODULE_PARM_DESC(uvm_peer_copy, "Choose the addressing mode for peer copying, options: "
57                                 UVM_PARAM_PEER_COPY_PHYSICAL " [default] or " UVM_PARAM_PEER_COPY_VIRTUAL ". "
58                                 "Valid for Ampere+ GPUs.");
59 
60 static void remove_gpu(uvm_gpu_t *gpu);
61 static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
62 static NV_STATUS discover_nvlink_peers(uvm_gpu_t *gpu);
63 static void destroy_nvlink_peers(uvm_gpu_t *gpu);
64 
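// Translates an instance pointer RB tree node back to its enclosing
// uvm_user_channel_t.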
65 static uvm_user_channel_t *get_user_channel(uvm_rb_tree_node_t *node)
66 {
67     return container_of(node, uvm_user_channel_t, instance_ptr.node);
68 }
69 
70 static uvm_gpu_link_type_t get_gpu_link_type(UVM_LINK_TYPE link_type)
71 {
72     switch (link_type) {
73         case UVM_LINK_TYPE_PCIE:
74             return UVM_GPU_LINK_PCIE;
75         case UVM_LINK_TYPE_NVLINK_1:
76             return UVM_GPU_LINK_NVLINK_1;
77         case UVM_LINK_TYPE_NVLINK_2:
78             return UVM_GPU_LINK_NVLINK_2;
79         case UVM_LINK_TYPE_NVLINK_3:
80             return UVM_GPU_LINK_NVLINK_3;
81         case UVM_LINK_TYPE_NVLINK_4:
82             return UVM_GPU_LINK_NVLINK_4;
83         case UVM_LINK_TYPE_C2C:
84             return UVM_GPU_LINK_C2C;
85         default:
86             return UVM_GPU_LINK_INVALID;
87     }
88 }
89 
90 static void fill_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo *gpu_info)
91 {
92     char uuid_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
93 
94     parent_gpu->rm_info = *gpu_info;
95 
96     parent_gpu->system_bus.link = get_gpu_link_type(gpu_info->sysmemLink);
97     UVM_ASSERT(parent_gpu->system_bus.link != UVM_GPU_LINK_INVALID);
98 
99     parent_gpu->system_bus.link_rate_mbyte_per_s = gpu_info->sysmemLinkRateMBps;
100 
101     if (gpu_info->systemMemoryWindowSize > 0) {
102         // memory_window_end is inclusive but uvm_gpu_is_coherent() checks
103         // memory_window_end > memory_window_start as its condition.
104         UVM_ASSERT(gpu_info->systemMemoryWindowSize > 1);
105         parent_gpu->system_bus.memory_window_start = gpu_info->systemMemoryWindowStart;
106         parent_gpu->system_bus.memory_window_end   = gpu_info->systemMemoryWindowStart +
107                                                      gpu_info->systemMemoryWindowSize - 1;
108     }
109 
110     parent_gpu->nvswitch_info.is_nvswitch_connected = gpu_info->connectedToSwitch;
111 
    // nvswitch is routed via physical pages, where the upper 13 bits of the
    // 47-bit address space hold the routing information for each peer.
    // Currently, this is limited to a 16GB framebuffer window size
    // (2^(47-13) bytes).
115     if (parent_gpu->nvswitch_info.is_nvswitch_connected)
116         parent_gpu->nvswitch_info.fabric_memory_window_start = gpu_info->nvswitchMemoryWindowStart;
117 
118     format_uuid_to_buffer(uuid_buffer, sizeof(uuid_buffer), &parent_gpu->uuid);
119     snprintf(parent_gpu->name,
120              sizeof(parent_gpu->name),
121              "ID %u: %s: %s",
122              uvm_id_value(parent_gpu->id),
123              parent_gpu->rm_info.name,
124              uuid_buffer);
125 }
126 
127 static NV_STATUS get_gpu_caps(uvm_gpu_t *gpu)
128 {
129     NV_STATUS status;
130     UvmGpuCaps gpu_caps;
131 
132     memset(&gpu_caps, 0, sizeof(gpu_caps));
133 
134     status = uvm_rm_locked_call(nvUvmInterfaceQueryCaps(uvm_gpu_device_handle(gpu), &gpu_caps));
135     if (status != NV_OK)
136         return status;
137 
138     if (gpu_caps.numaEnabled) {
139         UVM_ASSERT(uvm_gpu_is_coherent(gpu->parent));
140         gpu->mem_info.numa.enabled = true;
141         gpu->mem_info.numa.node_id = gpu_caps.numaNodeId;
142     }
143     else {
144         UVM_ASSERT(!uvm_gpu_is_coherent(gpu->parent));
145     }
146 
147     return NV_OK;
148 }
149 
150 static NV_STATUS alloc_and_init_address_space(uvm_gpu_t *gpu)
151 {
152     NV_STATUS status;
153     UvmGpuAddressSpaceInfo gpu_address_space_info = {0};
154 
155     status = uvm_rm_locked_call(nvUvmInterfaceAddressSpaceCreate(uvm_gpu_device_handle(gpu),
156                                                                  gpu->parent->rm_va_base,
157                                                                  gpu->parent->rm_va_size,
158                                                                  &gpu->rm_address_space,
159                                                                  &gpu_address_space_info));
160     if (status != NV_OK)
161         return status;
162 
163     gpu->big_page.internal_size = gpu_address_space_info.bigPageSize;
164 
165     gpu->time.time0_register = gpu_address_space_info.time0Offset;
166     gpu->time.time1_register = gpu_address_space_info.time1Offset;
167 
168     gpu->max_subcontexts = gpu_address_space_info.maxSubctxCount;
169 
170     return NV_OK;
171 }
172 
173 static NV_STATUS get_gpu_fb_info(uvm_gpu_t *gpu)
174 {
175     NV_STATUS status;
176     UvmGpuFbInfo fb_info = {0};
177 
178     status = uvm_rm_locked_call(nvUvmInterfaceGetFbInfo(uvm_gpu_device_handle(gpu), &fb_info));
179     if (status != NV_OK)
180         return status;
181 
182     if (!fb_info.bZeroFb) {
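        // heapSize and reservedHeapSize are expressed in KB, hence the
        // conversion to bytes below.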
183         gpu->mem_info.size = ((NvU64)fb_info.heapSize + fb_info.reservedHeapSize) * 1024;
184         gpu->mem_info.max_allocatable_address = fb_info.maxAllocatableAddress;
185     }
186 
187     return NV_OK;
188 }
189 
190 static NV_STATUS get_gpu_ecc_info(uvm_gpu_t *gpu)
191 {
192     NV_STATUS status;
193     UvmGpuEccInfo ecc_info = {0};
194 
195     status = uvm_rm_locked_call(nvUvmInterfaceGetEccInfo(uvm_gpu_device_handle(gpu), &ecc_info));
196     if (status != NV_OK)
197         return status;
198 
199     gpu->ecc.enabled = ecc_info.bEccEnabled;
200     if (gpu->ecc.enabled) {
201         gpu->ecc.hw_interrupt_tree_location = (volatile NvU32*)((char*)ecc_info.eccReadLocation + ecc_info.eccOffset);
202         UVM_ASSERT(gpu->ecc.hw_interrupt_tree_location != NULL);
203 
204         gpu->ecc.mask = ecc_info.eccMask;
205         UVM_ASSERT(gpu->ecc.mask != 0);
206 
207         gpu->ecc.error_notifier = ecc_info.eccErrorNotifier;
208         UVM_ASSERT(gpu->ecc.error_notifier != NULL);
209     }
210 
211     return NV_OK;
212 }
213 
214 static bool gpu_supports_uvm(uvm_parent_gpu_t *parent_gpu)
215 {
216     // TODO: Bug 1757136: Add Linux SLI support. Until then, explicitly disable
217     //       UVM on SLI.
218     return parent_gpu->rm_info.subdeviceCount == 1;
219 }
220 
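// Returns true on all supported CPU architectures except PowerPC (PPC64LE),
// which does not use canonical form addresses.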
221 static bool platform_uses_canonical_form_address(void)
222 {
223     if (NVCPU_IS_PPC64LE)
224         return false;
225 
226     return true;
227 }
228 
229 bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
230 {
    // Lower and upper address ranges are typically present on platforms that
    // use canonical form addresses.
233     NvU64 max_va_lower;
234     NvU64 addr_end = addr + size - 1;
235     NvU8 gpu_addr_shift;
236     NvU8 cpu_addr_shift;
237     NvU8 addr_shift;
238 
239     // Watch out for calling this too early in init
240     UVM_ASSERT(gpu->address_space_tree.hal);
241     UVM_ASSERT(gpu->address_space_tree.hal->num_va_bits() < 64);
242     UVM_ASSERT(addr <= addr_end);
243     UVM_ASSERT(size > 0);
244 
245     gpu_addr_shift = gpu->address_space_tree.hal->num_va_bits();
246     cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
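    // e.g., cpu_addr_shift is 48 on x86-64 with 4-level paging and 57 with
    // 5-level paging.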
247     addr_shift = gpu_addr_shift;
248 
    // Pascal+ GPUs are capable of accessing kernel pointers in various modes
    // by applying the same upper-bit checks that x86, ARM, and Power
    // processors do. x86 and ARM use canonical form addresses. For ARM, even
    // with Top-Byte Ignore enabled, the following logic validates addresses
    // from the kernel VA range. PowerPC does not use canonical form addresses.
    // The following diagram illustrates the valid (V) VA regions that can be
    // mapped (or addressed) by the GPU/CPU when the CPU uses canonical form.
    // (C) regions are only accessible by the CPU. Similarly, (G) regions
    // are only accessible by the GPU. (X) regions are not addressable.
    // Note that we only consider (V) regions, i.e., address ranges that are
    // addressable by both the CPU and the GPU.
260     //
261     //               GPU MAX VA < CPU MAX VA           GPU MAX VA >= CPU MAX VA
262     //          0xF..F +----------------+          0xF..F +----------------+
263     //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
264     //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
265     //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
266     // GPU MIN UPPER VA|----------------| CPU MIN UPPER VA|----------------|
267     //                 |CCCCCCCCCCCCCCCC|                 |GGGGGGGGGGGGGGGG|
268     //                 |CCCCCCCCCCCCCCCC|                 |GGGGGGGGGGGGGGGG|
269     // CPU MIN UPPER VA|----------------| GPU MIN UPPER VA|----------------|
270     //                 |XXXXXXXXXXXXXXXX|                 |XXXXXXXXXXXXXXXX|
271     //                 |XXXXXXXXXXXXXXXX|                 |XXXXXXXXXXXXXXXX|
272     // CPU MAX LOWER VA|----------------| GPU MAX LOWER VA|----------------|
273     //                 |CCCCCCCCCCCCCCCC|                 |GGGGGGGGGGGGGGGG|
274     //                 |CCCCCCCCCCCCCCCC|                 |GGGGGGGGGGGGGGGG|
275     // GPU MAX LOWER VA|----------------| CPU MAX LOWER VA|----------------|
276     //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
277     //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
278     //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
279     //               0 +----------------+               0 +----------------+
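    //
    // Worked example (assuming an x86-64 CPU with 4-level paging and a
    // pre-Hopper GPU, so addr_shift below ends up as 48): max_va_lower is
    // 1ULL << 47 == 0x0000800000000000 and min_va_upper is the sign-extended
    // value 0xffff800000000000, so a range is addressable by both processors
    // iff it ends below 0x0000800000000000 or starts at or above
    // 0xffff800000000000.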
280 
281     // On canonical form address platforms and Pascal+ GPUs.
282     if (platform_uses_canonical_form_address() && gpu_addr_shift > 40) {
283         NvU64 min_va_upper;
284 
        // On x86, when cpu_addr_shift > gpu_addr_shift, it means the CPU uses
        // 5-level paging and the GPU is pre-Hopper. On Pascal-Ada GPUs (49b
        // wide VA) we set addr_shift to match a 4-level paging x86 (48b wide).
        // See uvm_parent_gpu_canonical_address() for more details.
289         if (cpu_addr_shift > gpu_addr_shift)
290             addr_shift = NVCPU_IS_X86_64 ? 48 : 49;
291         else if (gpu_addr_shift == 57)
292             addr_shift = gpu_addr_shift;
293         else
294             addr_shift = cpu_addr_shift;
295 
296         min_va_upper = (NvU64)((NvS64)(1ULL << 63) >> (64 - addr_shift));
297         max_va_lower = 1ULL << (addr_shift - 1);
298         return (addr_end < max_va_lower) || (addr >= min_va_upper);
299     }
300     else {
301         max_va_lower = 1ULL << addr_shift;
302         return addr_end < max_va_lower;
303     }
304 }
305 
306 // The internal UVM VAS does not use canonical form addresses.
307 bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
308 {
309     NvU64 addr_end = addr + size - 1;
310     NvU64 max_gpu_va;
311 
312     // Watch out for calling this too early in init
313     UVM_ASSERT(gpu->address_space_tree.hal);
314     UVM_ASSERT(gpu->address_space_tree.hal->num_va_bits() < 64);
315     UVM_ASSERT(addr <= addr_end);
316     UVM_ASSERT(size > 0);
317 
318     max_gpu_va = 1ULL << gpu->address_space_tree.hal->num_va_bits();
319     return addr_end < max_gpu_va;
320 }
321 
322 NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr)
323 {
324     NvU8 gpu_addr_shift;
325     NvU8 cpu_addr_shift;
326     NvU8 addr_shift;
327     NvU64 input_addr = addr;
328 
329     if (platform_uses_canonical_form_address()) {
330         // When the CPU VA width is larger than GPU's, it means that:
        // On ARM: the CPU is in LVA mode and the GPU is pre-Hopper.
332         // On x86: the CPU uses 5-level paging and the GPU is pre-Hopper.
333         // We sign-extend on the 48b on ARM and on the 47b on x86 to mirror the
334         // behavior of CPUs with smaller (than GPU) VA widths.
335         gpu_addr_shift = parent_gpu->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K)->num_va_bits();
336         cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
337 
338         if (cpu_addr_shift > gpu_addr_shift)
339             addr_shift = NVCPU_IS_X86_64 ? 48 : 49;
340         else if (gpu_addr_shift == 57)
341             addr_shift = gpu_addr_shift;
342         else
343             addr_shift = cpu_addr_shift;
344 
345         addr = (NvU64)((NvS64)(addr << (64 - addr_shift)) >> (64 - addr_shift));
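        // For example, with addr_shift == 48 the sign extension above turns
        // 0x0000ffffffffff00 into 0xffffffffffffff00 and leaves
        // 0x00007fffffffff00 unchanged.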
346 
        // This protection applies when the address is not covered by the
        // GPU's OOR_ADDR_CHECK. This can only happen when OOR_ADDR_CHECK is in
        // permissive (NO_CHECK) mode.
350         if ((addr << (64 - gpu_addr_shift)) != (input_addr << (64 - gpu_addr_shift)))
351             return input_addr;
352     }
353 
354     return addr;
355 }
356 
357 static void gpu_info_print_ce_caps(uvm_gpu_t *gpu, struct seq_file *s)
358 {
359     NvU32 i;
360     UvmGpuCopyEnginesCaps *ces_caps;
361     NV_STATUS status;
362 
363     ces_caps = uvm_kvmalloc_zero(sizeof(*ces_caps));
364     if (!ces_caps) {
365         UVM_SEQ_OR_DBG_PRINT(s, "supported_ces: unavailable (no memory)\n");
366         return;
367     }
368 
369     status = uvm_rm_locked_call(nvUvmInterfaceQueryCopyEnginesCaps(uvm_gpu_device_handle(gpu), ces_caps));
370     if (status != NV_OK) {
371         UVM_SEQ_OR_DBG_PRINT(s, "supported_ces: unavailable (query failed)\n");
372         goto out;
373     }
374 
375     UVM_SEQ_OR_DBG_PRINT(s, "supported_ces:\n");
376     for (i = 0; i < UVM_COPY_ENGINE_COUNT_MAX; ++i) {
377         UvmGpuCopyEngineCaps *ce_caps = ces_caps->copyEngineCaps + i;
378 
379         if (!ce_caps->supported)
380             continue;
381 
382         UVM_SEQ_OR_DBG_PRINT(s, " ce %u pce mask 0x%08x grce %u shared %u sysmem read %u sysmem write %u sysmem %u "
383                              "nvlink p2p %u p2p %u\n",
384                              i,
385                              ce_caps->cePceMask,
386                              ce_caps->grce,
387                              ce_caps->shared,
388                              ce_caps->sysmemRead,
389                              ce_caps->sysmemWrite,
390                              ce_caps->sysmem,
391                              ce_caps->nvlinkP2p,
392                              ce_caps->p2p);
393     }
394 
395 out:
396     uvm_kvfree(ces_caps);
397 }
398 
399 static const char *uvm_gpu_virt_type_string(UVM_VIRT_MODE virtMode)
400 {
401     BUILD_BUG_ON(UVM_VIRT_MODE_COUNT != 4);
402 
403     switch (virtMode) {
404         UVM_ENUM_STRING_CASE(UVM_VIRT_MODE_NONE);
405         UVM_ENUM_STRING_CASE(UVM_VIRT_MODE_LEGACY);
406         UVM_ENUM_STRING_CASE(UVM_VIRT_MODE_SRIOV_HEAVY);
407         UVM_ENUM_STRING_CASE(UVM_VIRT_MODE_SRIOV_STANDARD);
408         UVM_ENUM_STRING_DEFAULT();
409     }
410 }
411 
412 static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type)
413 {
414     BUILD_BUG_ON(UVM_GPU_LINK_MAX != 7);
415 
416     switch (link_type) {
417         UVM_ENUM_STRING_CASE(UVM_GPU_LINK_INVALID);
418         UVM_ENUM_STRING_CASE(UVM_GPU_LINK_PCIE);
419         UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_1);
420         UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_2);
421         UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_3);
422         UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_4);
423         UVM_ENUM_STRING_CASE(UVM_GPU_LINK_C2C);
424         UVM_ENUM_STRING_DEFAULT();
425     }
426 }
427 
428 static void gpu_info_print_common(uvm_gpu_t *gpu, struct seq_file *s)
429 {
430     const UvmGpuInfo *gpu_info = &gpu->parent->rm_info;
431     NvU64 num_pages_in;
432     NvU64 num_pages_out;
433     NvU64 mapped_cpu_pages_size;
434     NvU32 get, put;
435     unsigned int cpu;
436 
437     UVM_SEQ_OR_DBG_PRINT(s, "GPU %s\n", uvm_gpu_name(gpu));
438     UVM_SEQ_OR_DBG_PRINT(s, "retained_count                         %llu\n", uvm_gpu_retained_count(gpu));
439     UVM_SEQ_OR_DBG_PRINT(s, "ecc                                    %s\n", gpu->ecc.enabled ? "enabled" : "disabled");
440     if (gpu->parent->closest_cpu_numa_node == -1)
441         UVM_SEQ_OR_DBG_PRINT(s, "closest_cpu_numa_node                  n/a\n");
442     else
443         UVM_SEQ_OR_DBG_PRINT(s, "closest_cpu_numa_node                  %d\n", gpu->parent->closest_cpu_numa_node);
444 
445     if (!uvm_procfs_is_debug_enabled())
446         return;
447 
448     UVM_SEQ_OR_DBG_PRINT(s, "CPU link type                          %s\n",
449                          uvm_gpu_link_type_string(gpu->parent->system_bus.link));
450     UVM_SEQ_OR_DBG_PRINT(s, "CPU link bandwidth                     %uMBps\n",
451                          gpu->parent->system_bus.link_rate_mbyte_per_s);
452 
453     UVM_SEQ_OR_DBG_PRINT(s, "architecture                           0x%X\n", gpu_info->gpuArch);
454     UVM_SEQ_OR_DBG_PRINT(s, "implementation                         0x%X\n", gpu_info->gpuImplementation);
455     UVM_SEQ_OR_DBG_PRINT(s, "gpcs                                   %u\n", gpu_info->gpcCount);
456     UVM_SEQ_OR_DBG_PRINT(s, "max_gpcs                               %u\n", gpu_info->maxGpcCount);
457     UVM_SEQ_OR_DBG_PRINT(s, "tpcs                                   %u\n", gpu_info->tpcCount);
458     UVM_SEQ_OR_DBG_PRINT(s, "max_tpcs_per_gpc                       %u\n", gpu_info->maxTpcPerGpcCount);
459     UVM_SEQ_OR_DBG_PRINT(s, "host_class                             0x%X\n", gpu_info->hostClass);
460     UVM_SEQ_OR_DBG_PRINT(s, "ce_class                               0x%X\n", gpu_info->ceClass);
461     UVM_SEQ_OR_DBG_PRINT(s, "virtualization_mode                    %s\n",
462                          uvm_gpu_virt_type_string(gpu_info->virtMode));
463     UVM_SEQ_OR_DBG_PRINT(s, "big_page_size                          %u\n", gpu->big_page.internal_size);
464     UVM_SEQ_OR_DBG_PRINT(s, "rm_va_base                             0x%llx\n", gpu->parent->rm_va_base);
465     UVM_SEQ_OR_DBG_PRINT(s, "rm_va_size                             0x%llx\n", gpu->parent->rm_va_size);
466     UVM_SEQ_OR_DBG_PRINT(s, "vidmem_size                            %llu (%llu MBs)\n",
467                          gpu->mem_info.size,
468                          gpu->mem_info.size / (1024 * 1024));
469     UVM_SEQ_OR_DBG_PRINT(s, "vidmem_max_allocatable                 0x%llx (%llu MBs)\n",
470                          gpu->mem_info.max_allocatable_address,
471                          gpu->mem_info.max_allocatable_address / (1024 * 1024));
472 
473     if (gpu->mem_info.numa.enabled) {
474         NvU64 window_size = gpu->parent->system_bus.memory_window_end - gpu->parent->system_bus.memory_window_start + 1;
475         UVM_SEQ_OR_DBG_PRINT(s, "numa_node_id                           %u\n", uvm_gpu_numa_node(gpu));
476         UVM_SEQ_OR_DBG_PRINT(s, "memory_window_start                    0x%llx\n",
477                              gpu->parent->system_bus.memory_window_start);
478         UVM_SEQ_OR_DBG_PRINT(s, "memory_window_end                      0x%llx\n",
479                              gpu->parent->system_bus.memory_window_end);
480         UVM_SEQ_OR_DBG_PRINT(s, "system_memory_window_size              0x%llx (%llu MBs)\n",
481                              window_size,
482                              window_size / (1024 * 1024));
483     }
484 
485     if (gpu->parent->npu)
486         UVM_SEQ_OR_DBG_PRINT(s, "npu_domain                             %d\n", gpu->parent->npu->pci_domain);
487 
488     UVM_SEQ_OR_DBG_PRINT(s, "interrupts                             %llu\n", gpu->parent->isr.interrupt_count);
489 
490     if (gpu->parent->isr.replayable_faults.handling) {
491         UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_bh                   %llu\n",
492                              gpu->parent->isr.replayable_faults.stats.bottom_half_count);
493         UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_bh/cpu\n");
494         for_each_cpu(cpu, &gpu->parent->isr.replayable_faults.stats.cpus_used_mask) {
495             UVM_SEQ_OR_DBG_PRINT(s, "    cpu%02u                              %llu\n",
496                                  cpu,
497                                  gpu->parent->isr.replayable_faults.stats.cpu_exec_count[cpu]);
498         }
499         UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_buffer_entries       %u\n",
500                              gpu->parent->fault_buffer_info.replayable.max_faults);
501         UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_cached_get           %u\n",
502                              gpu->parent->fault_buffer_info.replayable.cached_get);
503         UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_cached_put           %u\n",
504                              gpu->parent->fault_buffer_info.replayable.cached_put);
505         UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_get                  %u\n",
506                              gpu->parent->fault_buffer_hal->read_get(gpu->parent));
507         UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_put                  %u\n",
508                              gpu->parent->fault_buffer_hal->read_put(gpu->parent));
509         UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_fault_batch_size     %u\n",
510                              gpu->parent->fault_buffer_info.max_batch_size);
511         UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_replay_policy        %s\n",
512                              uvm_perf_fault_replay_policy_string(gpu->parent->fault_buffer_info.replayable.replay_policy));
513         UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_num_faults           %llu\n",
514                              gpu->parent->stats.num_replayable_faults);
515     }
516     if (gpu->parent->isr.non_replayable_faults.handling) {
517         UVM_SEQ_OR_DBG_PRINT(s, "non_replayable_faults_bh               %llu\n",
518                              gpu->parent->isr.non_replayable_faults.stats.bottom_half_count);
519         UVM_SEQ_OR_DBG_PRINT(s, "non_replayable_faults_bh/cpu\n");
520         for_each_cpu(cpu, &gpu->parent->isr.non_replayable_faults.stats.cpus_used_mask) {
521             UVM_SEQ_OR_DBG_PRINT(s, "    cpu%02u                              %llu\n",
522                                  cpu,
523                                  gpu->parent->isr.non_replayable_faults.stats.cpu_exec_count[cpu]);
524         }
525         UVM_SEQ_OR_DBG_PRINT(s, "non_replayable_faults_buffer_entries   %u\n",
526                              gpu->parent->fault_buffer_info.non_replayable.max_faults);
527         UVM_SEQ_OR_DBG_PRINT(s, "non_replayable_faults_num_faults       %llu\n",
528                              gpu->parent->stats.num_non_replayable_faults);
529     }
530 
531     if (gpu->parent->isr.access_counters.handling_ref_count > 0) {
532         UVM_SEQ_OR_DBG_PRINT(s, "access_counters_bh                     %llu\n",
533                              gpu->parent->isr.access_counters.stats.bottom_half_count);
534         UVM_SEQ_OR_DBG_PRINT(s, "access_counters_bh/cpu\n");
535         for_each_cpu(cpu, &gpu->parent->isr.access_counters.stats.cpus_used_mask) {
536             UVM_SEQ_OR_DBG_PRINT(s, "    cpu%02u                              %llu\n",
537                                  cpu,
538                                  gpu->parent->isr.access_counters.stats.cpu_exec_count[cpu]);
539         }
540         UVM_SEQ_OR_DBG_PRINT(s, "access_counters_buffer_entries         %u\n",
541                              gpu->parent->access_counter_buffer_info.max_notifications);
542         UVM_SEQ_OR_DBG_PRINT(s, "access_counters_cached_get             %u\n",
543                              gpu->parent->access_counter_buffer_info.cached_get);
544         UVM_SEQ_OR_DBG_PRINT(s, "access_counters_cached_put             %u\n",
545                              gpu->parent->access_counter_buffer_info.cached_put);
546 
547         get = UVM_GPU_READ_ONCE(*gpu->parent->access_counter_buffer_info.rm_info.pAccessCntrBufferGet);
548         put = UVM_GPU_READ_ONCE(*gpu->parent->access_counter_buffer_info.rm_info.pAccessCntrBufferPut);
549 
550         UVM_SEQ_OR_DBG_PRINT(s, "access_counters_get                    %u\n", get);
551         UVM_SEQ_OR_DBG_PRINT(s, "access_counters_put                    %u\n", put);
552     }
553 
554     num_pages_out = atomic64_read(&gpu->parent->stats.num_pages_out);
555     num_pages_in = atomic64_read(&gpu->parent->stats.num_pages_in);
556     mapped_cpu_pages_size = atomic64_read(&gpu->parent->mapped_cpu_pages_size);
557 
558     UVM_SEQ_OR_DBG_PRINT(s, "migrated_pages_in                      %llu (%llu MB)\n",
559                          num_pages_in,
560                          (num_pages_in * (NvU64)PAGE_SIZE) / (1024u * 1024u));
561     UVM_SEQ_OR_DBG_PRINT(s, "migrated_pages_out                     %llu (%llu MB)\n",
562                          num_pages_out,
563                          (num_pages_out * (NvU64)PAGE_SIZE) / (1024u * 1024u));
564     UVM_SEQ_OR_DBG_PRINT(s, "mapped_cpu_pages_dma                   %llu (%llu MB)\n",
565                          mapped_cpu_pages_size / PAGE_SIZE,
566                          mapped_cpu_pages_size / (1024u * 1024u));
567 
568     gpu_info_print_ce_caps(gpu, s);
569 
570     if (uvm_conf_computing_mode_enabled(gpu)) {
571         UVM_SEQ_OR_DBG_PRINT(s, "dma_buffer_pool_num_buffers             %lu\n",
572                              gpu->conf_computing.dma_buffer_pool.num_dma_buffers);
573     }
574 }
575 
static void gpu_fault_stats_print_common(uvm_parent_gpu_t *parent_gpu, struct seq_file *s)
578 {
579     NvU64 num_pages_in;
580     NvU64 num_pages_out;
581 
582     UVM_ASSERT(uvm_procfs_is_debug_enabled());
583 
584     UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults      %llu\n", parent_gpu->stats.num_replayable_faults);
585     UVM_SEQ_OR_DBG_PRINT(s, "duplicates             %llu\n",
586                          parent_gpu->fault_buffer_info.replayable.stats.num_duplicate_faults);
587     UVM_SEQ_OR_DBG_PRINT(s, "faults_by_access_type:\n");
588     UVM_SEQ_OR_DBG_PRINT(s, "  prefetch             %llu\n",
589                          parent_gpu->fault_buffer_info.replayable.stats.num_prefetch_faults);
590     UVM_SEQ_OR_DBG_PRINT(s, "  read                 %llu\n",
591                          parent_gpu->fault_buffer_info.replayable.stats.num_read_faults);
592     UVM_SEQ_OR_DBG_PRINT(s, "  write                %llu\n",
593                          parent_gpu->fault_buffer_info.replayable.stats.num_write_faults);
594     UVM_SEQ_OR_DBG_PRINT(s, "  atomic               %llu\n",
595                          parent_gpu->fault_buffer_info.replayable.stats.num_atomic_faults);
596     num_pages_out = atomic64_read(&parent_gpu->fault_buffer_info.replayable.stats.num_pages_out);
597     num_pages_in = atomic64_read(&parent_gpu->fault_buffer_info.replayable.stats.num_pages_in);
598     UVM_SEQ_OR_DBG_PRINT(s, "migrations:\n");
599     UVM_SEQ_OR_DBG_PRINT(s, "  num_pages_in         %llu (%llu MB)\n", num_pages_in,
600                          (num_pages_in * (NvU64)PAGE_SIZE) / (1024u * 1024u));
601     UVM_SEQ_OR_DBG_PRINT(s, "  num_pages_out        %llu (%llu MB)\n", num_pages_out,
602                          (num_pages_out * (NvU64)PAGE_SIZE) / (1024u * 1024u));
603     UVM_SEQ_OR_DBG_PRINT(s, "replays:\n");
604     UVM_SEQ_OR_DBG_PRINT(s, "  start                %llu\n",
605                          parent_gpu->fault_buffer_info.replayable.stats.num_replays);
606     UVM_SEQ_OR_DBG_PRINT(s, "  start_ack_all        %llu\n",
607                          parent_gpu->fault_buffer_info.replayable.stats.num_replays_ack_all);
608     UVM_SEQ_OR_DBG_PRINT(s, "non_replayable_faults  %llu\n", parent_gpu->stats.num_non_replayable_faults);
609     UVM_SEQ_OR_DBG_PRINT(s, "faults_by_access_type:\n");
610     UVM_SEQ_OR_DBG_PRINT(s, "  read                 %llu\n",
611                          parent_gpu->fault_buffer_info.non_replayable.stats.num_read_faults);
612     UVM_SEQ_OR_DBG_PRINT(s, "  write                %llu\n",
613                          parent_gpu->fault_buffer_info.non_replayable.stats.num_write_faults);
614     UVM_SEQ_OR_DBG_PRINT(s, "  atomic               %llu\n",
615                          parent_gpu->fault_buffer_info.non_replayable.stats.num_atomic_faults);
616     UVM_SEQ_OR_DBG_PRINT(s, "faults_by_addressing:\n");
617     UVM_SEQ_OR_DBG_PRINT(s, "  virtual              %llu\n",
618                          parent_gpu->stats.num_non_replayable_faults -
619                          parent_gpu->fault_buffer_info.non_replayable.stats.num_physical_faults);
620     UVM_SEQ_OR_DBG_PRINT(s, "  physical             %llu\n",
621                          parent_gpu->fault_buffer_info.non_replayable.stats.num_physical_faults);
622     num_pages_out = atomic64_read(&parent_gpu->fault_buffer_info.non_replayable.stats.num_pages_out);
623     num_pages_in = atomic64_read(&parent_gpu->fault_buffer_info.non_replayable.stats.num_pages_in);
624     UVM_SEQ_OR_DBG_PRINT(s, "migrations:\n");
625     UVM_SEQ_OR_DBG_PRINT(s, "  num_pages_in         %llu (%llu MB)\n", num_pages_in,
626                          (num_pages_in * (NvU64)PAGE_SIZE) / (1024u * 1024u));
627     UVM_SEQ_OR_DBG_PRINT(s, "  num_pages_out        %llu (%llu MB)\n", num_pages_out,
628                          (num_pages_out * (NvU64)PAGE_SIZE) / (1024u * 1024u));
629 }
630 
631 static void gpu_access_counters_print_common(uvm_parent_gpu_t *parent_gpu, struct seq_file *s)
632 {
633     NvU64 num_pages_in;
634     NvU64 num_pages_out;
635 
636     UVM_ASSERT(uvm_procfs_is_debug_enabled());
637 
638     num_pages_out = atomic64_read(&parent_gpu->access_counter_buffer_info.stats.num_pages_out);
639     num_pages_in = atomic64_read(&parent_gpu->access_counter_buffer_info.stats.num_pages_in);
640     UVM_SEQ_OR_DBG_PRINT(s, "migrations:\n");
641     UVM_SEQ_OR_DBG_PRINT(s, "  num_pages_in         %llu (%llu MB)\n", num_pages_in,
642                          (num_pages_in * (NvU64)PAGE_SIZE) / (1024u * 1024u));
643     UVM_SEQ_OR_DBG_PRINT(s, "  num_pages_out        %llu (%llu MB)\n", num_pages_out,
644                          (num_pages_out * (NvU64)PAGE_SIZE) / (1024u * 1024u));
645 }
646 
647 void uvm_gpu_print(uvm_gpu_t *gpu)
648 {
649     gpu_info_print_common(gpu, NULL);
650 }
651 
652 static void gpu_peer_caps_print(uvm_gpu_t **gpu_pair, struct seq_file *s)
653 {
654     bool nvswitch_connected;
655     uvm_aperture_t aperture;
656     uvm_gpu_peer_t *peer_caps;
657     uvm_gpu_t *local;
658     uvm_gpu_t *remote;
659 
660     UVM_ASSERT(uvm_procfs_is_debug_enabled());
661 
662     local = gpu_pair[0];
663     remote = gpu_pair[1];
664     peer_caps = uvm_gpu_peer_caps(local, remote);
665     aperture = uvm_gpu_peer_aperture(local, remote);
666     nvswitch_connected = uvm_gpus_are_nvswitch_connected(local, remote);
667     UVM_SEQ_OR_DBG_PRINT(s, "Link type                      %s\n", uvm_gpu_link_type_string(peer_caps->link_type));
668     UVM_SEQ_OR_DBG_PRINT(s, "Bandwidth                      %uMBps\n", peer_caps->total_link_line_rate_mbyte_per_s);
669     UVM_SEQ_OR_DBG_PRINT(s, "Aperture                       %s\n", uvm_aperture_string(aperture));
670     UVM_SEQ_OR_DBG_PRINT(s, "Connected through NVSWITCH     %s\n", nvswitch_connected ? "True" : "False");
671     UVM_SEQ_OR_DBG_PRINT(s, "Refcount                       %llu\n", UVM_READ_ONCE(peer_caps->ref_count));
672 }
673 
674 static int nv_procfs_read_gpu_info(struct seq_file *s, void *v)
675 {
676     uvm_gpu_t *gpu = (uvm_gpu_t *)s->private;
677 
    if (!uvm_down_read_trylock(&g_uvm_global.pm.lock))
        return -EAGAIN;
680 
681     gpu_info_print_common(gpu, s);
682 
683     uvm_up_read(&g_uvm_global.pm.lock);
684 
685     return 0;
686 }
687 
688 static int nv_procfs_read_gpu_info_entry(struct seq_file *s, void *v)
689 {
690     UVM_ENTRY_RET(nv_procfs_read_gpu_info(s, v));
691 }
692 
693 static int nv_procfs_read_gpu_fault_stats(struct seq_file *s, void *v)
694 {
695     uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)s->private;
696 
    if (!uvm_down_read_trylock(&g_uvm_global.pm.lock))
        return -EAGAIN;
699 
700     gpu_fault_stats_print_common(parent_gpu, s);
701 
702     uvm_up_read(&g_uvm_global.pm.lock);
703 
704     return 0;
705 }
706 
707 static int nv_procfs_read_gpu_fault_stats_entry(struct seq_file *s, void *v)
708 {
709     UVM_ENTRY_RET(nv_procfs_read_gpu_fault_stats(s, v));
710 }
711 
712 static int nv_procfs_read_gpu_access_counters(struct seq_file *s, void *v)
713 {
714     uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)s->private;
715 
    if (!uvm_down_read_trylock(&g_uvm_global.pm.lock))
        return -EAGAIN;
718 
719     gpu_access_counters_print_common(parent_gpu, s);
720 
721     uvm_up_read(&g_uvm_global.pm.lock);
722 
723     return 0;
724 }
725 
726 static int nv_procfs_read_gpu_access_counters_entry(struct seq_file *s, void *v)
727 {
728     UVM_ENTRY_RET(nv_procfs_read_gpu_access_counters(s, v));
729 }
730 
731 UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_info_entry);
732 UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_fault_stats_entry);
733 UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_access_counters_entry);
734 
735 static NV_STATUS init_parent_procfs_dir(uvm_parent_gpu_t *parent_gpu)
736 {
737     struct proc_dir_entry *gpu_base_dir_entry;
738     char uuid_text_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
739     char gpu_dir_name[sizeof(uuid_text_buffer) + 1];
740 
741     if (!uvm_procfs_is_enabled())
742         return NV_OK;
743 
744     gpu_base_dir_entry = uvm_procfs_get_gpu_base_dir();
745 
746     format_uuid_to_buffer(uuid_text_buffer, sizeof(uuid_text_buffer), &parent_gpu->uuid);
747 
748     // Create UVM-GPU-${UUID} directory
749     snprintf(gpu_dir_name, sizeof(gpu_dir_name), "%s", uuid_text_buffer);
750 
751     parent_gpu->procfs.dir = NV_CREATE_PROC_DIR(gpu_dir_name, gpu_base_dir_entry);
752     if (parent_gpu->procfs.dir == NULL)
753         return NV_ERR_OPERATING_SYSTEM;
754 
755     return NV_OK;
756 }
757 
758 static void deinit_parent_procfs_dir(uvm_parent_gpu_t *parent_gpu)
759 {
760     proc_remove(parent_gpu->procfs.dir);
761 }
762 
763 static NV_STATUS init_parent_procfs_files(uvm_parent_gpu_t *parent_gpu)
764 {
765     // Fault and access counter files are debug only
766     if (!uvm_procfs_is_debug_enabled())
767         return NV_OK;
768 
769     parent_gpu->procfs.fault_stats_file = NV_CREATE_PROC_FILE("fault_stats",
770                                                               parent_gpu->procfs.dir,
771                                                               gpu_fault_stats_entry,
772                                                               parent_gpu);
773     if (parent_gpu->procfs.fault_stats_file == NULL)
774         return NV_ERR_OPERATING_SYSTEM;
775 
776     parent_gpu->procfs.access_counters_file = NV_CREATE_PROC_FILE("access_counters",
777                                                                   parent_gpu->procfs.dir,
778                                                                   gpu_access_counters_entry,
779                                                                   parent_gpu);
780     if (parent_gpu->procfs.access_counters_file == NULL)
781         return NV_ERR_OPERATING_SYSTEM;
782 
783     return NV_OK;
784 }
785 
786 static void deinit_parent_procfs_files(uvm_parent_gpu_t *parent_gpu)
787 {
788     proc_remove(parent_gpu->procfs.access_counters_file);
789     proc_remove(parent_gpu->procfs.fault_stats_file);
790 }
791 
792 static NV_STATUS init_procfs_dirs(uvm_gpu_t *gpu)
793 {
794     struct proc_dir_entry *gpu_base_dir_entry;
    char symlink_name[16]; // Holds a global_gpu_id_t value in decimal.
796     char uuid_text_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
797     char gpu_dir_name[sizeof(symlink_name) + sizeof(uuid_text_buffer) + 1];
798 
799     if (!uvm_procfs_is_enabled())
800         return NV_OK;
801 
802     format_uuid_to_buffer(uuid_text_buffer, sizeof(uuid_text_buffer), uvm_gpu_uuid(gpu));
803 
804     gpu_base_dir_entry = uvm_procfs_get_gpu_base_dir();
805 
806     // Create UVM-GPU-${UUID}/${sub_processor_index} directory
807     snprintf(gpu_dir_name, sizeof(gpu_dir_name), "%u", uvm_global_id_sub_processor_index(gpu->global_id));
808 
809     gpu->procfs.dir = NV_CREATE_PROC_DIR(gpu_dir_name, gpu->parent->procfs.dir);
810     if (gpu->procfs.dir == NULL)
811         return NV_ERR_OPERATING_SYSTEM;
812 
813     // Create symlink from ${global_gpu_id} to
814     // gpus/UVM-GPU-${UUID}/${sub_processor_index}
815     snprintf(symlink_name, sizeof(symlink_name), "%u", uvm_global_id_value(gpu->global_id));
816     snprintf(gpu_dir_name,
817              sizeof(gpu_dir_name),
818              "%s/%u",
819              uuid_text_buffer,
820              uvm_global_id_sub_processor_index(gpu->global_id));
821 
822     gpu->procfs.dir_symlink = proc_symlink(symlink_name, gpu_base_dir_entry, gpu_dir_name);
823     if (gpu->procfs.dir_symlink == NULL)
824         return NV_ERR_OPERATING_SYSTEM;
825 
826     // GPU peer files are debug only
827     if (!uvm_procfs_is_debug_enabled())
828         return NV_OK;
829 
830     gpu->procfs.dir_peers = NV_CREATE_PROC_DIR(UVM_PROC_GPUS_PEER_DIR_NAME, gpu->procfs.dir);
831     if (gpu->procfs.dir_peers == NULL)
832         return NV_ERR_OPERATING_SYSTEM;
833 
834     return NV_OK;
835 }
836 
// The kernel waits for any in-flight procfs readers to finish before the
// proc_remove() calls below return.
838 static void deinit_procfs_dirs(uvm_gpu_t *gpu)
839 {
840     proc_remove(gpu->procfs.dir_peers);
841     proc_remove(gpu->procfs.dir_symlink);
842     proc_remove(gpu->procfs.dir);
843 }
844 
845 static NV_STATUS init_procfs_files(uvm_gpu_t *gpu)
846 {
847     gpu->procfs.info_file = NV_CREATE_PROC_FILE("info", gpu->procfs.dir, gpu_info_entry, gpu);
848     if (gpu->procfs.info_file == NULL)
849         return NV_ERR_OPERATING_SYSTEM;
850 
851     return NV_OK;
852 }
853 
854 static void deinit_procfs_files(uvm_gpu_t *gpu)
855 {
856     proc_remove(gpu->procfs.info_file);
857 }
858 
859 static void deinit_procfs_peer_cap_files(uvm_gpu_peer_t *peer_caps)
860 {
861     proc_remove(peer_caps->procfs.peer_symlink_file[0]);
862     proc_remove(peer_caps->procfs.peer_symlink_file[1]);
863     proc_remove(peer_caps->procfs.peer_file[0]);
864     proc_remove(peer_caps->procfs.peer_file[1]);
865 }
866 
867 static NV_STATUS init_semaphore_pools(uvm_gpu_t *gpu)
868 {
869     NV_STATUS status;
870     uvm_gpu_t *other_gpu;
871 
872     status = uvm_gpu_semaphore_pool_create(gpu, &gpu->semaphore_pool);
873     if (status != NV_OK)
874         return status;
875 
    // When the Confidential Computing feature is enabled, a separate secure
    // pool is created that holds pages allocated in the CPR (Compute
    // Protected Region) of vidmem.
878     if (uvm_conf_computing_mode_enabled(gpu)) {
879         status = uvm_gpu_semaphore_secure_pool_create(gpu, &gpu->secure_semaphore_pool);
880         if (status != NV_OK)
881             return status;
882     }
883 
884     for_each_global_gpu(other_gpu) {
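        // Cross-GPU semaphore pool mappings are skipped entirely when
        // Confidential Computing is enabled.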
885         if (uvm_conf_computing_mode_enabled(gpu))
886             break;
887         if (other_gpu == gpu)
888             continue;
889         status = uvm_gpu_semaphore_pool_map_gpu(other_gpu->semaphore_pool, gpu);
890         if (status != NV_OK)
891             return status;
892     }
893 
894     return NV_OK;
895 }
896 
897 static void deinit_semaphore_pools(uvm_gpu_t *gpu)
898 {
899     uvm_gpu_t *other_gpu;
900 
901     for_each_global_gpu(other_gpu) {
902         if (other_gpu == gpu)
903             continue;
904         uvm_gpu_semaphore_pool_unmap_gpu(other_gpu->semaphore_pool, gpu);
905     }
906 
907     uvm_gpu_semaphore_pool_destroy(gpu->semaphore_pool);
908     uvm_gpu_semaphore_pool_destroy(gpu->secure_semaphore_pool);
909 }
910 
911 static NV_STATUS find_unused_global_gpu_id(uvm_parent_gpu_t *parent_gpu, uvm_global_gpu_id_t *out_id)
912 {
913     NvU32 i;
914 
915     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
916 
917     if (!parent_gpu) {
918         for (i = 0; i < UVM_MAX_GPUS; i++) {
919             if (!g_uvm_global.parent_gpus[i]) {
920                 *out_id = uvm_global_gpu_id_from_parent_index(i);
921                 return NV_OK;
922             }
923         }
924     }
925     else {
926         NvU32 sub_processor_index = find_first_zero_bit(parent_gpu->valid_gpus, UVM_ID_MAX_SUB_PROCESSORS);
927         if (sub_processor_index < UVM_ID_MAX_SUB_PROCESSORS) {
928             *out_id = uvm_global_gpu_id_from_sub_processor_index(parent_gpu->id, sub_processor_index);
929             return NV_OK;
930         }
931     }
932 
933     return NV_ERR_INSUFFICIENT_RESOURCES;
934 }
935 
936 // Allocates a uvm_parent_gpu_t, assigns the GPU ID, and sets up basic data
937 // structures, but leaves all other initialization up to the caller.
938 static NV_STATUS alloc_parent_gpu(const NvProcessorUuid *gpu_uuid,
939                                   uvm_gpu_id_t gpu_id,
940                                   uvm_parent_gpu_t **parent_gpu_out)
941 {
942     uvm_parent_gpu_t *parent_gpu;
943     NV_STATUS status;
944 
945     parent_gpu = uvm_kvmalloc_zero(sizeof(*parent_gpu));
946     if (!parent_gpu)
947         return NV_ERR_NO_MEMORY;
948 
949     parent_gpu->id = gpu_id;
950 
951     uvm_processor_uuid_copy(&parent_gpu->uuid, gpu_uuid);
952     uvm_sema_init(&parent_gpu->isr.replayable_faults.service_lock, 1, UVM_LOCK_ORDER_ISR);
953     uvm_sema_init(&parent_gpu->isr.non_replayable_faults.service_lock, 1, UVM_LOCK_ORDER_ISR);
954     uvm_sema_init(&parent_gpu->isr.access_counters.service_lock, 1, UVM_LOCK_ORDER_ISR);
955     uvm_spin_lock_irqsave_init(&parent_gpu->isr.interrupts_lock, UVM_LOCK_ORDER_LEAF);
956     uvm_spin_lock_init(&parent_gpu->instance_ptr_table_lock, UVM_LOCK_ORDER_LEAF);
957     uvm_rb_tree_init(&parent_gpu->instance_ptr_table);
958     uvm_rb_tree_init(&parent_gpu->tsg_table);
959 
960     // TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
961     status = errno_to_nv_status(nv_kthread_q_init(&parent_gpu->lazy_free_q, "vidmem lazy free"));
962 
963     nv_kref_init(&parent_gpu->gpu_kref);
964 
965     *parent_gpu_out = parent_gpu;
966 
967     return status;
968 }
969 
// Allocates a uvm_gpu_t struct, initializes the basic fields, and leaves all
// other initialization up to the caller.
972 static uvm_gpu_t *alloc_gpu(uvm_parent_gpu_t *parent_gpu, uvm_global_gpu_id_t global_gpu_id)
973 {
974     NvU32 sub_processor_index;
975     uvm_gpu_t *gpu;
976 
977     gpu = uvm_kvmalloc_zero(sizeof(*gpu));
978     if (!gpu)
979         return gpu;
980 
981     gpu->id = parent_gpu->id;
982     gpu->global_id = global_gpu_id;
983     gpu->parent = parent_gpu;
984 
985     // Initialize enough of the gpu struct for remove_gpu to be called
986     gpu->magic = UVM_GPU_MAGIC_VALUE;
987     uvm_spin_lock_init(&gpu->peer_info.peer_gpus_lock, UVM_LOCK_ORDER_LEAF);
988 
989     sub_processor_index = uvm_global_id_sub_processor_index(global_gpu_id);
990     parent_gpu->gpus[sub_processor_index] = gpu;
991 
992     return gpu;
993 }
994 
995 static NV_STATUS configure_address_space(uvm_gpu_t *gpu)
996 {
997     NV_STATUS status;
998     NvU32 num_entries;
999     NvU64 va_size;
1000     NvU64 va_per_entry;
1001 
1002     status = uvm_page_tree_init(gpu,
1003                                 NULL,
1004                                 UVM_PAGE_TREE_TYPE_KERNEL,
1005                                 gpu->big_page.internal_size,
1006                                 uvm_gpu_page_tree_init_location(gpu),
1007                                 &gpu->address_space_tree);
1008     if (status != NV_OK) {
1009         UVM_ERR_PRINT("Initializing the page tree failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1010         return status;
1011     }
1012 
1013     num_entries = uvm_mmu_page_tree_entries(&gpu->address_space_tree, 0, UVM_PAGE_SIZE_AGNOSTIC);
1014 
1015     UVM_ASSERT(gpu->address_space_tree.hal->num_va_bits() < 64);
1016     va_size = 1ull << gpu->address_space_tree.hal->num_va_bits();
1017     va_per_entry = va_size / num_entries;
1018 
1019     // Make sure that RM's part of the VA is aligned to the VA covered by a
1020     // single top level PDE.
1021     UVM_ASSERT_MSG(gpu->parent->rm_va_base % va_per_entry == 0,
1022                    "va_base 0x%llx va_per_entry 0x%llx\n", gpu->parent->rm_va_base, va_per_entry);
1023     UVM_ASSERT_MSG(gpu->parent->rm_va_size % va_per_entry == 0,
1024                    "va_size 0x%llx va_per_entry 0x%llx\n", gpu->parent->rm_va_size, va_per_entry);
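    // For example, if num_va_bits() were 49 and the top level held two
    // entries, va_per_entry would be 1ULL << 48 and both rm_va_base and
    // rm_va_size would have to be 256TB-aligned.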
1025 
1026     status = uvm_rm_locked_call(nvUvmInterfaceSetPageDirectory(gpu->rm_address_space,
1027             uvm_page_tree_pdb(&gpu->address_space_tree)->addr.address, num_entries,
1028             uvm_page_tree_pdb(&gpu->address_space_tree)->addr.aperture == UVM_APERTURE_VID,
1029             -1U /* Invalid PASID for internal RM address space */));
1030     if (status != NV_OK) {
1031         UVM_ERR_PRINT("nvUvmInterfaceSetPageDirectory() failed: %s, GPU %s\n",
1032                       nvstatusToString(status),
1033                       uvm_gpu_name(gpu));
1034         return status;
1035     }
1036     gpu->rm_address_space_moved_to_page_tree = true;
1037 
1038     return NV_OK;
1039 }
1040 
1041 static void deconfigure_address_space(uvm_gpu_t *gpu)
1042 {
1043     if (gpu->rm_address_space_moved_to_page_tree)
1044         uvm_rm_locked_call_void(nvUvmInterfaceUnsetPageDirectory(gpu->rm_address_space));
1045 
1046     if (gpu->address_space_tree.root)
1047         uvm_page_tree_deinit(&gpu->address_space_tree);
1048 }
1049 
1050 static NV_STATUS service_interrupts(uvm_parent_gpu_t *parent_gpu)
1051 {
    // Asking RM to service interrupts from the top-half interrupt handler
    // would very likely deadlock.
1054     UVM_ASSERT(!in_interrupt());
1055 
1056     return uvm_rm_locked_call(nvUvmInterfaceServiceDeviceInterruptsRM(parent_gpu->rm_device));
1057 }
1058 
1059 NV_STATUS uvm_gpu_check_ecc_error(uvm_gpu_t *gpu)
1060 {
1061     NV_STATUS status = uvm_gpu_check_ecc_error_no_rm(gpu);
1062 
1063     if (status == NV_OK || status != NV_WARN_MORE_PROCESSING_REQUIRED)
1064         return status;
1065 
1066     // An interrupt that might mean an ECC error needs to be serviced.
1067     UVM_ASSERT(status == NV_WARN_MORE_PROCESSING_REQUIRED);
1068 
1069     status = service_interrupts(gpu->parent);
1070     if (status != NV_OK) {
1071         UVM_ERR_PRINT("Servicing interrupts failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1072         return status;
1073     }
1074 
1075     // After servicing interrupts the ECC error notifier should be current.
1076     if (*gpu->ecc.error_notifier) {
1077         UVM_ERR_PRINT("ECC error encountered, GPU %s\n", uvm_gpu_name(gpu));
1078         uvm_global_set_fatal_error(NV_ERR_ECC_ERROR);
1079         return NV_ERR_ECC_ERROR;
1080     }
1081 
1082     return NV_OK;
1083 }
1084 
1085 static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
1086                                  const NvProcessorUuid *gpu_uuid,
1087                                  const UvmGpuInfo *gpu_info,
1088                                  const UvmGpuPlatformInfo *gpu_platform_info)
1089 {
1090     NV_STATUS status;
1091 
1092     status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(g_uvm_global.rm_session_handle,
1093                                                            gpu_info,
1094                                                            gpu_uuid,
1095                                                            &parent_gpu->rm_device,
1096                                                            NV_FALSE));
1097     if (status != NV_OK) {
1098         UVM_ERR_PRINT("Creating RM device failed: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name);
1099         return status;
1100     }
1101 
1102     status = uvm_conf_computing_init_parent_gpu(parent_gpu);
1103     if (status != NV_OK) {
1104         UVM_ERR_PRINT("Confidential computing: %s, GPU %s\n",
1105                       nvstatusToString(status), parent_gpu->name);
1106         return status;
1107     }
1108 
1109     parent_gpu->pci_dev = gpu_platform_info->pci_dev;
1110     parent_gpu->closest_cpu_numa_node = dev_to_node(&parent_gpu->pci_dev->dev);
1111     parent_gpu->dma_addressable_start = gpu_platform_info->dma_addressable_start;
1112     parent_gpu->dma_addressable_limit = gpu_platform_info->dma_addressable_limit;
1113 
1114     parent_gpu->sli_enabled = (gpu_info->subdeviceCount > 1);
1115 
1116     parent_gpu->virt_mode = gpu_info->virtMode;
1117     if (parent_gpu->virt_mode == UVM_VIRT_MODE_LEGACY) {
1118         UVM_ERR_PRINT("Failed to init GPU %s. UVM is not supported in legacy virtualization mode\n", parent_gpu->name);
1119         return NV_ERR_NOT_SUPPORTED;
1120     }
1121 
1122     if (gpu_info->isSimulated)
1123         ++g_uvm_global.num_simulated_devices;
1124 
1125     status = init_parent_procfs_dir(parent_gpu);
1126     if (status != NV_OK) {
1127         UVM_ERR_PRINT("Failed to init parent procfs dir: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name);
1128         return status;
1129     }
1130 
1131     status = uvm_hal_init_gpu(parent_gpu);
1132     if (status != NV_OK) {
1133         UVM_ERR_PRINT("Failed to init GPU hal: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name);
1134         return status;
1135     }
1136 
1137     uvm_hal_init_properties(parent_gpu);
1138 
1139     UVM_ASSERT(!parent_gpu->rm_info.smcEnabled || parent_gpu->smc.supported);
1140     parent_gpu->smc.enabled = !!parent_gpu->rm_info.smcEnabled;
1141 
1142     uvm_mmu_init_gpu_chunk_sizes(parent_gpu);
1143 
1144     status = uvm_ats_add_gpu(parent_gpu);
1145     if (status != NV_OK) {
1146         UVM_ERR_PRINT("uvm_ats_add_gpu failed: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name);
1147         return status;
1148     }
1149 
1150     status = init_parent_procfs_files(parent_gpu);
1151     if (status != NV_OK) {
1152         UVM_ERR_PRINT("Failed to init parent procfs files: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name);
1153         return status;
1154     }
1155 
1156     status = uvm_gpu_init_isr(parent_gpu);
1157     if (status != NV_OK) {
1158         UVM_ERR_PRINT("Failed to init ISR: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name);
1159         return status;
1160     }
1161 
1162     return NV_OK;
1163 }
1164 
1165 static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
1166 {
1167     NV_STATUS status;
1168 
1169     // Presently, an RM client can only subscribe to a single partition per
1170     // GPU. Therefore, UVM needs to create several RM clients. For simplicity,
1171     // and since P2P is not supported when SMC partitions are created, we
1172     // create a client (session) per GPU partition.
1173     if (gpu->parent->smc.enabled) {
1174         UvmPlatformInfo platform_info;
1175         status = uvm_rm_locked_call(nvUvmInterfaceSessionCreate(&gpu->smc.rm_session_handle, &platform_info));
1176         if (status != NV_OK) {
1177             UVM_ERR_PRINT("Creating RM session failed: %s\n", nvstatusToString(status));
1178             return status;
1179         }
1180 
1181         status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_gpu_session_handle(gpu),
1182                                                                gpu_info,
1183                                                                uvm_gpu_uuid(gpu),
1184                                                                &gpu->smc.rm_device,
1185                                                                NV_TRUE));
1186         if (status != NV_OK) {
1187             UVM_ERR_PRINT("Creating RM device failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1188             return status;
1189         }
1190     }
1191 
1192     gpu->smc.swizz_id = gpu_info->smcSwizzId;
1193 
1194     // Initialize the per-GPU procfs dirs as early as possible so that other
1195     // parts of the driver can add files in them as part of their per-GPU init.
1196     status = init_procfs_dirs(gpu);
1197     if (status != NV_OK) {
1198         UVM_ERR_PRINT("Failed to init procfs dirs: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1199         return status;
1200     }
1201 
1202     status = get_gpu_caps(gpu);
1203     if (status != NV_OK) {
1204         UVM_ERR_PRINT("Failed to get GPU caps: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1205         return status;
1206     }
1207 
1208     uvm_mmu_init_gpu_peer_addresses(gpu);
1209 
1210     status = alloc_and_init_address_space(gpu);
1211     if (status != NV_OK) {
1212         UVM_ERR_PRINT("Creating RM address space failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1213         return status;
1214     }
1215 
1216     status = get_gpu_fb_info(gpu);
1217     if (status != NV_OK) {
1218         UVM_ERR_PRINT("Failed to get GPU FB info: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1219         return status;
1220     }
1221 
1222     status = get_gpu_ecc_info(gpu);
1223     if (status != NV_OK) {
1224         UVM_ERR_PRINT("Failed to get GPU ECC info: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1225         return status;
1226     }
1227 
1228     status = uvm_pmm_gpu_init(&gpu->pmm);
1229     if (status != NV_OK) {
1230         UVM_ERR_PRINT("PMM initialization failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1231         return status;
1232     }
1233 
1234     status = uvm_pmm_sysmem_mappings_init(gpu, &gpu->pmm_reverse_sysmem_mappings);
1235     if (status != NV_OK) {
1236         UVM_ERR_PRINT("CPU PMM MMIO initialization failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1237         return status;
1238     }
1239 
1240     status = init_semaphore_pools(gpu);
1241     if (status != NV_OK) {
1242         UVM_ERR_PRINT("Failed to initialize the semaphore pool: %s, GPU %s\n",
1243                       nvstatusToString(status),
1244                       uvm_gpu_name(gpu));
1245         return status;
1246     }
1247 
1248     status = uvm_channel_manager_create(gpu, &gpu->channel_manager);
1249     if (status != NV_OK) {
1250         UVM_ERR_PRINT("Failed to initialize the channel manager: %s, GPU %s\n",
1251                       nvstatusToString(status),
1252                       uvm_gpu_name(gpu));
1253         return status;
1254     }
1255 
1256     status = configure_address_space(gpu);
1257     if (status != NV_OK) {
1258         UVM_ERR_PRINT("Failed to configure the GPU address space: %s, GPU %s\n",
1259                       nvstatusToString(status),
1260                       uvm_gpu_name(gpu));
1261         return status;
1262     }
1263 
1264     status = uvm_mmu_create_flat_mappings(gpu);
1265     if (status != NV_OK) {
1266         UVM_ERR_PRINT("Creating flat mappings failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1267         return status;
1268     }
1269 
1270     status = uvm_conf_computing_gpu_init(gpu);
1271     if (status != NV_OK) {
1272         UVM_ERR_PRINT("Failed to initialize Confidential Compute: %s for GPU %s\n",
1273                       nvstatusToString(status),
1274                       uvm_gpu_name(gpu));
1275         return status;
1276     }
1277 
1278     status = init_procfs_files(gpu);
1279     if (status != NV_OK) {
1280         UVM_ERR_PRINT("Failed to init procfs files: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1281         return status;
1282     }
1283 
1284     status = uvm_perf_heuristics_add_gpu(gpu);
1285     if (status != NV_OK) {
1286         UVM_ERR_PRINT("Failed to init heuristics: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1287         return status;
1288     }
1289 
1290     return NV_OK;
1291 }
1292 
1293 // Add a new gpu and register it with RM
1294 // TODO: Bug 2844714: Split parent-specific parts of this function out into a
1295 // separate add_parent_gpu() function.
1296 static NV_STATUS add_gpu(const NvProcessorUuid *gpu_uuid,
1297                          const uvm_global_gpu_id_t global_gpu_id,
1298                          const UvmGpuInfo *gpu_info,
1299                          const UvmGpuPlatformInfo *gpu_platform_info,
1300                          uvm_parent_gpu_t *parent_gpu,
1301                          uvm_gpu_t **gpu_out)
1302 {
1303     NV_STATUS status;
1304     bool alloc_parent = (parent_gpu == NULL);
1305     uvm_gpu_t *gpu = NULL;
1306 
1307     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
1308 
1309     if (alloc_parent) {
1310         status = alloc_parent_gpu(gpu_uuid, uvm_gpu_id_from_global_gpu_id(global_gpu_id), &parent_gpu);
1311         if (status != NV_OK)
1312             return status;
1313     }
1314 
1315     gpu = alloc_gpu(parent_gpu, global_gpu_id);
1316     if (!gpu) {
1317         if (alloc_parent)
1318             uvm_parent_gpu_kref_put(parent_gpu);
1319 
1320         return NV_ERR_NO_MEMORY;
1321     }
1322 
1323     parent_gpu->num_retained_gpus++;
1324 
1325     if (alloc_parent)
1326         fill_gpu_info(parent_gpu, gpu_info);
1327 
    // After this point, all error cleanup should be handled by remove_gpu()
1329 
1330     if (!gpu_supports_uvm(parent_gpu)) {
1331         UVM_DBG_PRINT("Registration of non-UVM-capable GPU attempted: GPU %s\n", uvm_gpu_name(gpu));
1332         status = NV_ERR_NOT_SUPPORTED;
1333         goto error;
1334     }
1335 
1336     if (alloc_parent) {
1337         status = init_parent_gpu(parent_gpu, gpu_uuid, gpu_info, gpu_platform_info);
1338         if (status != NV_OK)
1339             goto error;
1340     }
1341 
1342     status = init_gpu(gpu, gpu_info);
1343     if (status != NV_OK)
1344         goto error;
1345 
1346     status = uvm_gpu_check_ecc_error(gpu);
1347     if (status != NV_OK)
1348         goto error;
1349 
1350     atomic64_set(&gpu->retained_count, 1);
1351     uvm_global_processor_mask_set(&g_uvm_global.retained_gpus, gpu->global_id);
1352 
1353     uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock);
1354 
1355     if (alloc_parent)
1356         uvm_global_add_parent_gpu(parent_gpu);
1357 
1358     // Mark the GPU as valid in the parent GPU's GPU table.
1359     UVM_ASSERT(!test_bit(uvm_global_id_sub_processor_index(gpu->global_id), parent_gpu->valid_gpus));
1360     __set_bit(uvm_global_id_sub_processor_index(gpu->global_id), parent_gpu->valid_gpus);
1361 
    // At this early point (before the GPU is visible in the table), locking
    // correctness does not strictly require holding the gpu_table_lock in
    // order to read gpu->isr.replayable_faults.handling, nor to enable page
    // fault interrupts (this could have been done earlier). It is still best
    // to do it here in order to avoid an interrupt storm: we take advantage
    // of the spinlock_irqsave side effect of turning off local CPU
    // interrupts, part of holding the gpu_table_lock. That means the local
    // CPU won't receive any of these interrupts until the GPU is safely added
    // to the table (where the top half ISR can find it).
1371     //
1372     // As usual with spinlock_irqsave behavior, *other* CPUs can still handle
1373     // these interrupts, but the local CPU will not be slowed down (interrupted)
1374     // by such handling, and can quickly release the gpu_table_lock, thus
1375     // unblocking any other CPU's top half (which waits for the gpu_table_lock).
1376     if (alloc_parent && parent_gpu->isr.replayable_faults.handling) {
1377         parent_gpu->fault_buffer_hal->enable_replayable_faults(parent_gpu);
1378 
1379         // Clear the interrupt bit and force the re-evaluation of the interrupt
1380         // condition to ensure that we don't miss any pending interrupt
1381         parent_gpu->fault_buffer_hal->clear_replayable_faults(parent_gpu,
1382                                                               parent_gpu->fault_buffer_info.replayable.cached_get);
1383     }
1384 
1385     // Access counters are enabled on demand
1386 
1387     uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
1388 
1389     if (alloc_parent) {
1390         status = discover_nvlink_peers(gpu);
1391         if (status != NV_OK) {
1392             UVM_ERR_PRINT("Failed to discover NVLINK peers: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1393 
1394             // Nobody can have retained the GPU yet, since we still hold the global
1395             // lock.
1396             UVM_ASSERT(uvm_gpu_retained_count(gpu) == 1);
1397             atomic64_set(&gpu->retained_count, 0);
1398             goto error;
1399         }
1400     }
1401 
1402     *gpu_out = gpu;
1403 
1404     return NV_OK;
1405 
1406 error:
1407     remove_gpu(gpu);
1408 
1409     return status;
1410 }
1411 
1412 static void sync_parent_gpu_trackers(uvm_parent_gpu_t *parent_gpu,
1413                                      bool sync_replay_tracker,
1414                                      bool sync_clear_faulted_tracker)
1415 {
1416     NV_STATUS status;
1417 
1418     // Sync the replay tracker since it inherits dependencies from the VA block
1419     // trackers.
1420     if (sync_replay_tracker) {
1421         uvm_gpu_replayable_faults_isr_lock(parent_gpu);
1422         status = uvm_tracker_wait(&parent_gpu->fault_buffer_info.replayable.replay_tracker);
1423         uvm_gpu_replayable_faults_isr_unlock(parent_gpu);
1424 
1425         if (status != NV_OK)
1426             UVM_ASSERT(status == uvm_global_get_status());
1427     }
1428 
1429     // Sync the clear_faulted tracker since it inherits dependencies from the
1430     // VA block trackers, too.
1431     if (sync_clear_faulted_tracker) {
1432         uvm_gpu_non_replayable_faults_isr_lock(parent_gpu);
1433         status = uvm_tracker_wait(&parent_gpu->fault_buffer_info.non_replayable.clear_faulted_tracker);
1434         uvm_gpu_non_replayable_faults_isr_unlock(parent_gpu);
1435 
1436         if (status != NV_OK)
1437             UVM_ASSERT(status == uvm_global_get_status());
1438     }
1439 }
1440 
1441 // Remove all references the given GPU has to other GPUs, since one of those
1442 // other GPUs is getting removed. This involves waiting for any unfinished
1443 // trackers contained by this GPU.
1444 static void remove_gpus_from_gpu(uvm_gpu_t *gpu)
1445 {
1446     sync_parent_gpu_trackers(gpu->parent,
1447                              gpu->parent->isr.replayable_faults.handling,
1448                              gpu->parent->isr.non_replayable_faults.handling);
1449 
1450     // Sync all trackers in PMM
1451     uvm_pmm_gpu_sync(&gpu->pmm);
1452 
1453     // Sync all trackers in the GPU's DMA allocation pool
1454     uvm_conf_computing_dma_buffer_pool_sync(&gpu->conf_computing.dma_buffer_pool);
1455 }
1456 
1457 // Remove all references to the given GPU from its parent, since it is being
1458 // removed.  This involves waiting for any unfinished trackers contained
1459 // by the parent GPU.
1460 static void remove_gpu_from_parent_gpu(uvm_gpu_t *gpu)
1461 {
1462     // We use *.was_handling instead of *.handling here since this function is
1463     // called after uvm_gpu_disable_isr(), and the *.handling flags will
1464     // already have been copied to *.was_handling, and then set to false.
1465     sync_parent_gpu_trackers(gpu->parent,
1466                              gpu->parent->isr.replayable_faults.was_handling,
1467                              gpu->parent->isr.non_replayable_faults.was_handling);
1468 }
1469 
1470 static void deinit_parent_gpu(uvm_parent_gpu_t *parent_gpu)
1471 {
1472     // All channels should have been removed before the retained count went to 0
1473     UVM_ASSERT(uvm_rb_tree_empty(&parent_gpu->instance_ptr_table));
1474     UVM_ASSERT(uvm_rb_tree_empty(&parent_gpu->tsg_table));
1475 
1476     // Access counters should have been disabled when the GPU is no longer
1477     // registered in any VA space.
1478     UVM_ASSERT(parent_gpu->isr.access_counters.handling_ref_count == 0);
1479 
1480     // Return ownership to RM
1481     uvm_gpu_deinit_isr(parent_gpu);
1482 
1483     deinit_parent_procfs_files(parent_gpu);
1484 
1485     uvm_ats_remove_gpu(parent_gpu);
1486 
1487     UVM_ASSERT(atomic64_read(&parent_gpu->mapped_cpu_pages_size) == 0);
1488 
1489     // After calling nvUvmInterfaceUnregisterGpu() the reference to pci_dev may
1490     // not be valid any more so clear it ahead of time.
1491     parent_gpu->pci_dev = NULL;
1492 
1493     deinit_parent_procfs_dir(parent_gpu);
1494 
1495     if (parent_gpu->rm_info.isSimulated)
1496         --g_uvm_global.num_simulated_devices;
1497 
1498     if (parent_gpu->rm_device != 0)
1499         uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(parent_gpu->rm_device));
1500 
1501     uvm_parent_gpu_kref_put(parent_gpu);
1502 }
1503 
1504 static void deinit_gpu(uvm_gpu_t *gpu)
1505 {
1506     uvm_gpu_t *other_gpu;
1507 
1508     // Remove any pointers to this GPU from other GPUs' trackers.
1509     for_each_global_gpu(other_gpu) {
1510         UVM_ASSERT(other_gpu != gpu);
1511         remove_gpus_from_gpu(other_gpu);
1512     }
1513 
1514     // Further, remove any pointers to this GPU from its parent's trackers.
1515     remove_gpu_from_parent_gpu(gpu);
1516 
1517     uvm_perf_heuristics_remove_gpu(gpu);
1518 
1519     deinit_procfs_files(gpu);
1520 
1521     // TODO Bug 3429163: [UVM] Move uvm_mmu_destroy_flat_mapping() to the
1522     // correct spot
1523     uvm_mmu_destroy_flat_mappings(gpu);
1524 
1525     // Wait for any deferred frees and their associated trackers to be finished
1526     // before tearing down channels.
1527     uvm_pmm_gpu_sync(&gpu->pmm);
1528 
1529     uvm_channel_manager_destroy(gpu->channel_manager);
1530 
    // Deconfigure the address space only after destroying all the channels:
    // if any of them hit fatal errors, RM will assert that they are not idle
    // during nvUvmInterfaceUnsetPageDirectory(), and that's an unnecessary
    // pain during development.
1535     deconfigure_address_space(gpu);
1536 
1537     deinit_semaphore_pools(gpu);
1538 
1539     uvm_pmm_sysmem_mappings_deinit(&gpu->pmm_reverse_sysmem_mappings);
1540 
1541     uvm_pmm_gpu_deinit(&gpu->pmm);
1542 
1543     if (gpu->rm_address_space != 0)
1544         uvm_rm_locked_call_void(nvUvmInterfaceAddressSpaceDestroy(gpu->rm_address_space));
1545 
1546     deinit_procfs_dirs(gpu);
1547 
1548     if (gpu->parent->smc.enabled) {
1549         if (gpu->smc.rm_device != 0)
1550             uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(gpu->smc.rm_device));
1551 
1552         if (gpu->smc.rm_session_handle != 0)
1553             uvm_rm_locked_call_void(nvUvmInterfaceSessionDestroy(gpu->smc.rm_session_handle));
1554     }
1555 
1556     gpu->magic = 0;
1557 }
1558 
1559 // Remove a gpu and unregister it from RM
1560 // Note that this is also used in most error paths in add_gpu()
1561 static void remove_gpu(uvm_gpu_t *gpu)
1562 {
1563     NvU32 sub_processor_index;
1564     uvm_parent_gpu_t *parent_gpu;
1565     bool free_parent;
1566 
1567     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
1568 
1569     sub_processor_index = uvm_global_id_sub_processor_index(gpu->global_id);
1570     parent_gpu = gpu->parent;
1571 
1572     UVM_ASSERT_MSG(uvm_gpu_retained_count(gpu) == 0,
1573                    "gpu_id %u retained_count %llu\n",
1574                    uvm_id_value(gpu->id),
1575                    uvm_gpu_retained_count(gpu));
1576 
1577     UVM_ASSERT(parent_gpu->num_retained_gpus > 0);
1578     parent_gpu->num_retained_gpus--;
1579 
1580     free_parent = (parent_gpu->num_retained_gpus == 0);
1581 
1582     // NVLINK peers must be removed and the relevant access counter buffers must
1583     // be flushed before removing this GPU from the global table. See the
1584     // comment on discover_nvlink_peers in add_gpu.
1585     if (free_parent)
1586         destroy_nvlink_peers(gpu);
1587 
1588     // uvm_mem_free and other uvm_mem APIs invoked by the Confidential Compute
1589     // deinitialization must be called before the GPU is removed from the global
1590     // table.
1591     //
1592     // TODO: Bug 2008200: Add and remove the GPU in a more reasonable spot.
1593     uvm_conf_computing_gpu_deinit(gpu);
1594 
1595     // TODO: Bug 2844714: If the parent is not being freed, the following
1596     // gpu_table_lock is only needed to protect concurrent
1597     // find_first_valid_gpu() in BH from the __clear_bit here. After
1598     // find_first_valid_gpu() is removed, gpu_table_lock should only be acquired
1599     // and released in the free_parent case.
1600     //
1601     // In the free_parent case, gpu_table_lock protects the top half from the
1602     // uvm_global_remove_parent_gpu()
1603     uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock);
1604 
1605     // Mark the GPU as invalid in the parent GPU's GPU table.
1606     __clear_bit(sub_processor_index, parent_gpu->valid_gpus);
1607 
1608     // Remove the GPU from the table.
1609     if (free_parent)
1610         uvm_global_remove_parent_gpu(parent_gpu);
1611 
1612     uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
1613 
1614     uvm_global_processor_mask_clear(&g_uvm_global.retained_gpus, gpu->global_id);
1615 
1616     // If the parent is being freed, stop scheduling new bottom halves and
1617     // update relevant software state.  Else flush any pending bottom halves
1618     // before continuing.
1619     if (free_parent)
1620         uvm_gpu_disable_isr(parent_gpu);
1621     else
1622         uvm_gpu_flush_bottom_halves(parent_gpu);
1623 
1624     deinit_gpu(gpu);
1625 
1626     UVM_ASSERT(parent_gpu->gpus[sub_processor_index] == gpu);
1627     parent_gpu->gpus[sub_processor_index] = NULL;
1628     uvm_kvfree(gpu);
1629 
1630     if (free_parent)
1631         deinit_parent_gpu(parent_gpu);
1632 }
1633 
// Do not call this directly. It is called by nv_kref_put() when the GPU's ref
// count drops to zero.
1636 static void uvm_parent_gpu_destroy(nv_kref_t *nv_kref)
1637 {
1638     uvm_parent_gpu_t *parent_gpu = container_of(nv_kref, uvm_parent_gpu_t, gpu_kref);
1639     NvU32 sub_processor_index;
1640 
1641     UVM_ASSERT(parent_gpu->num_retained_gpus == 0);
1642     UVM_ASSERT(bitmap_empty(parent_gpu->valid_gpus, UVM_ID_MAX_SUB_PROCESSORS));
1643 
1644     nv_kthread_q_stop(&parent_gpu->lazy_free_q);
1645 
1646     for (sub_processor_index = 0; sub_processor_index < UVM_ID_MAX_SUB_PROCESSORS; sub_processor_index++)
1647         UVM_ASSERT(!parent_gpu->gpus[sub_processor_index]);
1648 
1649     uvm_kvfree(parent_gpu);
1650 }
1651 
1652 void uvm_parent_gpu_kref_put(uvm_parent_gpu_t *parent_gpu)
1653 {
1654     nv_kref_put(&parent_gpu->gpu_kref, uvm_parent_gpu_destroy);
1655 }
1656 
1657 static void update_stats_gpu_fault_instance(uvm_gpu_t *gpu,
1658                                             const uvm_fault_buffer_entry_t *fault_entry,
1659                                             bool is_duplicate)
1660 {
1661     if (!fault_entry->is_replayable) {
        switch (fault_entry->fault_access_type) {
1664             case UVM_FAULT_ACCESS_TYPE_READ:
1665                 ++gpu->parent->fault_buffer_info.non_replayable.stats.num_read_faults;
1666                 break;
1667             case UVM_FAULT_ACCESS_TYPE_WRITE:
1668                 ++gpu->parent->fault_buffer_info.non_replayable.stats.num_write_faults;
1669                 break;
1670             case UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK:
1671             case UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG:
1672                 ++gpu->parent->fault_buffer_info.non_replayable.stats.num_atomic_faults;
1673                 break;
1674             default:
1675                 UVM_ASSERT_MSG(false, "Invalid access type for non-replayable faults\n");
1676                 break;
1677         }
1678 
1679         if (!fault_entry->is_virtual)
1680             ++gpu->parent->fault_buffer_info.non_replayable.stats.num_physical_faults;
1681 
1682         ++gpu->parent->stats.num_non_replayable_faults;
1683 
1684         return;
1685     }
1686 
1687     UVM_ASSERT(fault_entry->is_virtual);
1688 
    switch (fault_entry->fault_access_type) {
1691         case UVM_FAULT_ACCESS_TYPE_PREFETCH:
1692             ++gpu->parent->fault_buffer_info.replayable.stats.num_prefetch_faults;
1693             break;
1694         case UVM_FAULT_ACCESS_TYPE_READ:
1695             ++gpu->parent->fault_buffer_info.replayable.stats.num_read_faults;
1696             break;
1697         case UVM_FAULT_ACCESS_TYPE_WRITE:
1698             ++gpu->parent->fault_buffer_info.replayable.stats.num_write_faults;
1699             break;
1700         case UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK:
1701         case UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG:
1702             ++gpu->parent->fault_buffer_info.replayable.stats.num_atomic_faults;
1703             break;
1704         default:
1705             break;
1706     }
1707     if (is_duplicate || fault_entry->filtered)
1708         ++gpu->parent->fault_buffer_info.replayable.stats.num_duplicate_faults;
1709 
1710     ++gpu->parent->stats.num_replayable_faults;
1711 }
1712 
1713 static void update_stats_fault_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
1714 {
1715     uvm_gpu_t *gpu;
1716     const uvm_fault_buffer_entry_t *fault_entry, *fault_instance;
1717 
1718     UVM_ASSERT(event_id == UVM_PERF_EVENT_FAULT);
1719 
1720     if (UVM_ID_IS_CPU(event_data->fault.proc_id))
1721         return;
1722 
1723     // The reported fault entry must be the "representative" fault entry
1724     UVM_ASSERT(!event_data->fault.gpu.buffer_entry->filtered);
1725 
1726     gpu = uvm_va_space_get_gpu(event_data->fault.space, event_data->fault.proc_id);
1727 
1728     fault_entry = event_data->fault.gpu.buffer_entry;
1729 
1730     // Update the stats using the representative fault entry and the rest of
1731     // instances
1732     update_stats_gpu_fault_instance(gpu, fault_entry, event_data->fault.gpu.is_duplicate);
1733 
1734     list_for_each_entry(fault_instance, &fault_entry->merged_instances_list, merged_instances_list)
1735         update_stats_gpu_fault_instance(gpu, fault_instance, event_data->fault.gpu.is_duplicate);
1736 }
1737 
1738 static void update_stats_migration_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
1739 {
1740     uvm_gpu_t *gpu_dst = NULL;
1741     uvm_gpu_t *gpu_src = NULL;
1742     NvU64 pages;
1743     bool is_replayable_fault;
1744     bool is_non_replayable_fault;
1745     bool is_access_counter;
1746     uvm_va_space_t *va_space = uvm_va_block_get_va_space(event_data->migration.block);
1747 
1748     UVM_ASSERT(event_id == UVM_PERF_EVENT_MIGRATION);
1749 
1750     if (UVM_ID_IS_GPU(event_data->migration.dst))
1751         gpu_dst = uvm_va_space_get_gpu(va_space, event_data->migration.dst);
1752 
1753     if (UVM_ID_IS_GPU(event_data->migration.src))
1754         gpu_src = uvm_va_space_get_gpu(va_space, event_data->migration.src);
1755 
1756     if (!gpu_dst && !gpu_src)
1757         return;
1758 
1759     // Page prefetching is also triggered by faults
1760     is_replayable_fault =
1761         event_data->migration.make_resident_context->cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT;
1762     is_non_replayable_fault =
1763         event_data->migration.make_resident_context->cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT;
1764     is_access_counter =
1765         event_data->migration.make_resident_context->cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
1766 
1767     pages = event_data->migration.bytes / PAGE_SIZE;
1768     UVM_ASSERT(event_data->migration.bytes % PAGE_SIZE == 0);
1769     UVM_ASSERT(pages > 0);
1770 
1771     if (gpu_dst) {
1772         atomic64_add(pages, &gpu_dst->parent->stats.num_pages_in);
1773         if (is_replayable_fault)
1774             atomic64_add(pages, &gpu_dst->parent->fault_buffer_info.replayable.stats.num_pages_in);
1775         else if (is_non_replayable_fault)
1776             atomic64_add(pages, &gpu_dst->parent->fault_buffer_info.non_replayable.stats.num_pages_in);
1777         else if (is_access_counter)
1778             atomic64_add(pages, &gpu_dst->parent->access_counter_buffer_info.stats.num_pages_in);
1779     }
1780     if (gpu_src) {
1781         atomic64_add(pages, &gpu_src->parent->stats.num_pages_out);
1782         if (is_replayable_fault)
1783             atomic64_add(pages, &gpu_src->parent->fault_buffer_info.replayable.stats.num_pages_out);
1784         else if (is_non_replayable_fault)
1785             atomic64_add(pages, &gpu_src->parent->fault_buffer_info.non_replayable.stats.num_pages_out);
1786         else if (is_access_counter)
1787             atomic64_add(pages, &gpu_src->parent->access_counter_buffer_info.stats.num_pages_out);
1788     }
1789 }
1790 
1791 // Override the UVM driver and GPU settings from the module loader
1792 static void uvm_param_conf(void)
1793 {
    // uvm_peer_copy: valid values are "phys" and "virt" for Ampere+ GPUs.
    // It has no effect on pre-Ampere GPUs.
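    //
    // For example, the addressing mode can be selected at module load time,
    // e.g. "modprobe nvidia-uvm uvm_peer_copy=virt" (the module name is
    // assumed here for illustration).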
1796     if (strcmp(uvm_peer_copy, UVM_PARAM_PEER_COPY_VIRTUAL) == 0) {
1797         g_uvm_global.peer_copy_mode = UVM_GPU_PEER_COPY_MODE_VIRTUAL;
1798     }
1799     else {
1800         if (strcmp(uvm_peer_copy, UVM_PARAM_PEER_COPY_PHYSICAL) != 0) {
1801             pr_info("Invalid value for uvm_peer_copy = %s, using %s instead.\n",
1802                     uvm_peer_copy, UVM_PARAM_PEER_COPY_PHYSICAL);
1803         }
1804 
1805         g_uvm_global.peer_copy_mode = UVM_GPU_PEER_COPY_MODE_PHYSICAL;
1806     }
1807 }
1808 
1809 NV_STATUS uvm_gpu_init(void)
1810 {
1811     NV_STATUS status;
1812 
1813     uvm_param_conf();
1814 
1815     status = uvm_hal_init_table();
1816     if (status != NV_OK) {
1817         UVM_ERR_PRINT("uvm_hal_init_table() failed: %s\n", nvstatusToString(status));
1818         return status;
1819     }
1820 
1821     return NV_OK;
1822 }
1823 
1824 void uvm_gpu_exit(void)
1825 {
1826     uvm_parent_gpu_t *parent_gpu;
1827 
1828     for_each_parent_gpu(parent_gpu)
1829         UVM_ASSERT_MSG(false, "GPU still present: %s\n", parent_gpu->name);
1830 
1831     // CPU should never be in the retained GPUs mask
1832     UVM_ASSERT(!uvm_global_processor_mask_test(&g_uvm_global.retained_gpus, UVM_GLOBAL_ID_CPU));
1833 }
1834 
1835 NV_STATUS uvm_gpu_init_va_space(uvm_va_space_t *va_space)
1836 {
1837     NV_STATUS status;
1838 
1839     if (uvm_procfs_is_debug_enabled()) {
1840         status = uvm_perf_register_event_callback(&va_space->perf_events,
1841                                                   UVM_PERF_EVENT_FAULT,
1842                                                   update_stats_fault_cb);
1843         if (status != NV_OK)
1844             return status;
1845 
1846         status = uvm_perf_register_event_callback(&va_space->perf_events,
1847                                                   UVM_PERF_EVENT_MIGRATION,
1848                                                   update_stats_migration_cb);
1849         if (status != NV_OK)
1850             return status;
1851     }
1852 
1853     return NV_OK;
1854 }
1855 
1856 uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid_locked(const NvProcessorUuid *gpu_uuid)
1857 {
1858     uvm_parent_gpu_t *parent_gpu;
1859 
1860     for_each_parent_gpu(parent_gpu) {
1861         if (uvm_processor_uuid_eq(&parent_gpu->uuid, gpu_uuid))
1862             return parent_gpu;
1863     }
1864 
1865     return NULL;
1866 }
1867 
1868 uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid)
1869 {
1870     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
1871 
1872     return uvm_parent_gpu_get_by_uuid_locked(gpu_uuid);
1873 }
1874 
1875 static uvm_gpu_t *uvm_gpu_get_by_uuid_locked(const NvProcessorUuid *gpu_uuid)
1876 {
1877     uvm_gpu_id_t gpu_id;
1878     uvm_global_gpu_id_t global_gpu_id;
1879     uvm_gpu_t *gpu;
1880 
1881     for_each_gpu_id(gpu_id) {
1882         global_gpu_id = uvm_global_gpu_id_from_gpu_id(gpu_id);
1883         gpu = uvm_gpu_get(global_gpu_id);
1884         if (gpu) {
1885             if (uvm_processor_uuid_eq(uvm_gpu_uuid(gpu), gpu_uuid)) {
1886                 UVM_ASSERT(!gpu->parent->smc.enabled);
1887                 return gpu;
1888             }
1889         }
1890     }
1891 
1892     return NULL;
1893 }
1894 
1895 uvm_gpu_t *uvm_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid)
1896 {
1897     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
1898 
1899     return uvm_gpu_get_by_uuid_locked(gpu_uuid);
1900 }
1901 
1902 uvm_gpu_t *uvm_gpu_get_by_parent_and_swizz_id_locked(uvm_parent_gpu_t *parent_gpu, NvU32 swizz_id)
1903 {
1904     uvm_gpu_t *gpu;
1905 
1906     UVM_ASSERT(parent_gpu);
1907 
1908     for_each_gpu_in_parent(parent_gpu, gpu) {
1909         if (gpu->smc.swizz_id == swizz_id)
1910             return gpu;
1911     }
1912 
1913     return NULL;
1914 }
1915 
1916 uvm_gpu_t *uvm_gpu_get_by_parent_and_swizz_id(uvm_parent_gpu_t *parent_gpu, NvU32 swizz_id)
1917 {
1918     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
1919 
1920     return uvm_gpu_get_by_parent_and_swizz_id_locked(parent_gpu, swizz_id);
1921 }
1922 
1923 // Increment the refcount for the GPU with the given UUID. If this is the first
1924 // time that this UUID is retained, the GPU is added to UVM.
1925 // When SMC partitioning is enabled, user_rm_device contains the user handles
1926 // that were created by the caller, and that can be used to identify and
1927 // obtain information about the partition. nvUvmInterfaceGetGpuInfo returns, in
1928 // gpu_info, whether SMC is enabled and the swizzId corresponding to the
1929 // partition.
1930 static NV_STATUS gpu_retain_by_uuid_locked(const NvProcessorUuid *gpu_uuid,
1931                                            const uvm_rm_user_object_t *user_rm_device,
1932                                            uvm_gpu_t **gpu_out)
1933 {
1934     NV_STATUS status = NV_OK;
1935     uvm_gpu_t *gpu = NULL;
1936     uvm_parent_gpu_t *parent_gpu;
1937     UvmGpuInfo *gpu_info = NULL;
1938     UvmGpuClientInfo client_info = {0};
1939     UvmGpuPlatformInfo gpu_platform_info = {0};
1940     uvm_global_gpu_id_t global_gpu_id;
1941 
1942     client_info.hClient = user_rm_device->user_client;
1943     client_info.hSmcPartRef = user_rm_device->user_object;
1944 
1945     gpu_info = uvm_kvmalloc_zero(sizeof(*gpu_info));
1946     if (!gpu_info)
1947         return NV_ERR_NO_MEMORY;
1948 
1949     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
1950 
1951     parent_gpu = uvm_parent_gpu_get_by_uuid(gpu_uuid);
1952 
1953     if (parent_gpu == NULL) {
1954         // If this is the first time the UUID is seen, register it on RM
1955         status = uvm_rm_locked_call(nvUvmInterfaceRegisterGpu(gpu_uuid, &gpu_platform_info));
1956         if (status != NV_OK)
1957             goto error_free_gpu_info;
1958     }
1959 
1960     status = uvm_rm_locked_call(nvUvmInterfaceGetGpuInfo(gpu_uuid, &client_info, gpu_info));
1961     if (status != NV_OK)
1962         goto error_unregister;
1963 
1964     if (parent_gpu != NULL) {
1965         // If the UUID has been seen before, and if SMC is enabled, then check
1966         // if this specific partition has been seen previously.  The UUID-based
1967         // look-up above may have succeeded for a different partition with the
1968         // same parent GPU.
1969         if (gpu_info->smcEnabled) {
1970             gpu = uvm_gpu_get_by_parent_and_swizz_id(parent_gpu, gpu_info->smcSwizzId);
1971         }
1972         else {
1973             gpu = parent_gpu->gpus[0];
1974             UVM_ASSERT(gpu != NULL);
1975         }
1976     }
1977 
1978     if (gpu == NULL) {
1979         status = find_unused_global_gpu_id(parent_gpu, &global_gpu_id);
1980         if (status != NV_OK)
1981             goto error_unregister;
1982 
1983         status = add_gpu(gpu_uuid, global_gpu_id, gpu_info, &gpu_platform_info, parent_gpu, &gpu);
1984         if (status != NV_OK)
1985             goto error_unregister;
1986     }
1987     else {
1988         atomic64_inc(&gpu->retained_count);
1989     }
1990 
1991     *gpu_out = gpu;
1992 
1993     uvm_kvfree(gpu_info);
1994 
1995     return status;
1996 
1997 error_unregister:
1998     if (parent_gpu == NULL)
1999         uvm_rm_locked_call_void(nvUvmInterfaceUnregisterGpu(gpu_uuid));
2000 error_free_gpu_info:
2001     uvm_kvfree(gpu_info);
2002 
2003     return status;
2004 }
2005 
2006 NV_STATUS uvm_gpu_retain_by_uuid(const NvProcessorUuid *gpu_uuid,
2007                                  const uvm_rm_user_object_t *user_rm_device,
2008                                  uvm_gpu_t **gpu_out)
2009 {
2010     NV_STATUS status;
2011     uvm_mutex_lock(&g_uvm_global.global_lock);
2012     status = gpu_retain_by_uuid_locked(gpu_uuid, user_rm_device, gpu_out);
2013     uvm_mutex_unlock(&g_uvm_global.global_lock);
2014     return status;
2015 }
2016 
2017 void uvm_gpu_retain(uvm_gpu_t *gpu)
2018 {
2019     UVM_ASSERT(uvm_gpu_retained_count(gpu) > 0);
2020     atomic64_inc(&gpu->retained_count);
2021 }
2022 
2023 void uvm_gpu_release_locked(uvm_gpu_t *gpu)
2024 {
2025     uvm_parent_gpu_t *parent_gpu = gpu->parent;
2026 
2027     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
2028     UVM_ASSERT(uvm_gpu_retained_count(gpu) > 0);
2029 
2030     if (atomic64_dec_and_test(&gpu->retained_count)) {
2031         nv_kref_get(&parent_gpu->gpu_kref);
2032         remove_gpu(gpu);
2033         if (parent_gpu->num_retained_gpus == 0)
2034             uvm_rm_locked_call_void(nvUvmInterfaceUnregisterGpu(&parent_gpu->uuid));
2035         uvm_parent_gpu_kref_put(parent_gpu);
2036     }
2037 }
2038 
2039 void uvm_gpu_release(uvm_gpu_t *gpu)
2040 {
2041     uvm_mutex_lock(&g_uvm_global.global_lock);
2042     uvm_gpu_release_locked(gpu);
2043     uvm_mutex_unlock(&g_uvm_global.global_lock);
2044 }
2045 
// Note: the peer table is an upper triangular matrix packed into a flat array.
// This function converts an index into a 2D array of size [N x N] into an
// index into an upper triangular array of size ((N - 1) * ((N - 1) + 1)) / 2,
// which does not include the diagonal elements.
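//
// Worked example (UVM_ID_MAX_GPUS shown as 8 purely for illustration, and
// assuming GPU id values start at 1 so that a GPU's index is its id value
// minus one): for GPU indices 0 and 2, square_index = 0 * 8 + 2 = 2 and
// SUM_FROM_0_TO_N(1) = 1, so triangular_index = 1. The pairs then pack as
// (0, 1) -> 0, (0, 2) -> 1, ..., (0, 7) -> 6, (1, 2) -> 7, and so on.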
2050 NvU32 uvm_gpu_peer_table_index(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1)
2051 {
2052     NvU32 square_index, triangular_index;
2053     NvU32 gpu_index0 = uvm_id_gpu_index(gpu_id0);
2054     NvU32 gpu_index1 = uvm_id_gpu_index(gpu_id1);
2055 
2056     UVM_ASSERT(!uvm_id_equal(gpu_id0, gpu_id1));
2057 
    // Calculate the index into the 2D array, re-ordering the indices so that
    // both orderings of a GPU pair map to the same entry.
2060     square_index = min(gpu_index0, gpu_index1) * UVM_ID_MAX_GPUS +
2061                    max(gpu_index0, gpu_index1);
2062 
    // Calculate and subtract the number of lower triangular matrix elements
    // up to the current row (which includes the diagonal elements) to get the
    // correct index into the upper triangular matrix.
    // Note: since gpu_id values are in [1, N), no extra logic is needed to
    // account for the diagonal elements.
2068     triangular_index = square_index - SUM_FROM_0_TO_N(min(uvm_id_value(gpu_id0), uvm_id_value(gpu_id1)));
2069 
2070     UVM_ASSERT(triangular_index < UVM_MAX_UNIQUE_GPU_PAIRS);
2071 
2072     return triangular_index;
2073 }
2074 
2075 NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu)
2076 {
2077     // We may need to call service_interrupts() which cannot be done in the top
2078     // half interrupt handler so assert here as well to catch improper use as
2079     // early as possible.
2080     UVM_ASSERT(!in_interrupt());
2081 
2082     if (!gpu->ecc.enabled)
2083         return NV_OK;
2084 
    // Early out if a global ECC error has already been set, to avoid spamming
    // the logs with the same error.
2087     if (uvm_global_get_status() == NV_ERR_ECC_ERROR)
2088         return NV_ERR_ECC_ERROR;
2089 
2090     if (*gpu->ecc.error_notifier) {
2091         UVM_ERR_PRINT("ECC error encountered, GPU %s\n", uvm_gpu_name(gpu));
2092         uvm_global_set_fatal_error(NV_ERR_ECC_ERROR);
2093         return NV_ERR_ECC_ERROR;
2094     }
2095 
2096     // RM hasn't seen an ECC error yet, check whether there is a pending
2097     // interrupt that might indicate one. We might get false positives because
2098     // the interrupt bits we read are not ECC-specific. They're just the
2099     // top-level bits for any interrupt on all engines which support ECC. On
2100     // Pascal for example, RM returns us a mask with the bits for GR, L2, and
2101     // FB, because any of those might raise an ECC interrupt. So if they're set
2102     // we have to ask RM to check whether it was really an ECC error (and a
2103     // double-bit ECC error at that), in which case it sets the notifier.
2104     if ((*gpu->ecc.hw_interrupt_tree_location & gpu->ecc.mask) == 0) {
2105         // No pending interrupts.
2106         return NV_OK;
2107     }
2108 
2109     // An interrupt that might mean an ECC error needs to be serviced, signal
2110     // that to the caller.
2111     return NV_WARN_MORE_PROCESSING_REQUIRED;
2112 }
2113 
2114 static NV_STATUS get_p2p_caps(uvm_gpu_t *gpu0,
2115                               uvm_gpu_t *gpu1,
2116                               UvmGpuP2PCapsParams *p2p_caps_params)
2117 {
2118     NV_STATUS status;
2119     uvmGpuDeviceHandle rm_device0, rm_device1;
2120 
2121     if (uvm_id_value(gpu0->id) < uvm_id_value(gpu1->id)) {
2122         rm_device0 = uvm_gpu_device_handle(gpu0);
2123         rm_device1 = uvm_gpu_device_handle(gpu1);
2124     }
2125     else {
2126         rm_device0 = uvm_gpu_device_handle(gpu1);
2127         rm_device1 = uvm_gpu_device_handle(gpu0);
2128     }
2129 
2130     memset(p2p_caps_params, 0, sizeof(*p2p_caps_params));
2131     status = uvm_rm_locked_call(nvUvmInterfaceGetP2PCaps(rm_device0, rm_device1, p2p_caps_params));
2132     if (status != NV_OK) {
2133         UVM_ERR_PRINT("nvUvmInterfaceGetP2PCaps() failed with error: %s, for GPU0:%s and GPU1:%s\n",
2134                        nvstatusToString(status),
2135                        uvm_gpu_name(gpu0),
2136                        uvm_gpu_name(gpu1));
2137         return status;
2138     }
2139 
2140     if (p2p_caps_params->p2pLink != UVM_LINK_TYPE_NONE) {
2141         // P2P is not supported under SMC partitioning
2142         UVM_ASSERT(!gpu0->parent->smc.enabled);
2143         UVM_ASSERT(!gpu1->parent->smc.enabled);
2144     }
2145 
2146     return NV_OK;
2147 }
2148 
2149 static NV_STATUS create_p2p_object(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1, NvHandle *p2p_handle)
2150 {
2151     NV_STATUS status;
2152     uvmGpuDeviceHandle rm_device0, rm_device1;
2153 
2154     if (uvm_id_value(gpu0->id) < uvm_id_value(gpu1->id)) {
2155         rm_device0 = uvm_gpu_device_handle(gpu0);
2156         rm_device1 = uvm_gpu_device_handle(gpu1);
2157     }
2158     else {
2159         rm_device0 = uvm_gpu_device_handle(gpu1);
2160         rm_device1 = uvm_gpu_device_handle(gpu0);
2161     }
2162 
2163     *p2p_handle = 0;
2164 
2165     status = uvm_rm_locked_call(nvUvmInterfaceP2pObjectCreate(rm_device0, rm_device1, p2p_handle));
2166     if (status != NV_OK) {
2167         UVM_ERR_PRINT("nvUvmInterfaceP2pObjectCreate() failed with error: %s, for GPU0:%s and GPU1:%s\n",
2168                        nvstatusToString(status),
2169                        uvm_gpu_name(gpu0),
2170                        uvm_gpu_name(gpu1));
2171         return status;
2172     }
2173 
2174     UVM_ASSERT(*p2p_handle);
2175     return NV_OK;
2176 }
2177 
2178 static void set_optimal_p2p_write_ces(const UvmGpuP2PCapsParams *p2p_caps_params,
2179                                       const uvm_gpu_peer_t *peer_caps,
2180                                       uvm_gpu_t *gpu0,
2181                                       uvm_gpu_t *gpu1)
2182 {
2183     bool sorted;
2184     NvU32 ce0, ce1;
2185 
2186     if (peer_caps->link_type < UVM_GPU_LINK_NVLINK_1)
2187         return;
2188 
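    // get_p2p_caps() queries RM with the two devices ordered by ascending GPU
    // id, so optimalNvlinkWriteCEs[] follows that ordering; swap the entries
    // when gpu0/gpu1 were passed here in the opposite order.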
2189     sorted = uvm_id_value(gpu0->id) < uvm_id_value(gpu1->id);
2190     ce0 = p2p_caps_params->optimalNvlinkWriteCEs[sorted ? 0 : 1];
2191     ce1 = p2p_caps_params->optimalNvlinkWriteCEs[sorted ? 1 : 0];
2192 
    // Indirect peers communicate through the CPU, so the optimal CE
    // should match the one selected for writing to system memory.
2195     if (peer_caps->is_indirect_peer) {
2196         uvm_channel_pool_t *pool;
2197 
2198         pool = gpu0->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
2199         UVM_ASSERT(ce0 == pool->engine_index);
2200 
2201         pool = gpu1->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
2202         UVM_ASSERT(ce1 == pool->engine_index);
2203     }
2204 
2205     uvm_channel_manager_set_p2p_ce(gpu0->channel_manager, gpu1, ce0);
2206     uvm_channel_manager_set_p2p_ce(gpu1->channel_manager, gpu0, ce1);
2207 }
2208 
2209 static int nv_procfs_read_gpu_peer_caps(struct seq_file *s, void *v)
2210 {
2211     if (!uvm_down_read_trylock(&g_uvm_global.pm.lock))
        return -EAGAIN;
2213 
2214     gpu_peer_caps_print((uvm_gpu_t **)s->private, s);
2215 
2216     uvm_up_read(&g_uvm_global.pm.lock);
2217 
2218     return 0;
2219 }
2220 
2221 static int nv_procfs_read_gpu_peer_caps_entry(struct seq_file *s, void *v)
2222 {
2223     UVM_ENTRY_RET(nv_procfs_read_gpu_peer_caps(s, v));
2224 }
2225 
2226 UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_peer_caps_entry);
2227 
2228 static NV_STATUS init_procfs_peer_cap_files(uvm_gpu_t *local, uvm_gpu_t *remote, size_t local_idx)
2229 {
2230     // This needs to hold a gpu_id_t in decimal
2231     char gpu_dir_name[16];
2232 
2233     // This needs to hold a GPU UUID
2234     char symlink_name[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
2235     uvm_gpu_peer_t *peer_caps;
2236 
2237     if (!uvm_procfs_is_enabled())
2238         return NV_OK;
2239 
2240     peer_caps = uvm_gpu_peer_caps(local, remote);
2241     peer_caps->procfs.pairs[local_idx][0] = local;
2242     peer_caps->procfs.pairs[local_idx][1] = remote;
2243 
2244     // Create gpus/gpuA/peers/gpuB
2245     snprintf(gpu_dir_name, sizeof(gpu_dir_name), "%u", uvm_id_value(remote->id));
2246     peer_caps->procfs.peer_file[local_idx] = NV_CREATE_PROC_FILE(gpu_dir_name,
2247                                                                  local->procfs.dir_peers,
2248                                                                  gpu_peer_caps_entry,
2249                                                                  &peer_caps->procfs.pairs[local_idx]);
2250 
2251     if (peer_caps->procfs.peer_file[local_idx] == NULL)
2252         return NV_ERR_OPERATING_SYSTEM;
2253 
2254     // Create a symlink from UVM GPU UUID (UVM-GPU-...) to the UVM GPU ID gpuB
2255     format_uuid_to_buffer(symlink_name, sizeof(symlink_name), uvm_gpu_uuid(remote));
2256     peer_caps->procfs.peer_symlink_file[local_idx] = proc_symlink(symlink_name,
2257                                                                   local->procfs.dir_peers,
2258                                                                   gpu_dir_name);
2259     if (peer_caps->procfs.peer_symlink_file[local_idx] == NULL)
2260         return NV_ERR_OPERATING_SYSTEM;
2261 
2262     return NV_OK;
2263 }
2264 
2265 static NV_STATUS init_peer_access(uvm_gpu_t *gpu0,
2266                                   uvm_gpu_t *gpu1,
2267                                   const UvmGpuP2PCapsParams *p2p_caps_params,
2268                                   uvm_gpu_peer_t *peer_caps)
2269 {
2270     NV_STATUS status;
2271 
2272     UVM_ASSERT(p2p_caps_params->p2pLink != UVM_LINK_TYPE_C2C);
2273 
    // Check for peer-to-peer compatibility (PCIe or NVLink).
2275     peer_caps->link_type = get_gpu_link_type(p2p_caps_params->p2pLink);
    if (peer_caps->link_type == UVM_GPU_LINK_INVALID ||
        peer_caps->link_type == UVM_GPU_LINK_C2C)
2279         return NV_ERR_NOT_SUPPORTED;
2280 
2281     peer_caps->total_link_line_rate_mbyte_per_s = p2p_caps_params->totalLinkLineRateMBps;
2282 
2283     // Initialize peer ids and establish peer mappings
2284     peer_caps->is_indirect_peer = (p2p_caps_params->indirectAccess == NV_TRUE);
2285 
2286     if (peer_caps->is_indirect_peer) {
2287         UVM_ASSERT(gpu0->mem_info.numa.enabled);
2288         UVM_ASSERT(gpu1->mem_info.numa.enabled);
2289 
2290         status = uvm_pmm_gpu_indirect_peer_init(&gpu0->pmm, gpu1);
2291         if (status != NV_OK)
2292             return status;
2293 
2294         status = uvm_pmm_gpu_indirect_peer_init(&gpu1->pmm, gpu0);
2295         if (status != NV_OK)
2296             return status;
2297 
2298         set_optimal_p2p_write_ces(p2p_caps_params, peer_caps, gpu0, gpu1);
2299         UVM_ASSERT(peer_caps->total_link_line_rate_mbyte_per_s == 0);
2300     }
2301     else {
2302         // Peer id from min(gpu_id0, gpu_id1) -> max(gpu_id0, gpu_id1)
2303         peer_caps->peer_ids[0] = p2p_caps_params->peerIds[0];
2304 
2305         // Peer id from max(gpu_id0, gpu_id1) -> min(gpu_id0, gpu_id1)
2306         peer_caps->peer_ids[1] = p2p_caps_params->peerIds[1];
2307 
2308         // Establish peer mappings from each GPU to the other. Indirect peers
2309         // do not require identity mappings since they use sysmem aperture to
2310         // communicate.
2311         status = uvm_mmu_create_peer_identity_mappings(gpu0, gpu1);
2312         if (status != NV_OK)
2313             return status;
2314 
2315         status = uvm_mmu_create_peer_identity_mappings(gpu1, gpu0);
2316         if (status != NV_OK)
2317             return status;
2318 
2319         set_optimal_p2p_write_ces(p2p_caps_params, peer_caps, gpu0, gpu1);
2320 
2321         UVM_ASSERT(uvm_gpu_get(gpu0->global_id) == gpu0);
2322         UVM_ASSERT(uvm_gpu_get(gpu1->global_id) == gpu1);
2323 
2324         // In the case of NVLINK peers, this initialization will happen during
2325         // add_gpu. As soon as the peer info table is assigned below, the access
2326         // counter bottom half could start operating on the GPU being newly
2327         // added and inspecting the peer caps, so all of the appropriate
2328         // initialization must happen before this point.
2329         uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock);
2330 
2331         uvm_processor_mask_set(&gpu0->peer_info.peer_gpu_mask, gpu1->id);
2332         UVM_ASSERT(gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] == NULL);
2333         gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = gpu1;
2334 
2335         uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock);
2336         uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock);
2337 
2338         uvm_processor_mask_set(&gpu1->peer_info.peer_gpu_mask, gpu0->id);
2339         UVM_ASSERT(gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] == NULL);
2340         gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = gpu0;
2341 
2342         uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock);
2343     }
2344 
2345     if (!uvm_procfs_is_debug_enabled())
2346         return NV_OK;
2347 
2348     status = init_procfs_peer_cap_files(gpu0, gpu1, 0);
2349     if (status != NV_OK)
2350         return status;
2351 
2352     status = init_procfs_peer_cap_files(gpu1, gpu0, 1);
2353     if (status != NV_OK)
2354         return status;
2355 
2356     return NV_OK;
2357 }
2358 
2359 static NV_STATUS enable_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
2360 {
2361     NV_STATUS status = NV_OK;
2362     UvmGpuP2PCapsParams p2p_caps_params;
2363     uvm_gpu_peer_t *peer_caps;
2364     NvHandle p2p_handle;
2365 
2366     UVM_ASSERT(gpu0);
2367     UVM_ASSERT(gpu1);
2368     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
2369 
2370     peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
2371     UVM_ASSERT(peer_caps->link_type == UVM_GPU_LINK_INVALID);
2372     UVM_ASSERT(peer_caps->ref_count == 0);
2373 
2374     status = create_p2p_object(gpu0, gpu1, &p2p_handle);
2375     if (status != NV_OK)
2376         return status;
2377 
2378     // Store the handle in the global table.
2379     peer_caps->p2p_handle = p2p_handle;
2380 
2381     status = get_p2p_caps(gpu0, gpu1, &p2p_caps_params);
2382     if (status != NV_OK)
2383         goto cleanup;
2384 
2385     // Sanity checks
2386     UVM_ASSERT(p2p_caps_params.indirectAccess == NV_FALSE);
2387     UVM_ASSERT(p2p_caps_params.p2pLink == UVM_LINK_TYPE_PCIE);
2388 
2389     status = init_peer_access(gpu0, gpu1, &p2p_caps_params, peer_caps);
2390     if (status != NV_OK)
2391         goto cleanup;
2392 
2393     return NV_OK;
2394 
2395 cleanup:
2396     disable_peer_access(gpu0, gpu1);
2397     return status;
2398 }
2399 
2400 static NV_STATUS enable_nvlink_peer_access(uvm_gpu_t *gpu0,
2401                                            uvm_gpu_t *gpu1,
2402                                            UvmGpuP2PCapsParams *p2p_caps_params)
2403 {
2404     NV_STATUS status = NV_OK;
2405     NvHandle p2p_handle;
2406     uvm_gpu_peer_t *peer_caps;
2407 
2408     UVM_ASSERT(gpu0);
2409     UVM_ASSERT(gpu1);
2410     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
2411 
2412     peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
2413     UVM_ASSERT(peer_caps->ref_count == 0);
2414     peer_caps->ref_count = 1;
2415 
2416     if (!p2p_caps_params->indirectAccess) {
2417         // Create P2P object for direct NVLink peers
2418         status = create_p2p_object(gpu0, gpu1, &p2p_handle);
2419         if (status != NV_OK) {
            UVM_ERR_PRINT("Failed to create a P2P object with error: %s, for GPU0:%s and GPU1:%s\n",
2421                            nvstatusToString(status),
2422                            uvm_gpu_name(gpu0),
2423                            uvm_gpu_name(gpu1));
2424             return status;
2425         }
2426 
2427         UVM_ASSERT(p2p_handle != 0);
2428 
2429         // Store the handle in the global table.
2430         peer_caps->p2p_handle = p2p_handle;
2431 
2432         // Update p2p caps after p2p object creation as it generates the peer
2433         // ids
2434         status = get_p2p_caps(gpu0, gpu1, p2p_caps_params);
2435         if (status != NV_OK)
2436             goto cleanup;
2437     }
2438 
2439     status = init_peer_access(gpu0, gpu1, p2p_caps_params, peer_caps);
2440     if (status != NV_OK)
2441         goto cleanup;
2442 
2443     return NV_OK;
2444 
2445 cleanup:
2446     disable_peer_access(gpu0, gpu1);
2447     return status;
2448 }
2449 
2450 static NV_STATUS discover_nvlink_peers(uvm_gpu_t *gpu)
2451 {
2452     NV_STATUS status = NV_OK;
2453     uvm_gpu_t *other_gpu;
2454 
2455     UVM_ASSERT(gpu);
2456     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
2457 
2458     if (gpu->parent->smc.enabled)
2459         return NV_OK;
2460 
2461     for_each_global_gpu(other_gpu) {
2462         UvmGpuP2PCapsParams p2p_caps_params;
2463 
2464         if ((other_gpu == gpu) || other_gpu->parent->smc.enabled)
2465             continue;
2466 
2467         status = get_p2p_caps(gpu, other_gpu, &p2p_caps_params);
2468         if (status != NV_OK)
2469             goto cleanup;
2470 
2471         // PCIe peers need to be explicitly enabled via UvmEnablePeerAccess
2472         if (p2p_caps_params.p2pLink == UVM_LINK_TYPE_NONE || p2p_caps_params.p2pLink == UVM_LINK_TYPE_PCIE)
2473             continue;
2474 
2475         // Indirect peers are only supported when onlined as NUMA nodes, because
2476         // we want to use vm_insert_page and dma_map_page.
2477         if (p2p_caps_params.indirectAccess && (!gpu->mem_info.numa.enabled || !other_gpu->mem_info.numa.enabled))
2478             continue;
2479 
2480         status = enable_nvlink_peer_access(gpu, other_gpu, &p2p_caps_params);
2481         if (status != NV_OK)
2482             goto cleanup;
2483     }
2484 
2485     return NV_OK;
2486 
2487 cleanup:
2488     destroy_nvlink_peers(gpu);
2489 
2490     return status;
2491 }
2492 
2493 static void destroy_nvlink_peers(uvm_gpu_t *gpu)
2494 {
2495     uvm_gpu_t *other_gpu;
2496 
2497     UVM_ASSERT(gpu);
2498     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
2499 
2500     if (gpu->parent->smc.enabled)
2501         return;
2502 
2503     for_each_global_gpu(other_gpu) {
2504         uvm_gpu_peer_t *peer_caps;
2505 
2506         if ((other_gpu == gpu) || other_gpu->parent->smc.enabled)
2507             continue;
2508 
2509         peer_caps = uvm_gpu_peer_caps(gpu, other_gpu);
2510 
2511         // PCIe peers need to be explicitly destroyed via UvmDisablePeerAccess
2512         if (peer_caps->link_type == UVM_GPU_LINK_INVALID || peer_caps->link_type == UVM_GPU_LINK_PCIE)
2513             continue;
2514 
2515         disable_peer_access(gpu, other_gpu);
2516     }
2517 }
2518 
2519 NV_STATUS uvm_gpu_retain_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
2520 {
2521     NV_STATUS status = NV_OK;
2522     uvm_gpu_peer_t *peer_caps;
2523 
2524     UVM_ASSERT(gpu0);
2525     UVM_ASSERT(gpu1);
2526     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
2527 
2528     peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
2529 
2530     // Insert an entry into global peer table, if not present.
2531     if (peer_caps->link_type == UVM_GPU_LINK_INVALID) {
2532         UVM_ASSERT(peer_caps->ref_count == 0);
2533 
2534         status = enable_pcie_peer_access(gpu0, gpu1);
2535         if (status != NV_OK)
2536             return status;
2537     }
2538     else if (peer_caps->link_type != UVM_GPU_LINK_PCIE) {
2539         return NV_ERR_INVALID_DEVICE;
2540     }
2541 
2542     // GPUs can't be destroyed until their peer pairings have also been
2543     // destroyed.
2544     uvm_gpu_retain(gpu0);
2545     uvm_gpu_retain(gpu1);
2546 
2547     peer_caps->ref_count++;
2548 
2549     return status;
2550 }
2551 
2552 static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
2553 {
2554     uvm_gpu_peer_t *peer_caps;
2555     NvHandle p2p_handle = 0;
2556 
2557     UVM_ASSERT(gpu0);
2558     UVM_ASSERT(gpu1);
2559 
2560     // P2P is not supported under SMC partitioning
2561     UVM_ASSERT(!gpu0->parent->smc.enabled);
2562     UVM_ASSERT(!gpu1->parent->smc.enabled);
2563 
2564     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
2565 
2566     peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
2567 
2568     if (uvm_procfs_is_debug_enabled())
2569         deinit_procfs_peer_cap_files(peer_caps);
2570 
2571     p2p_handle = peer_caps->p2p_handle;
2572 
2573     if (peer_caps->is_indirect_peer) {
2574         uvm_pmm_gpu_indirect_peer_destroy(&gpu0->pmm, gpu1);
2575         uvm_pmm_gpu_indirect_peer_destroy(&gpu1->pmm, gpu0);
2576     }
2577     else {
2578         UVM_ASSERT(p2p_handle);
2579 
2580         uvm_mmu_destroy_peer_identity_mappings(gpu0, gpu1);
2581         uvm_mmu_destroy_peer_identity_mappings(gpu1, gpu0);
2582 
2583         uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_gpu_session_handle(gpu0), p2p_handle));
2584 
2585         UVM_ASSERT(uvm_gpu_get(gpu0->global_id) == gpu0);
2586         UVM_ASSERT(uvm_gpu_get(gpu1->global_id) == gpu1);
2587 
2588         uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock);
2589         uvm_processor_mask_clear(&gpu0->peer_info.peer_gpu_mask, gpu1->id);
2590         gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = NULL;
2591         uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock);
2592 
2593         uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock);
2594         uvm_processor_mask_clear(&gpu1->peer_info.peer_gpu_mask, gpu0->id);
2595         gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = NULL;
2596         uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock);
2597     }
2598 
2599     // Flush the access counter buffer to avoid getting stale notifications for
2600     // accesses to GPUs to which peer access is being disabled. This is also
2601     // needed in the case of disabling automatic (NVLINK) peers on GPU
2602     // unregister, because access counter processing might still be using GPU
2603     // IDs queried from the peer table above which are about to be removed from
2604     // the global table.
2605     if (gpu0->parent->access_counters_supported)
2606         uvm_gpu_access_counter_buffer_flush(gpu0);
2607     if (gpu1->parent->access_counters_supported)
2608         uvm_gpu_access_counter_buffer_flush(gpu1);
2609 
2610     memset(peer_caps, 0, sizeof(*peer_caps));
2611 }
2612 
2613 void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
2614 {
2615     uvm_gpu_peer_t *peer_caps;
2616     UVM_ASSERT(gpu0);
2617     UVM_ASSERT(gpu1);
2618     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
2619 
2620     peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
2621 
2622     UVM_ASSERT(peer_caps->ref_count > 0);
2623     UVM_ASSERT(peer_caps->link_type == UVM_GPU_LINK_PCIE);
2624     peer_caps->ref_count--;
2625 
2626     if (peer_caps->ref_count == 0)
2627         disable_peer_access(gpu0, gpu1);
2628 
2629     uvm_gpu_release_locked(gpu0);
2630     uvm_gpu_release_locked(gpu1);
2631 }
2632 
2633 static uvm_aperture_t uvm_gpu_peer_caps_aperture(uvm_gpu_peer_t *peer_caps, uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu)
2634 {
2635     size_t peer_index;
2636     UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_INVALID);
2637 
2638     // Indirect peers are accessed as sysmem addresses
2639     if (peer_caps->is_indirect_peer)
2640         return UVM_APERTURE_SYS;
2641 
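    // peer_ids[0] is the peer id used by the GPU with the smaller id to reach
    // the one with the larger id, and peer_ids[1] covers the opposite
    // direction (see init_peer_access()), so pick the slot based on which GPU
    // is local.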
2642     if (uvm_id_value(local_gpu->id) < uvm_id_value(remote_gpu->id))
2643         peer_index = 0;
2644     else
2645         peer_index = 1;
2646 
2647     return UVM_APERTURE_PEER(peer_caps->peer_ids[peer_index]);
2648 }
2649 
2650 uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu)
2651 {
2652     uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(local_gpu, remote_gpu);
2653     return uvm_gpu_peer_caps_aperture(peer_caps, local_gpu, remote_gpu);
2654 }
2655 
2656 uvm_aperture_t uvm_gpu_page_tree_init_location(const uvm_gpu_t *gpu)
2657 {
2658     // See comment in page_tree_set_location
2659     if (uvm_gpu_is_virt_mode_sriov_heavy(gpu))
2660         return UVM_APERTURE_VID;
2661 
2662     if (uvm_conf_computing_mode_enabled(gpu))
2663         return UVM_APERTURE_VID;
2664 
2665     return UVM_APERTURE_DEFAULT;
2666 }
2667 
2668 uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr)
2669 {
2670     uvm_processor_id_t id = UVM_ID_INVALID;
2671 
2672     // TODO: Bug 1899622: On P9 systems with multiple CPU sockets, SYS aperture
2673     // is also reported for accesses to remote GPUs connected to a different CPU
2674     // NUMA domain. We will need to determine the actual processor id using the
2675     // reported physical address.
2676     if (addr.aperture == UVM_APERTURE_SYS)
2677         return UVM_ID_CPU;
2678     else if (addr.aperture == UVM_APERTURE_VID)
2679         return gpu->id;
2680 
2681     uvm_spin_lock(&gpu->peer_info.peer_gpus_lock);
2682 
2683     for_each_gpu_id_in_mask(id, &gpu->peer_info.peer_gpu_mask) {
2684         uvm_gpu_t *other_gpu = gpu->peer_info.peer_gpus[uvm_id_gpu_index(id)];
2685         UVM_ASSERT(other_gpu);
2686 
2687         if (uvm_gpus_are_nvswitch_connected(gpu, other_gpu)) {
2688             // NVSWITCH-connected systems use an extended physical address
2689             // space to map peer memory. Find the physical memory 'slot'
2690             // containing the given physical address to determine which peer
2691             // GPU owns it.
2692             NvU64 fabric_window_end = other_gpu->parent->nvswitch_info.fabric_memory_window_start +
2693                                       other_gpu->mem_info.max_allocatable_address;
2694 
2695             if (other_gpu->parent->nvswitch_info.fabric_memory_window_start <= addr.address &&
2696                 fabric_window_end >= addr.address)
2697                 break;
2698         }
2699         else if (uvm_gpu_peer_aperture(gpu, other_gpu) == addr.aperture) {
2700             break;
2701         }
2702     }
2703 
2704     uvm_spin_unlock(&gpu->peer_info.peer_gpus_lock);
2705 
2706     return id;
2707 }
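
// Illustrative sketch only; it is not used by the driver. It restates the
// NVSwitch fabric window check from the loop above as a standalone predicate:
// a physical address belongs to other_gpu if it falls inside
// [fabric_memory_window_start, fabric_memory_window_start +
// max_allocatable_address]. The helper name is made up for this example.
static __maybe_unused bool example_addr_in_fabric_window(uvm_gpu_t *other_gpu, NvU64 address)
{
    NvU64 window_start = other_gpu->parent->nvswitch_info.fabric_memory_window_start;
    NvU64 window_end = window_start + other_gpu->mem_info.max_allocatable_address;

    return address >= window_start && address <= window_end;
}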
2708 
2709 uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id1, const uvm_gpu_id_t gpu_id2)
2710 {
2711     NvU32 table_index = uvm_gpu_peer_table_index(gpu_id1, gpu_id2);
2712     return &g_uvm_global.peers[table_index];
2713 }
2714 
2715 static NvU64 instance_ptr_to_key(uvm_gpu_phys_address_t instance_ptr)
2716 {
2717     NvU64 key;
2718     int is_sys = (instance_ptr.aperture == UVM_APERTURE_SYS);
2719 
2720     // Instance pointers must be 4k aligned and they must have either VID or SYS
2721     // apertures. Compress them as much as we can both to guarantee that the key
2722     // fits within 64 bits, and to make the table as shallow as possible.
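    // For example (illustrative values): a 4K-aligned vidmem instance pointer
    // at address 0x2000 becomes key (0x2000 >> 11) | 0 = 0x4, while the same
    // address in sysmem becomes (0x2000 >> 11) | 1 = 0x5. Since the address is
    // 4K aligned, bit 0 of the shifted value is always clear, so the aperture
    // bit cannot collide with address bits.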
2723     UVM_ASSERT(IS_ALIGNED(instance_ptr.address, UVM_PAGE_SIZE_4K));
2724     UVM_ASSERT(instance_ptr.aperture == UVM_APERTURE_VID || instance_ptr.aperture == UVM_APERTURE_SYS);
2725 
2726     key = (instance_ptr.address >> 11) | is_sys;
2727 
2728     return key;
2729 }
2730 
2731 static NV_STATUS gpu_add_user_channel_subctx_info(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel)
2732 {
2733     uvm_gpu_phys_address_t instance_ptr = user_channel->instance_ptr.addr;
2734     NV_STATUS status = NV_OK;
2735     uvm_rb_tree_node_t *channel_tree_node;
2736     uvm_user_channel_subctx_info_t *channel_subctx_info;
2737     uvm_user_channel_subctx_info_t *new_channel_subctx_info = NULL;
2738     uvm_va_space_t *va_space = user_channel->gpu_va_space->va_space;
2739 
2740     if (!user_channel->in_subctx)
2741         return NV_OK;
2742 
2743     // Pre-allocate a subcontext info descriptor outside the lock, in case we
2744     // need to add a new entry to the tree
2745     new_channel_subctx_info = uvm_kvmalloc_zero(sizeof(*new_channel_subctx_info));
2746 
2747     // Don't check the result of the allocation here: it is only needed if
2748     // the TSG has not been registered yet, and that check happens under the
2749     // lock below
2750     if (new_channel_subctx_info) {
2751         new_channel_subctx_info->subctxs =
2752             uvm_kvmalloc_zero(sizeof(*new_channel_subctx_info->subctxs) * user_channel->tsg.max_subctx_count);
2753     }
2754 
2755     uvm_spin_lock(&gpu->parent->instance_ptr_table_lock);
2756 
2757     // Check if the subcontext information for the channel already exists
2758     channel_tree_node = uvm_rb_tree_find(&gpu->parent->tsg_table, user_channel->tsg.id);
2759 
2760     if (!channel_tree_node) {
2761         // Pre-allocation before taking the lock failed, so bail out with an error
2762         if (!new_channel_subctx_info || !new_channel_subctx_info->subctxs) {
2763             status = NV_ERR_NO_MEMORY;
2764             goto exit_unlock;
2765         }
2766 
2767         // Insert the new subcontext information descriptor
2768         new_channel_subctx_info->node.key = user_channel->tsg.id;
2769         status = uvm_rb_tree_insert(&gpu->parent->tsg_table, &new_channel_subctx_info->node);
2770         UVM_ASSERT(status == NV_OK);
2771 
2772         channel_subctx_info = new_channel_subctx_info;
2773         channel_subctx_info->smc_engine_id = user_channel->smc_engine_id;
2774     }
2775     else {
2776         channel_subctx_info = container_of(channel_tree_node, uvm_user_channel_subctx_info_t, node);
2777         UVM_ASSERT(channel_subctx_info->smc_engine_id == user_channel->smc_engine_id);
2778     }
2779 
2780     user_channel->subctx_info = channel_subctx_info;
2781 
2782     // Register the VA space of the channel subcontext info descriptor, or
2783     // check that the existing one matches the channel's
2784     if (channel_subctx_info->subctxs[user_channel->subctx_id].refcount++ > 0) {
2785         UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].va_space == va_space,
2786                        "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: expected VA space 0x%llx but got 0x%llx instead\n",
2787                        user_channel->hw_runlist_id,
2788                        user_channel->hw_channel_id,
2789                        instance_ptr.address,
2790                        uvm_aperture_string(instance_ptr.aperture),
2791                        user_channel->subctx_id,
2792                        user_channel->tsg.id,
2793                        (NvU64)va_space,
2794                        (NvU64)channel_subctx_info->subctxs[user_channel->subctx_id].va_space);
2795         UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].va_space != NULL,
2796                        "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: VA space is NULL\n",
2797                        user_channel->hw_runlist_id,
2798                        user_channel->hw_channel_id,
2799                        instance_ptr.address,
2800                        uvm_aperture_string(instance_ptr.aperture),
2801                        user_channel->subctx_id,
2802                        user_channel->tsg.id);
2803         UVM_ASSERT_MSG(channel_subctx_info->total_refcount > 0,
2804                        "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: TSG refcount is 0\n",
2805                        user_channel->hw_runlist_id,
2806                        user_channel->hw_channel_id,
2807                        instance_ptr.address,
2808                        uvm_aperture_string(instance_ptr.aperture),
2809                        user_channel->subctx_id,
2810                        user_channel->tsg.id);
2811     }
2812     else {
2813         UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].va_space == NULL,
2814                        "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: expected VA space NULL but got 0x%llx instead\n",
2815                        user_channel->hw_runlist_id,
2816                        user_channel->hw_channel_id,
2817                        instance_ptr.address,
2818                        uvm_aperture_string(instance_ptr.aperture),
2819                        user_channel->subctx_id,
2820                        user_channel->tsg.id,
2821                        (NvU64)channel_subctx_info->subctxs[user_channel->subctx_id].va_space);
2822 
2823         channel_subctx_info->subctxs[user_channel->subctx_id].va_space = va_space;
2824     }
2825 
2826     ++channel_subctx_info->total_refcount;
2827 
2828 exit_unlock:
2829     uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock);
2830 
2831     // Free the pre-allocated per-TSG subctx information struct if there was
2832     // an error or it was not used
2833     if (status != NV_OK || user_channel->subctx_info != new_channel_subctx_info) {
2834         if (new_channel_subctx_info)
2835             uvm_kvfree(new_channel_subctx_info->subctxs);
2836 
2837         uvm_kvfree(new_channel_subctx_info);
2838     }
2839 
2840     return status;
2841 }
2842 
2843 static void gpu_remove_user_channel_subctx_info_locked(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel)
2844 {
2845     uvm_gpu_phys_address_t instance_ptr = user_channel->instance_ptr.addr;
2846     uvm_va_space_t *va_space = user_channel->gpu_va_space->va_space;
2847 
2848     uvm_assert_spinlock_locked(&gpu->parent->instance_ptr_table_lock);
2849 
2850     // The channel's subcontext info descriptor may not have been registered
2851     // in tsg_table, since this function is also called in some teardown paths
2852     // during channel creation
2853     if (!user_channel->subctx_info)
2854         return;
2855 
2856     UVM_ASSERT_MSG(&user_channel->subctx_info->node ==
2857                    uvm_rb_tree_find(&gpu->parent->tsg_table, user_channel->subctx_info->node.key),
2858                    "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: SubCTX not found in TSG table\n",
2859                    user_channel->hw_runlist_id,
2860                    user_channel->hw_channel_id,
2861                    instance_ptr.address,
2862                    uvm_aperture_string(instance_ptr.aperture),
2863                    user_channel->subctx_id,
2864                    user_channel->tsg.id);
2865 
2866     UVM_ASSERT_MSG(user_channel->subctx_info->subctxs[user_channel->subctx_id].refcount > 0,
2867                    "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: SubCTX refcount is 0\n",
2868                    user_channel->hw_runlist_id,
2869                    user_channel->hw_channel_id,
2870                    instance_ptr.address,
2871                    uvm_aperture_string(instance_ptr.aperture),
2872                    user_channel->subctx_id,
2873                    user_channel->tsg.id);
2874 
2875     UVM_ASSERT_MSG(user_channel->subctx_info->subctxs[user_channel->subctx_id].va_space == va_space,
2876                    "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: expected VA space 0x%llx but got 0x%llx instead\n",
2877                    user_channel->hw_runlist_id,
2878                    user_channel->hw_channel_id,
2879                    instance_ptr.address,
2880                    uvm_aperture_string(instance_ptr.aperture),
2881                    user_channel->subctx_id,
2882                    user_channel->tsg.id,
2883                    (NvU64)va_space,
2884                    (NvU64)user_channel->subctx_info->subctxs[user_channel->subctx_id].va_space);
2885 
2886     UVM_ASSERT_MSG(user_channel->subctx_info->total_refcount > 0,
2887                    "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: TSG refcount is 0\n",
2888                    user_channel->hw_runlist_id,
2889                    user_channel->hw_channel_id,
2890                    instance_ptr.address,
2891                    uvm_aperture_string(instance_ptr.aperture),
2892                    user_channel->subctx_id,
2893                    user_channel->tsg.id);
2894 
2895     // Decrement VA space refcount. If it gets to zero, unregister the pointer
2896     if (--user_channel->subctx_info->subctxs[user_channel->subctx_id].refcount == 0)
2897         user_channel->subctx_info->subctxs[user_channel->subctx_id].va_space = NULL;
2898 
2899     if (--user_channel->subctx_info->total_refcount == 0) {
2900         uvm_rb_tree_remove(&gpu->parent->tsg_table, &user_channel->subctx_info->node);
2901         uvm_kvfree(user_channel->subctx_info->subctxs);
2902         uvm_kvfree(user_channel->subctx_info);
2903     }
2904 
2905     user_channel->subctx_info = NULL;
2906 }
2907 
2908 static void gpu_remove_user_channel_subctx_info(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel)
2909 {
2910     uvm_spin_lock(&gpu->parent->instance_ptr_table_lock);
2911     gpu_remove_user_channel_subctx_info_locked(gpu, user_channel);
2912     uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock);
2913 }
2914 
2915 static void gpu_add_user_channel_instance_ptr(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel)
2916 {
2917     uvm_gpu_phys_address_t instance_ptr = user_channel->instance_ptr.addr;
2918     NvU64 instance_ptr_key = instance_ptr_to_key(instance_ptr);
2919     NV_STATUS status;
2920 
2921     uvm_spin_lock(&gpu->parent->instance_ptr_table_lock);
2922 
2923     // Insert the instance_ptr -> user_channel mapping
2924     user_channel->instance_ptr.node.key = instance_ptr_key;
2925     status = uvm_rb_tree_insert(&gpu->parent->instance_ptr_table, &user_channel->instance_ptr.node);
2926 
2927     uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock);
2928 
2929     UVM_ASSERT_MSG(status == NV_OK, "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: error %s\n",
2930                    user_channel->hw_runlist_id,
2931                    user_channel->hw_channel_id,
2932                    instance_ptr.address,
2933                    uvm_aperture_string(instance_ptr.aperture),
2934                    user_channel->subctx_id,
2935                    user_channel->tsg.id,
2936                    nvstatusToString(status));
2937 }
2938 
2939 static void gpu_remove_user_channel_instance_ptr_locked(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel)
2940 {
2941     uvm_assert_spinlock_locked(&gpu->parent->instance_ptr_table_lock);
2942 
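    // Nothing to do if the instance pointer was never added to the table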
2943     if (UVM_RB_TREE_EMPTY_NODE(&user_channel->instance_ptr.node))
2944         return;
2945 
2946     uvm_rb_tree_remove(&gpu->parent->instance_ptr_table, &user_channel->instance_ptr.node);
2947 }
2948 
2949 NV_STATUS uvm_gpu_add_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel)
2950 {
2951     uvm_va_space_t *va_space;
2952     uvm_gpu_va_space_t *gpu_va_space = user_channel->gpu_va_space;
2953     NV_STATUS status;
2954 
2955     UVM_ASSERT(user_channel->rm_retained_channel);
2956     UVM_ASSERT(gpu_va_space);
2957     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
2958     va_space = gpu_va_space->va_space;
2959     uvm_assert_rwsem_locked(&va_space->lock);
2960 
2961     status = gpu_add_user_channel_subctx_info(gpu, user_channel);
2962     if (status != NV_OK)
2963         return status;
2964 
2965     gpu_add_user_channel_instance_ptr(gpu, user_channel);
2966 
2967     return NV_OK;
2968 }
2969 
2970 static uvm_user_channel_t *instance_ptr_to_user_channel(uvm_gpu_t *gpu, uvm_gpu_phys_address_t instance_ptr)
2971 {
2972     NvU64 key = instance_ptr_to_key(instance_ptr);
2973     uvm_rb_tree_node_t *instance_node;
2974 
2975     uvm_assert_spinlock_locked(&gpu->parent->instance_ptr_table_lock);
2976 
2977     instance_node = uvm_rb_tree_find(&gpu->parent->instance_ptr_table, key);
2978     if (!instance_node)
2979         return NULL;
2980 
2981     return get_user_channel(instance_node);
2982 }
2983 
2984 static uvm_va_space_t *user_channel_and_subctx_to_va_space(uvm_user_channel_t *user_channel, NvU32 subctx_id)
2985 {
2986     uvm_user_channel_subctx_info_t *channel_subctx_info;
2987 
2988     UVM_ASSERT(user_channel);
2989     UVM_ASSERT(user_channel->in_subctx);
2990     UVM_ASSERT(user_channel->subctx_info);
2991 
2992     uvm_assert_spinlock_locked(&user_channel->gpu->parent->instance_ptr_table_lock);
2993 
2994     channel_subctx_info = user_channel->subctx_info;
2995 
2996     UVM_ASSERT_MSG(subctx_id < user_channel->tsg.max_subctx_count,
2997                    "instance_ptr {0x%llx:%s} in TSG %u. Invalid SubCTX %u\n",
2998                    user_channel->instance_ptr.addr.address,
2999                    uvm_aperture_string(user_channel->instance_ptr.addr.aperture),
3000                    user_channel->tsg.id,
3001                    subctx_id);
3002     UVM_ASSERT_MSG(channel_subctx_info->total_refcount > 0,
3003                    "instance_ptr {0x%llx:%s} in TSG %u: TSG refcount is 0\n",
3004                    user_channel->instance_ptr.addr.address,
3005                    uvm_aperture_string(user_channel->instance_ptr.addr.aperture),
3006                    user_channel->tsg.id);
3007 
3008     // A subcontext's refcount can be zero if that subcontext is torn down
3009     // uncleanly and work from that subcontext continues running with work from
3010     // other subcontexts.
3011     if (channel_subctx_info->subctxs[subctx_id].refcount == 0) {
3012         UVM_ASSERT(channel_subctx_info->subctxs[subctx_id].va_space == NULL);
3013     }
3014     else {
3015         UVM_ASSERT_MSG(channel_subctx_info->subctxs[subctx_id].va_space,
3016                        "instance_ptr {0x%llx:%s} in TSG %u: no VA space for SubCTX %u\n",
3017                        user_channel->instance_ptr.addr.address,
3018                        uvm_aperture_string(user_channel->instance_ptr.addr.aperture),
3019                        user_channel->tsg.id,
3020                        subctx_id);
3021     }
3022 
3023     return channel_subctx_info->subctxs[subctx_id].va_space;
3024 }
3025 
3026 NV_STATUS uvm_gpu_fault_entry_to_va_space(uvm_gpu_t *gpu,
3027                                           uvm_fault_buffer_entry_t *fault,
3028                                           uvm_va_space_t **out_va_space)
3029 {
3030     uvm_user_channel_t *user_channel;
3031     NV_STATUS status = NV_OK;
3032 
3033     *out_va_space = NULL;
3034 
3035     uvm_spin_lock(&gpu->parent->instance_ptr_table_lock);
3036 
3037     user_channel = instance_ptr_to_user_channel(gpu, fault->instance_ptr);
3038     if (!user_channel) {
3039         status = NV_ERR_INVALID_CHANNEL;
3040         goto exit_unlock;
3041     }
3042 
3043     // Faults from HUB clients will always report VEID 0 even if the channel
3044     // belongs to a TSG with many subcontexts. Therefore, we cannot use the
3045     // per-TSG subctx table and we need to directly return the channel's VA space
3046     if (!user_channel->in_subctx || (fault->fault_source.client_type == UVM_FAULT_CLIENT_TYPE_HUB)) {
3047         UVM_ASSERT_MSG(fault->fault_source.ve_id == 0,
3048                        "Fault packet contains SubCTX %u for channel not in subctx\n",
3049                        fault->fault_source.ve_id);
3050 
3051         // We can safely access user_channel->gpu_va_space under the
3052         // instance_ptr_table_lock since gpu_va_space is set to NULL after this
3053         // function is called in uvm_user_channel_detach
3054         UVM_ASSERT(uvm_gpu_va_space_state(user_channel->gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
3055         *out_va_space = user_channel->gpu_va_space->va_space;
3056     }
3057     else {
3058         NvU32 ve_id = fault->fault_source.ve_id;
3059 
3060         // Compute the SMC engine-local VEID
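        // For example (illustrative values): with smc_engine_ve_id_offset ==
        // 32, a fault reporting global VEID 35 resolves to SubCTX 3 within the
        // channel's TSG.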
3061         UVM_ASSERT(ve_id >= user_channel->smc_engine_ve_id_offset);
3062 
3063         ve_id -= user_channel->smc_engine_ve_id_offset;
3064 
3065         *out_va_space = user_channel_and_subctx_to_va_space(user_channel, ve_id);
3066 
3067         // Instance pointer is valid but the fault targets a non-existent
3068         // subcontext.
3069         if (!*out_va_space)
3070             status = NV_ERR_PAGE_TABLE_NOT_AVAIL;
3071     }
3072 
3073 exit_unlock:
3074     uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock);
3075 
3076     return status;
3077 }
3078 
3079 NV_STATUS uvm_gpu_access_counter_entry_to_va_space(uvm_gpu_t *gpu,
3080                                                    uvm_access_counter_buffer_entry_t *entry,
3081                                                    uvm_va_space_t **out_va_space)
3082 {
3083     uvm_user_channel_t *user_channel;
3084     NV_STATUS status = NV_OK;
3085 
3086     *out_va_space = NULL;
3087     UVM_ASSERT(entry->address.is_virtual);
3088 
3089     uvm_spin_lock(&gpu->parent->instance_ptr_table_lock);
3090 
3091     user_channel = instance_ptr_to_user_channel(gpu, entry->virtual_info.instance_ptr);
3092     if (!user_channel) {
3093         status = NV_ERR_INVALID_CHANNEL;
3094         goto exit_unlock;
3095     }
3096 
3097     if (!user_channel->in_subctx) {
3098         UVM_ASSERT_MSG(entry->virtual_info.ve_id == 0,
3099                        "Access counter packet contains SubCTX %u for channel not in subctx\n",
3100                        entry->virtual_info.ve_id);
3101 
3102         UVM_ASSERT(uvm_gpu_va_space_state(user_channel->gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
3103         *out_va_space = user_channel->gpu_va_space->va_space;
3104     }
3105     else {
3106         *out_va_space = user_channel_and_subctx_to_va_space(user_channel, entry->virtual_info.ve_id);
3107         if (!*out_va_space)
3108             status = NV_ERR_PAGE_TABLE_NOT_AVAIL;
3109     }
3110 
3111 exit_unlock:
3112     uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock);
3113 
3114     return status;
3115 }
3116 
3117 void uvm_gpu_remove_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel)
3118 {
3119     uvm_va_space_t *va_space;
3120     uvm_gpu_va_space_t *gpu_va_space = user_channel->gpu_va_space;
3121 
3122     UVM_ASSERT(user_channel->rm_retained_channel);
3123     UVM_ASSERT(gpu_va_space);
3124     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
3125     va_space = gpu_va_space->va_space;
3126     uvm_assert_rwsem_locked_write(&va_space->lock);
3127 
3128     uvm_spin_lock(&gpu->parent->instance_ptr_table_lock);
3129     gpu_remove_user_channel_subctx_info_locked(gpu, user_channel);
3130     gpu_remove_user_channel_instance_ptr_locked(gpu, user_channel);
3131     uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock);
3132 }
3133 
3134 static NvU64 gpu_addr_to_dma_addr(uvm_parent_gpu_t *parent_gpu, NvU64 gpu_addr)
3135 {
3136     NvU64 dma_addr = gpu_addr;
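    // Check that adding dma_addressable_start below cannot wrap around 64 bits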
3137     UVM_ASSERT(dma_addr <= dma_addr + parent_gpu->dma_addressable_start);
3138 
3139     if (parent_gpu->npu)
3140         dma_addr = nv_expand_nvlink_addr(dma_addr);
3141 
3142     dma_addr += parent_gpu->dma_addressable_start;
3143 
3144     return dma_addr;
3145 }
3146 
3147 // The GPU has its NV_PFB_XV_UPPER_ADDR register set by RM to
3148 // dma_addressable_start (in bifSetupDmaWindow_IMPL()) and hence when
3149 // referencing sysmem from the GPU, dma_addressable_start should be
3150 // subtracted from the DMA address we get from the OS.
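// Illustrative example (made-up values, ignoring the NVLink/NPU
// transformation): with dma_addressable_start == 0x100000000, a sysmem page
// the OS maps at DMA address 0x100042000 is referenced by the GPU as 0x42000,
// and gpu_addr_to_dma_addr() applies the inverse transformation when the
// mapping is torn down.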
3151 static NvU64 dma_addr_to_gpu_addr(uvm_parent_gpu_t *parent_gpu, NvU64 dma_addr)
3152 {
3153     NvU64 gpu_addr = dma_addr - parent_gpu->dma_addressable_start;
3154     UVM_ASSERT(dma_addr >= gpu_addr);
3155 
3156     // See Bug 1920398 for background and details about NVLink DMA address
3157     // transformations being applied here.
3158     if (parent_gpu->npu)
3159         gpu_addr = nv_compress_nvlink_addr(gpu_addr);
3160 
3161     return gpu_addr;
3162 }
3163 
3164 void *uvm_gpu_dma_alloc_page(uvm_parent_gpu_t *parent_gpu, gfp_t gfp_flags, NvU64 *dma_address_out)
3165 {
3166     NvU64 dma_addr;
3167     void *cpu_addr;
3168 
3169     cpu_addr = dma_alloc_coherent(&parent_gpu->pci_dev->dev, PAGE_SIZE, &dma_addr, gfp_flags);
3170 
3171     if (!cpu_addr)
3172         return cpu_addr;
3173 
3174     *dma_address_out = dma_addr_to_gpu_addr(parent_gpu, dma_addr);
3175     atomic64_add(PAGE_SIZE, &parent_gpu->mapped_cpu_pages_size);
3176     return cpu_addr;
3177 }
3178 
3179 void uvm_gpu_dma_free_page(uvm_parent_gpu_t *parent_gpu, void *va, NvU64 dma_address)
3180 {
3181     dma_address = gpu_addr_to_dma_addr(parent_gpu, dma_address);
3182     dma_free_coherent(&parent_gpu->pci_dev->dev, PAGE_SIZE, va, dma_address);
3183     atomic64_sub(PAGE_SIZE, &parent_gpu->mapped_cpu_pages_size);
3184 }
3185 
3186 NV_STATUS uvm_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu, struct page *page, size_t size, NvU64 *dma_address_out)
3187 {
3188     NvU64 dma_addr;
3189 
3190     UVM_ASSERT(PAGE_ALIGNED(size));
3191 
3192     dma_addr = dma_map_page(&parent_gpu->pci_dev->dev, page, 0, size, DMA_BIDIRECTIONAL);
3193     if (dma_mapping_error(&parent_gpu->pci_dev->dev, dma_addr))
3194         return NV_ERR_OPERATING_SYSTEM;
3195 
3196     if (dma_addr < parent_gpu->dma_addressable_start ||
3197         dma_addr + size - 1 > parent_gpu->dma_addressable_limit) {
3198         dma_unmap_page(&parent_gpu->pci_dev->dev, dma_addr, size, DMA_BIDIRECTIONAL);
3199         UVM_ERR_PRINT_RL("PCI mapped range [0x%llx, 0x%llx) not in the addressable range [0x%llx, 0x%llx), GPU %s\n",
3200                          dma_addr,
3201                          dma_addr + (NvU64)size,
3202                          parent_gpu->dma_addressable_start,
3203                          parent_gpu->dma_addressable_limit + 1,
3204                          parent_gpu->name);
3205         return NV_ERR_INVALID_ADDRESS;
3206     }
3207 
3208     atomic64_add(size, &parent_gpu->mapped_cpu_pages_size);
3209     *dma_address_out = dma_addr_to_gpu_addr(parent_gpu, dma_addr);
3210 
3211     return NV_OK;
3212 }
3213 
3214 void uvm_gpu_unmap_cpu_pages(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address, size_t size)
3215 {
3216     UVM_ASSERT(PAGE_ALIGNED(size));
3217 
3218     dma_address = gpu_addr_to_dma_addr(parent_gpu, dma_address);
3219     dma_unmap_page(&parent_gpu->pci_dev->dev, dma_address, size, DMA_BIDIRECTIONAL);
3220     atomic64_sub(size, &parent_gpu->mapped_cpu_pages_size);
3221 }
3222 
3223 // This function implements the UvmRegisterGpu API call, as described in uvm.h.
3224 // Notes:
3225 //
3226 // 1. The UVM VA space has a 1-to-1 relationship with an open instance of
3227 // /dev/nvidia-uvm. That, in turn, has a 1-to-1 relationship with a process,
3228 // because the user-level UVM code (os-user-linux.c, for example) enforces an
3229 // "open /dev/nvidia-uvm only once per process" policy. So a UVM VA space is
3230 // very close to a process's VA space.
3231 //
3232 // If that user space code fails or is not used, then the relationship is no
3233 // longer 1-to-1. In that situation, this code must still avoid crashing,
3234 // leaking resources, exhibiting security holes, etc., but it does not have to
3235 // provide correct UVM API behavior. Correct UVM API behavior requires doing
3236 // the right things in user space before calling into the kernel.
3237 //
3238 // 2. The uvm_api*() routines are invoked directly from the top-level ioctl
3239 // handler. They are considered "API routing routines", because they are
3240 // responsible for providing the behavior that is described in the UVM
3241 // user-to-kernel API documentation, in uvm.h.
3242 //
3243 // 3. A GPU VA space, which you'll see in other parts of the driver,
3244 // is something different: there may be more than one
3245 // GPU VA space within a process, and therefore within a UVM VA space.
3246 //
3247 NV_STATUS uvm_api_register_gpu(UVM_REGISTER_GPU_PARAMS *params, struct file *filp)
3248 {
3249     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3250     uvm_rm_user_object_t user_rm_va_space = {
3251         .rm_control_fd = params->rmCtrlFd,
3252         .user_client   = params->hClient,
3253         .user_object   = params->hSmcPartRef,
3254     };
3255 
3256     return uvm_va_space_register_gpu(va_space,
3257                                      &params->gpu_uuid,
3258                                      &user_rm_va_space,
3259                                      &params->numaEnabled,
3260                                      &params->numaNodeId);
3261 }
3262 
3263 NV_STATUS uvm_api_unregister_gpu(UVM_UNREGISTER_GPU_PARAMS *params, struct file *filp)
3264 {
3265     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3266 
3267     return uvm_va_space_unregister_gpu(va_space, &params->gpu_uuid);
3268 }
3269 
3270 NV_STATUS uvm_api_register_gpu_va_space(UVM_REGISTER_GPU_VASPACE_PARAMS *params, struct file *filp)
3271 {
3272     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3273     uvm_rm_user_object_t user_rm_va_space = {
3274         .rm_control_fd = params->rmCtrlFd,
3275         .user_client   = params->hClient,
3276         .user_object   = params->hVaSpace
3277     };
3278     return uvm_va_space_register_gpu_va_space(va_space, &user_rm_va_space, &params->gpuUuid);
3279 }
3280 
3281 NV_STATUS uvm_api_unregister_gpu_va_space(UVM_UNREGISTER_GPU_VASPACE_PARAMS *params, struct file *filp)
3282 {
3283     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3284     return uvm_va_space_unregister_gpu_va_space(va_space, &params->gpuUuid);
3285 }
3286 
3287 NV_STATUS uvm_api_pageable_mem_access_on_gpu(UVM_PAGEABLE_MEM_ACCESS_ON_GPU_PARAMS *params, struct file *filp)
3288 {
3289     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3290     uvm_gpu_t *gpu;
3291 
3292     uvm_va_space_down_read(va_space);
3293     gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpu_uuid);
3294 
3295     if (!gpu) {
3296         uvm_va_space_up_read(va_space);
3297         return NV_ERR_INVALID_DEVICE;
3298     }
3299 
3300     if (uvm_va_space_pageable_mem_access_supported(va_space) && gpu->parent->replayable_faults_supported)
3301         params->pageableMemAccess = NV_TRUE;
3302     else
3303         params->pageableMemAccess = NV_FALSE;
3304 
3305     uvm_va_space_up_read(va_space);
3306     return NV_OK;
3307 }
3308 
3309 NV_STATUS uvm_test_set_prefetch_filtering(UVM_TEST_SET_PREFETCH_FILTERING_PARAMS *params, struct file *filp)
3310 {
3311     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3312     uvm_gpu_t *gpu = NULL;
3313     NV_STATUS status = NV_OK;
3314 
3315     uvm_mutex_lock(&g_uvm_global.global_lock);
3316 
3317     uvm_va_space_down_read(va_space);
3318 
3319     gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpu_uuid);
3320 
3321     if (!gpu) {
3322         status = NV_ERR_INVALID_DEVICE;
3323         goto done;
3324     }
3325 
3326     if (!gpu->parent->isr.replayable_faults.handling || !gpu->parent->prefetch_fault_supported) {
3327         status = NV_ERR_INVALID_DEVICE;
3328         goto done;
3329     }
3330 
3331     switch (params->filtering_mode) {
3332         case UVM_TEST_PREFETCH_FILTERING_MODE_FILTER_ALL:
3333             uvm_gpu_disable_prefetch_faults(gpu->parent);
3334             break;
3335         case UVM_TEST_PREFETCH_FILTERING_MODE_FILTER_NONE:
3336             uvm_gpu_enable_prefetch_faults(gpu->parent);
3337             break;
3338         default:
3339             status = NV_ERR_INVALID_ARGUMENT;
3340             break;
3341     }
3342 
3343 done:
3344     uvm_va_space_up_read(va_space);
3345 
3346     uvm_mutex_unlock(&g_uvm_global.global_lock);
3347     return status;
3348 }
3349 
3350 NV_STATUS uvm_test_get_gpu_time(UVM_TEST_GET_GPU_TIME_PARAMS *params, struct file *filp)
3351 {
3352     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3353     uvm_gpu_t *gpu = NULL;
3354     NV_STATUS status = NV_OK;
3355 
3356     uvm_va_space_down_read(va_space);
3357 
3358     gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpu_uuid);
3359 
3360     if (gpu)
3361         params->timestamp_ns = gpu->parent->host_hal->get_time(gpu);
3362     else
3363         status = NV_ERR_INVALID_DEVICE;
3364 
3365     uvm_va_space_up_read(va_space);
3366 
3367     return status;
3368 }
3369