/*******************************************************************************
    Copyright (c) 2016-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_common.h"
#include "uvm_linux.h"
#include "uvm_forward_decl.h"
#include "uvm_lock.h"
#include "uvm_mmu.h"
#include "uvm_api.h"
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_push.h"
#include "uvm_va_space.h"
#include "uvm_va_range.h"
#include "uvm_tracker.h"
#include "uvm_hal.h"
#include "uvm_hal_types.h"
#include "uvm_map_external.h"
#include "uvm_pte_batch.h"
#include "uvm_tlb_batch.h"
#include "nv_uvm_interface.h"

#include "uvm_pushbuffer.h"

// Assume almost all of the push space can be used for PTEs, leaving 1K of margin.
#define MAX_COPY_SIZE_PER_PUSH ((size_t)(UVM_MAX_PUSH_SIZE - 1024))

typedef struct
{
    // The VA range the buffer is for
    uvm_va_range_t *va_range;

    // The GPU that's mapping the VA range
    uvm_gpu_t *gpu;

    // Mapping info used for querying PTEs from RM
    UvmGpuExternalMappingInfo mapping_info;

    // Size of the buffer
    size_t buffer_size;

    // Page size in bytes
    NvU32 page_size;

    // Size of a single PTE in bytes
    NvU32 pte_size;

    // Max PTE offset covered by the VA range.
    //
    // Note that the mapping might not start at offset 0, so the max PTE offset
    // can be larger than the number of PTEs covering the VA range.
    size_t max_pte_offset;

    // Number of PTEs currently in the buffer
    size_t num_ptes;

    // PTE offset at which the currently buffered PTEs start.
    size_t pte_offset;
} uvm_pte_buffer_t;

// Max PTE buffer size is the size of the buffer used for querying PTEs from RM.
// It has to be big enough to amortize the cost of calling into RM, but small
// enough to fit in CPU caches as it's written and read multiple times on the
// CPU before it ends up in the pushbuffer.
// 96K seems to be a sweet spot, at least on a Xeon W5580 system, though this
// could use some benchmarking on more systems.
#define MAX_PTE_BUFFER_SIZE ((size_t)96 * 1024)

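// Set up a PTE buffer for mapping a VA range on the given GPU: record the RM
// mapping parameters from map_rm_params and allocate the intermediate buffer
// that uvm_pte_buffer_get() uses to query PTEs from RM.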
static NV_STATUS uvm_pte_buffer_init(uvm_va_range_t *va_range,
                                     uvm_gpu_t *gpu,
                                     const uvm_map_rm_params_t *map_rm_params,
                                     NvU64 length,
                                     NvU32 page_size,
                                     uvm_pte_buffer_t *pte_buffer)
{
    uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_range->va_space, gpu);
    uvm_page_tree_t *tree = &gpu_va_space->page_tables;
    size_t num_all_ptes;

    memset(pte_buffer, 0, sizeof(*pte_buffer));

    pte_buffer->va_range = va_range;
    pte_buffer->gpu = gpu;
    pte_buffer->mapping_info.cachingType = map_rm_params->caching_type;
    pte_buffer->mapping_info.mappingType = map_rm_params->mapping_type;
    pte_buffer->mapping_info.formatType = map_rm_params->format_type;
    pte_buffer->mapping_info.elementBits = map_rm_params->element_bits;
    pte_buffer->mapping_info.compressionType = map_rm_params->compression_type;
    if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL)
        pte_buffer->mapping_info.mappingPageSize = page_size;

    pte_buffer->page_size = page_size;
    pte_buffer->pte_size = uvm_mmu_pte_size(tree, page_size);
    num_all_ptes = uvm_div_pow2_64(length, page_size);
    pte_buffer->max_pte_offset = uvm_div_pow2_64(map_rm_params->map_offset, page_size) + num_all_ptes;
    pte_buffer->buffer_size = min(MAX_PTE_BUFFER_SIZE, num_all_ptes * pte_buffer->pte_size);

    pte_buffer->mapping_info.pteBuffer = uvm_kvmalloc(pte_buffer->buffer_size);
    if (!pte_buffer->mapping_info.pteBuffer)
        return NV_ERR_NO_MEMORY;

    return NV_OK;
}

static void uvm_pte_buffer_deinit(uvm_pte_buffer_t *pte_buffer)
{
    uvm_kvfree(pte_buffer->mapping_info.pteBuffer);
}

// Get the PTEs for mapping the [map_offset, map_offset + map_size) VA range.
static NV_STATUS uvm_pte_buffer_get(uvm_pte_buffer_t *pte_buffer,
                                    NvHandle mem_handle,
                                    NvU64 map_offset,
                                    NvU64 map_size,
                                    NvU64 **ptes_out)
{
    NV_STATUS status;
    size_t pte_offset;
    size_t num_ptes;
    size_t ptes_left;
    uvm_va_range_t *va_range = pte_buffer->va_range;
    uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_range->va_space, pte_buffer->gpu);

    UVM_ASSERT(IS_ALIGNED(map_offset, pte_buffer->page_size));
    UVM_ASSERT(IS_ALIGNED(map_size, pte_buffer->page_size));

    pte_offset = uvm_div_pow2_64(map_offset, pte_buffer->page_size);
    num_ptes = uvm_div_pow2_64(map_size, pte_buffer->page_size);

    UVM_ASSERT(num_ptes <= pte_buffer->buffer_size / pte_buffer->pte_size);

    // If the requested range is already fully cached, just calculate its
    // offset within the buffer and return.
    if (pte_buffer->pte_offset <= pte_offset &&
        pte_buffer->pte_offset + pte_buffer->num_ptes >= pte_offset + num_ptes) {
        pte_offset -= pte_buffer->pte_offset;
        *ptes_out = (NvU64 *)((char *)pte_buffer->mapping_info.pteBuffer + pte_offset * pte_buffer->pte_size);
        return NV_OK;
    }

    // Otherwise get max possible PTEs from RM starting at the requested offset.
    pte_buffer->pte_offset = pte_offset;
    ptes_left = pte_buffer->max_pte_offset - pte_offset;
    pte_buffer->num_ptes = min(pte_buffer->buffer_size / pte_buffer->pte_size, ptes_left);

    UVM_ASSERT_MSG(pte_buffer->num_ptes >= num_ptes,
                   "buffer num ptes %zu < num ptes %zu\n",
                   pte_buffer->num_ptes,
                   num_ptes);

    // TODO: Bug 1735291: RM can determine the buffer size from the map_size
    //       parameter.
    pte_buffer->mapping_info.pteBufferSize = pte_buffer->num_ptes * pte_buffer->pte_size;

    if (va_range->type == UVM_VA_RANGE_TYPE_CHANNEL) {
        status = uvm_rm_locked_call(nvUvmInterfaceGetChannelResourcePtes(gpu_va_space->duped_gpu_va_space,
                                                                         va_range->channel.rm_descriptor,
                                                                         map_offset,
                                                                         pte_buffer->num_ptes * pte_buffer->page_size,
                                                                         &pte_buffer->mapping_info));
    }
    else {
        status = uvm_rm_locked_call(nvUvmInterfaceGetExternalAllocPtes(gpu_va_space->duped_gpu_va_space,
                                                                       mem_handle,
                                                                       map_offset,
                                                                       pte_buffer->num_ptes * pte_buffer->page_size,
                                                                       &pte_buffer->mapping_info));
    }

    if (status != NV_OK) {
        if (status != NV_ERR_NOT_READY) {
            UVM_ERR_PRINT("Failed to get %s mappings for VA range [0x%llx, 0x%llx], offset 0x%llx, size 0x%llx: %s\n",
                          va_range->type == UVM_VA_RANGE_TYPE_CHANNEL ? "channel" : "external",
                          va_range->node.start,
                          va_range->node.end,
                          map_offset,
                          map_size,
                          nvstatusToString(status));
        }
        return status;
    }

    *ptes_out = pte_buffer->mapping_info.pteBuffer;

    return NV_OK;
}

// Copies the input PTE buffer to the given physical address, with an optional
// TLB invalidate. The copy acquires the input tracker, then updates it.
static NV_STATUS copy_ptes(uvm_page_tree_t *tree,
                           NvU64 page_size,
                           uvm_gpu_phys_address_t pte_addr,
                           NvU64 *ptes,
                           NvU32 num_ptes,
                           bool last_mapping,
                           uvm_range_tree_node_t *range_node,
                           uvm_tracker_t *tracker)
{
    uvm_push_t push;
    NV_STATUS status;
    NvU32 pte_size = uvm_mmu_pte_size(tree, page_size);

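    // Check that the 32-bit product pte_size * num_ptes used below does not
    // overflow and that the whole write fits in a single push.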
    UVM_ASSERT(((NvU64)pte_size) * num_ptes == pte_size * num_ptes);
    UVM_ASSERT(pte_size * num_ptes <= MAX_COPY_SIZE_PER_PUSH);

    status = uvm_push_begin_acquire(tree->gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_MEMOPS,
                                    tracker,
                                    &push,
                                    "Writing %u bytes of PTEs to {%s, 0x%llx}",
                                    pte_size * num_ptes,
                                    uvm_aperture_string(pte_addr.aperture),
                                    pte_addr.address);
    if (status != NV_OK)
        return status;

    uvm_pte_batch_single_write_ptes(&push, pte_addr, ptes, pte_size, num_ptes);

    if (last_mapping) {
        // Do a TLB invalidate if this is the last mapping in the VA range.
        // Membar: This is a permissions upgrade, so no post-invalidate membar
        //         is needed.
        uvm_tlb_batch_single_invalidate(tree,
                                        &push,
                                        range_node->start,
                                        uvm_range_tree_node_size(range_node),
                                        page_size,
                                        UVM_MEMBAR_NONE);
    }
    else {
        // For pushes prior to the last one, the PTE batch write has already
        // pushed a membar that's enough to order the PTE writes with the TLB
        // invalidate in the last push, and that's all that's needed.
        // If a failure happens before the push for the last mapping, that's
        // still ok: what follows is more CE writes to unmap the PTEs, and
        // those will be ordered by the membar from the PTE batch.
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    }

    uvm_push_end(&push);

    // The push acquired the tracker so it's ok to just overwrite it with
    // the entry tracking the push.
    uvm_tracker_overwrite_with_push(tracker, &push);

    return NV_OK;
}

// Map all of pt_range, which is contained within the va_range and begins at
// virtual address map_start. The PTE values are queried from RM and the pushed
// writes are added to the input tracker.
//
// If the mapped range ends on range_node->end, a TLB invalidate for upgrade is
// also issued.
static NV_STATUS map_rm_pt_range(uvm_page_tree_t *tree,
                                 uvm_page_table_range_t *pt_range,
                                 uvm_pte_buffer_t *pte_buffer,
                                 uvm_range_tree_node_t *range_node,
                                 NvHandle mem_handle,
                                 NvU64 map_start,
                                 NvU64 map_offset,
                                 uvm_tracker_t *tracker)
{
    uvm_gpu_phys_address_t pte_addr;
    NvU64 page_size = pt_range->page_size;
    NvU32 pte_size = uvm_mmu_pte_size(tree, page_size);
    NvU64 addr, end;
    size_t max_ptes, ptes_left, num_ptes;
    NvU64 map_size;
    bool last_mapping;
    NV_STATUS status = NV_OK;

    end = map_start + uvm_page_table_range_size(pt_range) - 1;

    UVM_ASSERT(map_start >= range_node->start);
    UVM_ASSERT(end <= range_node->end);
    UVM_ASSERT(page_size & tree->hal->page_sizes());
    UVM_ASSERT(IS_ALIGNED(map_start, page_size));
    UVM_ASSERT(IS_ALIGNED(map_offset, page_size));

    pte_addr = uvm_page_table_range_entry_address(tree, pt_range, 0);
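    // Clamp the number of PTEs written per loop iteration to what a single PDE
    // covers, to what fits in a single push, and to the PTE buffer capacity.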
    max_ptes = min((size_t)(uvm_mmu_pde_coverage(tree, page_size) / page_size), MAX_COPY_SIZE_PER_PUSH / pte_size);
    max_ptes = min(max_ptes, pte_buffer->buffer_size / pte_size);

    addr = map_start;
    ptes_left = (size_t)uvm_div_pow2_64(uvm_page_table_range_size(pt_range), page_size);
    while (addr < end) {
        NvU64 *pte_bits;

        num_ptes = min(max_ptes, ptes_left);
        map_size = num_ptes * page_size;
        UVM_ASSERT(addr + map_size <= end + 1);

        status = uvm_pte_buffer_get(pte_buffer, mem_handle, map_offset, map_size, &pte_bits);
        if (status != NV_OK)
            return status;

        last_mapping = (addr + map_size - 1 == range_node->end);

        // These copies are technically independent, except for the last one
        // which issues the TLB invalidate and thus must wait for all others.
        // However, since each copy will saturate the bus anyway, we force them
        // to serialize to avoid bus contention.
        status = copy_ptes(tree,
                           page_size,
                           pte_addr,
                           pte_bits,
                           num_ptes,
                           last_mapping,
                           range_node,
                           tracker);
        if (status != NV_OK)
            return status;

        ptes_left -= num_ptes;
        pte_addr.address += num_ptes * pte_size;
        addr += map_size;
        map_offset += map_size;
    }

    return NV_OK;
}

// Determine the appropriate membar for downgrades on a VA range with type
// UVM_VA_RANGE_TYPE_EXTERNAL or UVM_VA_RANGE_TYPE_CHANNEL.
static uvm_membar_t va_range_downgrade_membar(uvm_va_range_t *va_range, uvm_ext_gpu_map_t *ext_gpu_map)
{
    if (va_range->type == UVM_VA_RANGE_TYPE_CHANNEL) {
        return uvm_hal_downgrade_membar_type(va_range->channel.gpu_va_space->gpu,
                                             va_range->channel.aperture == UVM_APERTURE_VID);
    }

    // If there is no mem_handle, this is a sparse mapping.
    // UVM_MEMBAR_GPU is sufficient because the debug pages remain allocated
    // until the GPU is torn down. GPU tear down implies that our context has
    // been switched out. In turn, this implies a sysmembar.
    if (!ext_gpu_map->mem_handle)
        return UVM_MEMBAR_GPU;

    return uvm_hal_downgrade_membar_type(ext_gpu_map->gpu,
                                         !ext_gpu_map->is_sysmem && ext_gpu_map->gpu == ext_gpu_map->owning_gpu);
}

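// Map the RM allocation described by mem_info on mapping_gpu. For external VA
// ranges the mapped region is described by ext_gpu_map; for channel VA ranges
// it is the VA range itself. PTE values are queried from RM and the pushed
// work is added to out_tracker.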
NV_STATUS uvm_va_range_map_rm_allocation(uvm_va_range_t *va_range,
                                         uvm_gpu_t *mapping_gpu,
                                         const UvmGpuMemoryInfo *mem_info,
                                         const uvm_map_rm_params_t *map_rm_params,
                                         uvm_ext_gpu_map_t *ext_gpu_map,
                                         uvm_tracker_t *out_tracker)
{
    uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_range->va_space, mapping_gpu);
    uvm_page_tree_t *page_tree;
    uvm_pte_buffer_t pte_buffer;
    uvm_page_table_range_vec_t *pt_range_vec;
    uvm_page_table_range_t *pt_range;
    uvm_range_tree_node_t *node;
    NvU64 addr, size;
    NvU64 map_offset = map_rm_params->map_offset;
    size_t i;
    NV_STATUS status;
    uvm_tracker_t *tracker;

    // Track local pushes in a separate tracker, instead of adding them
    // directly to the output tracker, to avoid false dependencies
    // (serialization) on unrelated work. The local tracker is added to the
    // output tracker before the function returns.
    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();

    // The local tracker is used when this function is called to map
    // allocations other than external allocations. External allocations use
    // their own trackers.
    if (ext_gpu_map)
        tracker = &ext_gpu_map->tracker;
    else
        tracker = &local_tracker;

    UVM_ASSERT(gpu_va_space);
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL || va_range->type == UVM_VA_RANGE_TYPE_CHANNEL);
    UVM_ASSERT(IS_ALIGNED(mem_info->size, mem_info->pageSize));
    UVM_ASSERT(out_tracker);

    page_tree = &gpu_va_space->page_tables;

    UVM_ASSERT(uvm_mmu_page_size_supported(page_tree, mem_info->pageSize));

    if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL) {
        // We should never be called with ext_gpu_map == NULL and
        // UVM_VA_RANGE_TYPE_EXTERNAL.
        UVM_ASSERT(ext_gpu_map != NULL);
        node = &ext_gpu_map->node;
        pt_range_vec = &ext_gpu_map->pt_range_vec;
    }
    else {
        node = &va_range->node;
        pt_range_vec = &va_range->channel.pt_range_vec;
    }

    if (map_offset + uvm_range_tree_node_size(node) > mem_info->size)
        return NV_ERR_INVALID_OFFSET;

    UVM_ASSERT(IS_ALIGNED(node->start, mem_info->pageSize) &&
               IS_ALIGNED(node->end + 1, mem_info->pageSize) &&
               IS_ALIGNED(map_offset, mem_info->pageSize));

    status = uvm_pte_buffer_init(va_range,
                                 mapping_gpu,
                                 map_rm_params,
                                 uvm_range_tree_node_size(node),
                                 mem_info->pageSize,
                                 &pte_buffer);
    if (status != NV_OK)
        return status;

    // Allocate all page tables for this VA range.
    //
    // TODO: Bug 1766649: Benchmark to see if we get any performance improvement
    //       from parallelizing page range allocation with writing PTEs for
    //       earlier ranges.
    status = uvm_page_table_range_vec_init(page_tree,
                                           node->start,
                                           uvm_range_tree_node_size(node),
                                           mem_info->pageSize,
                                           UVM_PMM_ALLOC_FLAGS_EVICT,
                                           pt_range_vec);
    if (status != NV_OK)
        goto out;

    addr = node->start;
    for (i = 0; i < pt_range_vec->range_count; i++) {
        pt_range = &pt_range_vec->ranges[i];

        // External allocations track pushes in their own trackers. User channel
        // mappings don't have their own trackers, so for those the local tracker
        // is used.
        status = map_rm_pt_range(page_tree,
                                 pt_range,
                                 &pte_buffer,
                                 node,
                                 ext_gpu_map ? ext_gpu_map->mem_handle->rm_handle : 0,
                                 addr,
                                 map_offset,
                                 tracker);
        if (status != NV_OK)
            goto out;

        size = uvm_page_table_range_size(pt_range);
        addr += size;
        map_offset += size;
    }

    status = uvm_tracker_add_tracker(out_tracker, tracker);

out:
    if (status != NV_OK) {
        // We could have any number of mappings in flight to these page tables,
        // so wait for everything before we clear and free them.
        if (uvm_tracker_wait(tracker) != NV_OK) {
            // System-fatal error. Just leak.
            return status;
        }

        if (pt_range_vec->ranges) {
            uvm_page_table_range_vec_clear_ptes(pt_range_vec, va_range_downgrade_membar(va_range, ext_gpu_map));
            uvm_page_table_range_vec_deinit(pt_range_vec);
        }
    }

    uvm_pte_buffer_deinit(&pte_buffer);
    uvm_tracker_deinit(&local_tracker);
    return status;
}

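// Validation helpers for the per-GPU attributes passed in from user space. The
// BUILD_BUG_ONs check that the UVM enums stay in sync with the corresponding
// UvmRm enums so the values can be passed through to RM unchanged.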
static bool uvm_api_mapping_type_invalid(UvmGpuMappingType map_type)
{
    BUILD_BUG_ON((int)UvmGpuMappingTypeDefault != (int)UvmRmGpuMappingTypeDefault);
    BUILD_BUG_ON((int)UvmGpuMappingTypeReadWriteAtomic != (int)UvmRmGpuMappingTypeReadWriteAtomic);
    BUILD_BUG_ON((int)UvmGpuMappingTypeReadWrite != (int)UvmRmGpuMappingTypeReadWrite);
    BUILD_BUG_ON((int)UvmGpuMappingTypeReadOnly != (int)UvmRmGpuMappingTypeReadOnly);
    BUILD_BUG_ON((int)UvmGpuMappingTypeCount != (int)UvmRmGpuMappingTypeCount);

    switch (map_type) {
        case UvmGpuMappingTypeDefault:
        case UvmGpuMappingTypeReadWriteAtomic:
        case UvmGpuMappingTypeReadWrite:
        case UvmGpuMappingTypeReadOnly:
            return false;
        default:
            return true;
    }
}

static bool uvm_api_caching_type_invalid(UvmGpuCachingType cache_type)
{
    BUILD_BUG_ON((int)UvmGpuCachingTypeDefault != (int)UvmRmGpuCachingTypeDefault);
    BUILD_BUG_ON((int)UvmGpuCachingTypeForceUncached != (int)UvmRmGpuCachingTypeForceUncached);
    BUILD_BUG_ON((int)UvmGpuCachingTypeForceCached != (int)UvmRmGpuCachingTypeForceCached);
    BUILD_BUG_ON((int)UvmGpuCachingTypeCount != (int)UvmRmGpuCachingTypeCount);

    switch (cache_type) {
        case UvmGpuCachingTypeDefault:
        case UvmGpuCachingTypeForceUncached:
        case UvmGpuCachingTypeForceCached:
            return false;
        default:
            return true;
    }
}

static bool uvm_api_kind_type_invalid(UvmGpuFormatType format_type,
                                      UvmGpuFormatElementBits element_bits,
                                      UvmGpuCompressionType compression_type)
{
    BUILD_BUG_ON((int)UvmGpuFormatTypeDefault != (int)UvmRmGpuFormatTypeDefault);
    BUILD_BUG_ON((int)UvmGpuFormatTypeBlockLinear != (int)UvmRmGpuFormatTypeBlockLinear);
    BUILD_BUG_ON((int)UvmGpuFormatTypeCount != (int)UvmRmGpuFormatTypeCount);

    BUILD_BUG_ON((int)UvmGpuFormatElementBitsDefault != (int)UvmRmGpuFormatElementBitsDefault);
    BUILD_BUG_ON((int)UvmGpuFormatElementBits8 != (int)UvmRmGpuFormatElementBits8);
    BUILD_BUG_ON((int)UvmGpuFormatElementBits16 != (int)UvmRmGpuFormatElementBits16);
    BUILD_BUG_ON((int)UvmGpuFormatElementBits32 != (int)UvmRmGpuFormatElementBits32);
    BUILD_BUG_ON((int)UvmGpuFormatElementBits64 != (int)UvmRmGpuFormatElementBits64);
    BUILD_BUG_ON((int)UvmGpuFormatElementBits128 != (int)UvmRmGpuFormatElementBits128);
    BUILD_BUG_ON((int)UvmGpuFormatElementBitsCount != (int)UvmRmGpuFormatElementBitsCount);

    BUILD_BUG_ON((int)UvmGpuCompressionTypeDefault != (int)UvmRmGpuCompressionTypeDefault);
    BUILD_BUG_ON((int)UvmGpuCompressionTypeEnabledNoPlc != (int)UvmRmGpuCompressionTypeEnabledNoPlc);
    BUILD_BUG_ON((int)UvmGpuCompressionTypeCount != (int)UvmRmGpuCompressionTypeCount);

    if (compression_type >= UvmGpuCompressionTypeCount)
        return true;

    switch (format_type) {
        case UvmGpuFormatTypeDefault:
        case UvmGpuFormatTypeBlockLinear:
            break;
        default:
            return true;
    }

    switch (element_bits) {
        case UvmGpuFormatElementBitsDefault:
        case UvmGpuFormatElementBits8:
        case UvmGpuFormatElementBits16:
        // CUDA does not support 24-bit width
        case UvmGpuFormatElementBits32:
        case UvmGpuFormatElementBits64:
        case UvmGpuFormatElementBits128:
            break;
        default:
            return true;
    }

    if (((format_type != UvmGpuFormatTypeDefault) && (element_bits == UvmGpuFormatElementBitsDefault)) ||
        ((element_bits != UvmGpuFormatElementBitsDefault) && (format_type == UvmGpuFormatTypeDefault)))
        return true;

    return false;
}

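// nv_kref release callback for uvm_ext_gpu_mem_handle: frees the duped RM
// handle (if any) and then the handle structure itself.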
static void uvm_release_rm_handle(struct nv_kref *ref)
{
    uvm_ext_gpu_mem_handle *mem_handle = container_of(ref, uvm_ext_gpu_mem_handle, ref_count);

    if (mem_handle->rm_handle) {
        NV_STATUS status;

        status = uvm_rm_locked_call(nvUvmInterfaceFreeDupedHandle(uvm_gpu_device_handle(mem_handle->gpu),
                                                                  mem_handle->rm_handle));
        UVM_ASSERT(status == NV_OK);
    }
    uvm_kvfree(mem_handle);
}

static NV_STATUS uvm_create_external_range(uvm_va_space_t *va_space, UVM_CREATE_EXTERNAL_RANGE_PARAMS *params)
{
    uvm_va_range_t *va_range = NULL;
    struct mm_struct *mm;
    NV_STATUS status = NV_OK;

    // Before we know the page size used by the allocation, we can only enforce
    // 4K alignment as that's the minimum page size used for GPU allocations.
    // Later uvm_map_external_allocation_on_gpu() will enforce alignment to the
    // page size used by the allocation.
    if (uvm_api_range_invalid_4k(params->base, params->length))
        return NV_ERR_INVALID_ADDRESS;

    // The mm needs to be locked in order to remove stale HMM va_blocks.
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_write(va_space);

    // Create the new external VA range.
    // uvm_va_range_create_external handles any collisions when it attempts to
    // insert the new range into the va_space range tree.
    status = uvm_va_range_create_external(va_space, mm, params->base, params->length, &va_range);
    if (status != NV_OK) {
        UVM_DBG_PRINT_RL("Failed to create external VA range [0x%llx, 0x%llx)\n",
                         params->base,
                         params->base + params->length);
    }

    uvm_va_space_up_write(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    return status;
}

NV_STATUS uvm_api_create_external_range(UVM_CREATE_EXTERNAL_RANGE_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    return uvm_create_external_range(va_space, params);
}

static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
                                          uvm_va_space_t *va_space,
                                          uvm_gpu_t *mapping_gpu,
                                          const UvmGpuMemoryInfo *mem_info)
{
    uvm_gpu_t *owning_gpu;

    if (!mem_info->deviceDescendant && !mem_info->sysmem) {
        ext_gpu_map->owning_gpu = NULL;
        ext_gpu_map->is_sysmem = false;
        return NV_OK;
    }
    // This is a local or peer allocation, so the owning GPU must have been
    // registered.
    owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &mem_info->uuid);
    if (!owning_gpu)
        return NV_ERR_INVALID_DEVICE;

    // Even if the allocation is in sysmem, it still matters which GPU owns it,
    // because our dup is not enough to keep the owning GPU around and that
    // exposes a bug in RM where the memory can outlast the GPU and then cause
    // crashes when it's eventually freed.
    // TODO: Bug 1811006: Bug tracking the RM issue, its fix might change the
    // semantics of sysmem allocations.
    if (mem_info->sysmem) {
        ext_gpu_map->owning_gpu = owning_gpu;
        ext_gpu_map->is_sysmem = true;
        return NV_OK;
    }

    if (owning_gpu != mapping_gpu) {
        // TODO: Bug 1757136: In SLI, the returned UUID may be different but a
        //       local mapping must be used. We need to query SLI groups to know
        //       that.
        if (!uvm_va_space_peer_enabled(va_space, mapping_gpu, owning_gpu))
            return NV_ERR_INVALID_DEVICE;
    }

    ext_gpu_map->owning_gpu = owning_gpu;
    ext_gpu_map->is_sysmem = false;
    return NV_OK;
}

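// Return the uvm_ext_gpu_map_t for mapping_gpu containing addr, or NULL if
// there is none. The VA space lock must be held.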
static uvm_ext_gpu_map_t *uvm_va_range_ext_gpu_map(uvm_va_range_t *va_range, uvm_gpu_t *mapping_gpu, NvU64 addr)
{
    uvm_ext_gpu_map_t *ext_gpu_map = NULL;
    uvm_range_tree_node_t *node;
    uvm_ext_gpu_range_tree_t *range_tree;

    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL);
    uvm_assert_rwsem_locked(&va_range->va_space->lock);

    range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);

    if (uvm_processor_mask_test(&va_range->external.mapped_gpus, mapping_gpu->id)) {
        UVM_ASSERT(!uvm_range_tree_empty(&range_tree->tree));
        node = uvm_range_tree_find(&range_tree->tree, addr);
        if (node) {
            ext_gpu_map = uvm_ext_gpu_map_container(node);
            UVM_ASSERT(ext_gpu_map->gpu == mapping_gpu);
        }
    }
    else {
        UVM_ASSERT(uvm_range_tree_empty(&range_tree->tree));
    }

    return ext_gpu_map;
}

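// Split existing_map at new_end + 1. The newly-created upper piece covers
// [new_end + 1, existing_map->node.end], is inserted into the tree, and is
// optionally returned in new_map.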
static NV_STATUS uvm_ext_gpu_map_split(uvm_range_tree_t *tree,
                                       uvm_ext_gpu_map_t *existing_map,
                                       NvU64 new_end,
                                       uvm_ext_gpu_map_t **new_map)
{
    uvm_ext_gpu_map_t *new;
    NV_STATUS status;
    NvU64 new_start = new_end + 1;

    if (!IS_ALIGNED(new_start, existing_map->pt_range_vec.page_size))
        return NV_ERR_INVALID_ADDRESS;

    UVM_ASSERT(new_start >= existing_map->node.start && new_start < existing_map->node.end);

    new = uvm_kvmalloc_zero(sizeof(*new));
    if (!new)
        return NV_ERR_NO_MEMORY;

    RB_CLEAR_NODE(&new->node.rb_node);
    new->mem_handle = existing_map->mem_handle;
    new->gpu = existing_map->gpu;
    new->owning_gpu = existing_map->owning_gpu;
    new->is_sysmem = existing_map->is_sysmem;

    // Initialize the new ext_gpu_map's tracker as a copy of the existing_map's
    // tracker. This way, operations on either of the two ext_gpu_maps will be
    // able to wait for any uncompleted work prior to the split.
    status = uvm_tracker_init_from(&new->tracker, &existing_map->tracker);
    if (status != NV_OK) {
        uvm_kvfree(new);
        return status;
    }

    status = uvm_page_table_range_vec_split_upper(&existing_map->pt_range_vec, new_start - 1, &new->pt_range_vec);
    if (status != NV_OK) {
        uvm_tracker_deinit(&new->tracker);
        uvm_kvfree(new);
        return status;
    }

    new->node.start = new_start;

    // Sparse mappings don't have actual allocations.
    if (new->mem_handle)
        nv_kref_get(&new->mem_handle->ref_count);

    uvm_range_tree_split(tree, &existing_map->node, &new->node);

    if (new_map)
        *new_map = new;

    return NV_OK;
}

static NV_STATUS uvm_unmap_external_in_range(uvm_va_range_t *va_range,
                                             uvm_gpu_t *gpu,
                                             NvU64 start,
                                             NvU64 end,
                                             struct list_head *deferred_list)
{
    uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, gpu);
    uvm_ext_gpu_map_t *ext_map, *ext_map_next = NULL;
    NV_STATUS status = NV_OK;

    uvm_assert_mutex_locked(&range_tree->lock);

    // If a previously existing sub-range is found (ext_map != NULL), the
    // new sub-range can overlap the existing one in one of the following ways:
    //
    //   1. complete overlap (exact start and end boundary matches are special
    //      cases of this):
    //           [---- existing ----]
    //       [----       new        ----]
    //   2. partial overlap at the start (end boundary match is a special case
    //      of this):
    //           [---- existing ----]
    //               [----    new    ----]
    //   3. partial overlap at the end (start boundary match is a special case
    //      of this):
    //           [---- existing ----]
    //       [----   new    ----]
    //   4. completely contained (start of new != start of existing and end of
    //      new != end of existing, otherwise see 1):
    //           [---- existing ----]
    //                [-- new --]
    //
    // The algorithm below is:
    //   1. If the start of the new mapping is greater than the start of the
    //      existing mapping, split the existing mapping at start. The newly
    //      created uvm_ext_gpu_map_t will be inserted into the tree. Note that
    //      the newly created uvm_ext_gpu_map_t is the one that we want to visit
    //      next. When the loop visits the newly created uvm_ext_gpu_map_t and
    //      its boundaries are completely overlapped by the new mapping, it will
    //      cause the algorithm to destroy it.
    //   2. If the end of the new mapping is less than the end of the existing
    //      mapping, split the existing mapping at end. The newly created
    //      uvm_ext_gpu_map_t will be inserted into the tree. The overlapping
    //      portion of the existing mapping will be destroyed.
    //   3. If the existing mapping is completely overlapped by the new mapping,
    //      the existing mapping is destroyed.
    //
    // The loop cannot use any of the existing iterators because:
    //   1. It needs to be able to destroy ext_gpu_map structures. This means it
    //      can't use non-safe iterators.
    //   2. It needs to visit newly created uvm_ext_gpu_map_t, as a result of
    //      splits. This means it can't use safe iterators as they will skip the
    //      newly created uvm_ext_gpu_map_t.
    ext_map = uvm_ext_gpu_map_iter_first(va_range, gpu, start, end);
    while (ext_map) {
        if (start > ext_map->node.start) {
            status = uvm_ext_gpu_map_split(&range_tree->tree, ext_map, start - 1, &ext_map_next);
            if (status != NV_OK)
                break;
        }
        else {
            if (end < ext_map->node.end) {
                status = uvm_ext_gpu_map_split(&range_tree->tree, ext_map, end, NULL);
                if (status != NV_OK)
                    break;
                ext_map_next = NULL;
            }
            else {
                ext_map_next = uvm_ext_gpu_map_iter_next(va_range, ext_map, end);
            }

            uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_list);
        }

        ext_map = ext_map_next;
    }

    return status;
}

static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
                                                    uvm_gpu_t *mapping_gpu,
                                                    const uvm_rm_user_object_t *user_rm_mem,
                                                    const uvm_map_rm_params_t *map_rm_params,
                                                    NvU64 base,
                                                    NvU64 length,
                                                    uvm_tracker_t *out_tracker)
{
    uvm_va_space_t *va_space = va_range->va_space;
    uvm_ext_gpu_map_t *ext_gpu_map = NULL;
    uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
    UvmGpuMemoryInfo mem_info;
    uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_space, mapping_gpu);
    NvU32 mapping_page_size;
    NvU64 alignments;
    NvU32 smallest_alignment;
    NV_STATUS status;

    uvm_assert_rwsem_locked_read(&va_space->lock);

    if ((map_rm_params->compression_type == UvmGpuCompressionTypeEnabledNoPlc) && !mapping_gpu->parent->plc_supported)
        return NV_ERR_INVALID_DEVICE;

    // Check if the GPU can access the VA
    if (!uvm_gpu_can_address(mapping_gpu, base, length))
        return NV_ERR_OUT_OF_RANGE;

    uvm_mutex_lock(&range_tree->lock);

    status = uvm_unmap_external_in_range(va_range, mapping_gpu, base, base + length - 1, NULL);
    if (status != NV_OK)
        goto error;

    ext_gpu_map = uvm_kvmalloc_zero(sizeof(*ext_gpu_map));
    if (!ext_gpu_map) {
        status = NV_ERR_NO_MEMORY;
        goto error;
    }

    // Insert the ext_gpu_map into the VA range immediately since some of the
    // below calls require it to be there.
    ext_gpu_map->node.start = base;
    ext_gpu_map->node.end = base + length - 1;
    RB_CLEAR_NODE(&ext_gpu_map->node.rb_node);
    uvm_tracker_init(&ext_gpu_map->tracker);
    ext_gpu_map->mem_handle = uvm_kvmalloc_zero(sizeof(*ext_gpu_map->mem_handle));
    if (!ext_gpu_map->mem_handle) {
        status = NV_ERR_NO_MEMORY;
        goto error;
    }

    // Because any overlapping mappings were already unmapped, adding the new
    // mapping to the tree cannot fail.
    status = uvm_range_tree_add(&range_tree->tree, &ext_gpu_map->node);
    UVM_ASSERT(status == NV_OK);

    uvm_processor_mask_set_atomic(&va_range->external.mapped_gpus, mapping_gpu->id);
    ext_gpu_map->gpu = mapping_gpu;
    ext_gpu_map->mem_handle->gpu = mapping_gpu;
    nv_kref_init(&ext_gpu_map->mem_handle->ref_count);

    // Error paths after this point may call uvm_va_range_ext_gpu_map, so do a
    // sanity check now to make sure it doesn't trigger any asserts.
    UVM_ASSERT(uvm_va_range_ext_gpu_map(va_range, mapping_gpu, base) == ext_gpu_map);

    // Dup the memory. This verifies the input handles, takes a ref count on the
    // physical allocation so it can't go away under us, and returns us the
    // allocation info.
    status = uvm_rm_locked_call(nvUvmInterfaceDupMemory(uvm_gpu_device_handle(mapping_gpu),
                                                        user_rm_mem->user_client,
                                                        user_rm_mem->user_object,
                                                        &ext_gpu_map->mem_handle->rm_handle,
                                                        &mem_info));
    if (status != NV_OK) {
        UVM_DBG_PRINT("Failed to dup memory handle {0x%x, 0x%x}: %s, GPU: %s\n",
                      user_rm_mem->user_client,
                      user_rm_mem->user_object,
                      nvstatusToString(status),
                      uvm_gpu_name(mapping_gpu));
        goto error;
    }

    status = set_ext_gpu_map_location(ext_gpu_map, va_space, mapping_gpu, &mem_info);
    if (status != NV_OK)
        goto error;

    // Determine the proper mapping page size.
    // This will be the largest supported page size less than or equal to the
    // smallest of the base VA address, length, offset, and allocation page size
    // alignments.
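    // For example, if the allocation uses 2MB pages but map_offset is only
    // 64KB aligned, the mapping can use at most 64KB PTEs. The OR of the four
    // values below has its lowest set bit at the smallest of the individual
    // alignments, and alignments & ~(alignments - 1) isolates that bit.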
    alignments = mem_info.pageSize | base | length | map_rm_params->map_offset;
    smallest_alignment = alignments & ~(alignments - 1);

    // Check that alignment bits did not get truncated.
    UVM_ASSERT(smallest_alignment);

    mapping_page_size = uvm_mmu_biggest_page_size_up_to(&gpu_va_space->page_tables, smallest_alignment);
    if (!mapping_page_size) {
        status = NV_ERR_INVALID_ADDRESS;
        goto error;
    }

    mem_info.pageSize = mapping_page_size;

    status = uvm_va_range_map_rm_allocation(va_range, mapping_gpu, &mem_info, map_rm_params, ext_gpu_map, out_tracker);
    if (status != NV_OK)
        goto error;

    uvm_mutex_unlock(&range_tree->lock);
    return NV_OK;

error:
    uvm_ext_gpu_map_destroy(va_range, ext_gpu_map, NULL);
    uvm_mutex_unlock(&range_tree->lock);
    return status;
}

// Actual implementation of UvmMapExternalAllocation
static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_EXTERNAL_ALLOCATION_PARAMS *params)
{
    uvm_va_range_t *va_range = NULL;
    uvm_gpu_t *mapping_gpu;
    uvm_processor_mask_t mapped_gpus;
    NV_STATUS status = NV_OK;
    size_t i;
    uvm_map_rm_params_t map_rm_params;
    uvm_rm_user_object_t user_rm_mem =
    {
        .rm_control_fd = params->rmCtrlFd,
        .user_client   = params->hClient,
        .user_object   = params->hMemory
    };
    uvm_tracker_t tracker = UVM_TRACKER_INIT();

    if (uvm_api_range_invalid_4k(params->base, params->length))
        return NV_ERR_INVALID_ADDRESS;

    if (params->gpuAttributesCount == 0 || params->gpuAttributesCount > UVM_MAX_GPUS)
        return NV_ERR_INVALID_ARGUMENT;

    uvm_va_space_down_read_rm(va_space);
    va_range = uvm_va_range_find(va_space, params->base);

    if (!va_range ||
        va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL ||
        va_range->node.end < params->base + params->length - 1) {
        uvm_va_space_up_read_rm(va_space);
        return NV_ERR_INVALID_ADDRESS;
    }

    uvm_processor_mask_zero(&mapped_gpus);
    for (i = 0; i < params->gpuAttributesCount; i++) {
        if (uvm_api_mapping_type_invalid(params->perGpuAttributes[i].gpuMappingType) ||
            uvm_api_caching_type_invalid(params->perGpuAttributes[i].gpuCachingType) ||
            uvm_api_kind_type_invalid(params->perGpuAttributes[i].gpuFormatType,
                                      params->perGpuAttributes[i].gpuElementBits,
                                      params->perGpuAttributes[i].gpuCompressionType)) {
            status = NV_ERR_INVALID_ARGUMENT;
            goto error;
        }

        mapping_gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->perGpuAttributes[i].gpuUuid);
        if (!mapping_gpu) {
            status = NV_ERR_INVALID_DEVICE;
            goto error;
        }

        // Use a tracker to get as much parallelization as possible among GPUs,
        // so one GPU can have its PTE writes in flight while we're working on
        // the next one.
        map_rm_params.map_offset = params->offset;
        map_rm_params.mapping_type = params->perGpuAttributes[i].gpuMappingType;
        map_rm_params.caching_type = params->perGpuAttributes[i].gpuCachingType;
        map_rm_params.format_type = params->perGpuAttributes[i].gpuFormatType;
        map_rm_params.element_bits = params->perGpuAttributes[i].gpuElementBits;
        map_rm_params.compression_type = params->perGpuAttributes[i].gpuCompressionType;
        status = uvm_map_external_allocation_on_gpu(va_range,
                                                    mapping_gpu,
                                                    &user_rm_mem,
                                                    &map_rm_params,
                                                    params->base,
                                                    params->length,
                                                    &tracker);
        if (status != NV_OK)
            goto error;

        uvm_processor_mask_set(&mapped_gpus, mapping_gpu->id);
    }

    // Wait for outstanding page table operations to finish across all GPUs. We
    // just need to hold the VA space lock to prevent the GPUs on which we're
    // waiting from getting unregistered underneath us.
    status = uvm_tracker_wait_deinit(&tracker);

    uvm_va_space_up_read_rm(va_space);
    return status;

error:
    // We still have to wait for page table writes to finish, since the teardown
    // could free them.
    (void)uvm_tracker_wait_deinit(&tracker);

    // Tear down only those mappings we created during this call
    for_each_va_space_gpu_in_mask(mapping_gpu, va_space, &mapped_gpus) {
        uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
        uvm_ext_gpu_map_t *ext_map, *ext_map_next;

        uvm_mutex_lock(&range_tree->lock);
        uvm_ext_gpu_map_for_each_in_safe(ext_map,
                                         ext_map_next,
                                         va_range,
                                         mapping_gpu,
                                         params->base,
                                         params->base + params->length - 1)
            uvm_ext_gpu_map_destroy(va_range, ext_map, NULL);
        uvm_mutex_unlock(&range_tree->lock);
    }

    uvm_va_space_up_read_rm(va_space);

    return status;
}

NV_STATUS uvm_api_map_external_allocation(UVM_MAP_EXTERNAL_ALLOCATION_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    return uvm_map_external_allocation(va_space, params);
}

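// PTE maker callback for uvm_page_table_range_vec_write_ptes(): every entry in
// the range is written with the GPU's sparse PTE pattern.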
static NvU64 external_sparse_pte_maker(uvm_page_table_range_vec_t *range_vec, NvU64 offset, void *caller_data)
{
    return range_vec->tree->hal->make_sparse_pte();
}

static NV_STATUS uvm_map_external_sparse_on_gpu(uvm_va_range_t *va_range,
                                                uvm_gpu_t *mapping_gpu,
                                                NvU64 base,
                                                NvU64 length,
                                                struct list_head *deferred_free_list)
{
    uvm_va_space_t *va_space = va_range->va_space;
    uvm_ext_gpu_map_t *ext_gpu_map = NULL;
    uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
    uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_space, mapping_gpu);
    uvm_page_tree_t *page_tree;
    NV_STATUS status;

    uvm_assert_rwsem_locked(&va_space->lock);

    if (!uvm_gpu_can_address(mapping_gpu, base, length))
        return NV_ERR_OUT_OF_RANGE;

    UVM_ASSERT(gpu_va_space);

    page_tree = &gpu_va_space->page_tables;

    uvm_mutex_lock(&range_tree->lock);

    status = uvm_unmap_external_in_range(va_range, mapping_gpu, base, base + length - 1, deferred_free_list);
    if (status != NV_OK)
        goto error;

    ext_gpu_map = uvm_kvmalloc_zero(sizeof(*ext_gpu_map));
    if (!ext_gpu_map) {
        status = NV_ERR_NO_MEMORY;
        goto error;
    }

    ext_gpu_map->node.start = base;
    ext_gpu_map->node.end = base + length - 1;
    RB_CLEAR_NODE(&ext_gpu_map->node.rb_node);
    uvm_tracker_init(&ext_gpu_map->tracker);

    // Because any overlapping mappings were already unmapped, adding the new
    // mapping to the tree cannot fail.
    status = uvm_range_tree_add(&range_tree->tree, &ext_gpu_map->node);
    UVM_ASSERT(status == NV_OK);

    uvm_processor_mask_set_atomic(&va_range->external.mapped_gpus, mapping_gpu->id);
    ext_gpu_map->gpu = mapping_gpu;

    UVM_ASSERT(uvm_va_range_ext_gpu_map(va_range, mapping_gpu, base) == ext_gpu_map);

    status = uvm_page_table_range_vec_init(page_tree,
                                           ext_gpu_map->node.start,
                                           uvm_range_tree_node_size(&ext_gpu_map->node),
                                           UVM_PAGE_SIZE_64K,
                                           UVM_PMM_ALLOC_FLAGS_EVICT,
                                           &ext_gpu_map->pt_range_vec);
    if (status != NV_OK)
        goto error;

    status = uvm_page_table_range_vec_write_ptes(&ext_gpu_map->pt_range_vec,
                                                 UVM_MEMBAR_NONE,
                                                 external_sparse_pte_maker,
                                                 NULL);
    if (status != NV_OK)
        goto error;

    uvm_mutex_unlock(&range_tree->lock);
    return NV_OK;

error:
    uvm_ext_gpu_map_destroy(va_range, ext_gpu_map, NULL);
    uvm_mutex_unlock(&range_tree->lock);
    return status;
}

static NV_STATUS uvm_map_external_sparse(uvm_va_space_t *va_space, UVM_MAP_EXTERNAL_SPARSE_PARAMS *params)
{
    uvm_va_range_t *va_range = NULL;
    uvm_gpu_t *mapping_gpu = NULL;
    NV_STATUS status = NV_OK;
    LIST_HEAD(deferred_free_list);

    if (uvm_api_range_invalid_64k(params->base, params->length))
        return NV_ERR_INVALID_ADDRESS;

    uvm_va_space_down_read(va_space);
    va_range = uvm_va_range_find(va_space, params->base);
    if (!va_range ||
        va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL ||
        va_range->node.end < params->base + params->length - 1) {
        status = NV_ERR_INVALID_ADDRESS;
        goto out;
    }

    mapping_gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->gpuUuid);
    if (!mapping_gpu) {
        status = NV_ERR_INVALID_DEVICE;
        goto out;
    }

    // Sparse mappings are unsupported on GPUs prior to Pascal.
    if (!mapping_gpu->parent->sparse_mappings_supported) {
        status = NV_ERR_INVALID_DEVICE;
        goto out;
    }

    status = uvm_map_external_sparse_on_gpu(va_range, mapping_gpu, params->base, params->length, &deferred_free_list);

    if (!list_empty(&deferred_free_list))
        uvm_gpu_retain(mapping_gpu);

out:
    uvm_va_space_up_read(va_space);

    if (!list_empty(&deferred_free_list)) {
        uvm_deferred_free_object_list(&deferred_free_list);
        uvm_gpu_release(mapping_gpu);
    }

    return status;
}

NV_STATUS uvm_api_map_external_sparse(UVM_MAP_EXTERNAL_SPARSE_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    return uvm_map_external_sparse(va_space, params);
}

// Version of free which returns the owning GPU but doesn't release it
static uvm_gpu_t *uvm_ext_gpu_map_free_internal(uvm_ext_gpu_map_t *ext_gpu_map)
{
    uvm_gpu_t *owning_gpu;

    if (!ext_gpu_map)
        return NULL;

    UVM_ASSERT(!ext_gpu_map->pt_range_vec.ranges);

    if (ext_gpu_map->mem_handle)
        nv_kref_put(&ext_gpu_map->mem_handle->ref_count, uvm_release_rm_handle);

    owning_gpu = ext_gpu_map->owning_gpu;
    uvm_kvfree(ext_gpu_map);

    return owning_gpu;
}

void uvm_ext_gpu_map_free(uvm_ext_gpu_map_t *ext_gpu_map)
{
    uvm_gpu_t *owning_gpu = uvm_ext_gpu_map_free_internal(ext_gpu_map);
    if (owning_gpu)
        uvm_gpu_release(owning_gpu);
}

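// Remove ext_gpu_map from its range tree, unmap its PTEs, and either free it
// or, if deferred_free_list is provided, queue it there. Safe to call with a
// NULL or partially-initialized ext_gpu_map, as happens on the error paths of
// the mapping functions.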
void uvm_ext_gpu_map_destroy(uvm_va_range_t *va_range,
                             uvm_ext_gpu_map_t *ext_gpu_map,
                             struct list_head *deferred_free_list)
{
    uvm_membar_t membar;
    uvm_ext_gpu_range_tree_t *range_tree;
    uvm_gpu_t *mapped_gpu;

    if (!ext_gpu_map)
        return;

    (void)uvm_tracker_wait_deinit(&ext_gpu_map->tracker);

    // The external map is inserted into the tree prior to the rest of the
    // mapping steps. So, if it has not been inserted yet, there is nothing to
    // clean up. Just free the memory.
    if (RB_EMPTY_NODE(&ext_gpu_map->node.rb_node)) {
        uvm_kvfree(ext_gpu_map->mem_handle);
        uvm_kvfree(ext_gpu_map);
        return;
    }

    mapped_gpu = ext_gpu_map->gpu;

    range_tree = uvm_ext_gpu_range_tree(va_range, mapped_gpu);

    uvm_assert_mutex_locked(&range_tree->lock);
    UVM_ASSERT(uvm_gpu_va_space_get(va_range->va_space, mapped_gpu));

    uvm_range_tree_remove(&range_tree->tree, &ext_gpu_map->node);

    // Unmap the PTEs
    if (ext_gpu_map->pt_range_vec.ranges) {
        membar = va_range_downgrade_membar(va_range, ext_gpu_map);
        uvm_page_table_range_vec_clear_ptes(&ext_gpu_map->pt_range_vec, membar);
        uvm_page_table_range_vec_deinit(&ext_gpu_map->pt_range_vec);
    }

    if (deferred_free_list && ext_gpu_map->mem_handle) {
        // If this is a GPU allocation, we have to prevent that GPU from going
        // away until we've freed the handle.
        if (ext_gpu_map->owning_gpu)
            uvm_gpu_retain(ext_gpu_map->owning_gpu);

        uvm_deferred_free_object_add(deferred_free_list,
                                     &ext_gpu_map->deferred_free,
                                     UVM_DEFERRED_FREE_OBJECT_TYPE_EXTERNAL_ALLOCATION);
    }
    else {
        uvm_ext_gpu_map_free_internal(ext_gpu_map);
    }

    // Check if the sub-range tree is empty. Only then can the GPU be removed
    // from the mapped_gpus bitmap.
    if (uvm_range_tree_empty(&range_tree->tree))
        uvm_processor_mask_clear_atomic(&va_range->external.mapped_gpus, mapped_gpu->id);
}

static NV_STATUS uvm_unmap_external(uvm_va_space_t *va_space,
                                    NvU64 base,
                                    NvU64 length,
                                    const NvProcessorUuid *gpu_uuid)
{
    uvm_va_range_t *va_range;
    uvm_gpu_t *gpu = NULL;
    NV_STATUS status = NV_OK;
    uvm_ext_gpu_range_tree_t *range_tree;
    LIST_HEAD(deferred_free_list);

    if (uvm_api_range_invalid_4k(base, length))
        return NV_ERR_INVALID_ADDRESS;

    uvm_va_space_down_read(va_space);

    va_range = uvm_va_range_find(va_space, base);
    if (!va_range || va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL || base + length - 1 > va_range->node.end) {
        status = NV_ERR_INVALID_ADDRESS;
        goto out;
    }

    gpu = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid);
    if (!gpu) {
        status = NV_ERR_INVALID_DEVICE;
        goto out;
    }

    range_tree = uvm_ext_gpu_range_tree(va_range, gpu);
    uvm_mutex_lock(&range_tree->lock);
    status = uvm_unmap_external_in_range(va_range, gpu, base, base + length - 1, &deferred_free_list);
    uvm_mutex_unlock(&range_tree->lock);

    // If the deferred_free_list is not empty, retain the GPU which maps the
    // allocation because it's the parent of dup_handle. The owning GPU (if any)
    // is retained internally by the deferred free layer.
    if (!list_empty(&deferred_free_list))
        uvm_gpu_retain(gpu);

out:
    uvm_va_space_up_read(va_space);

    if (!list_empty(&deferred_free_list)) {
        uvm_deferred_free_object_list(&deferred_free_list);
        uvm_gpu_release(gpu);
    }

    return status;
}

NV_STATUS uvm_api_unmap_external(UVM_UNMAP_EXTERNAL_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    return uvm_unmap_external(va_space, params->base, params->length, &params->gpuUuid);
}

// This destroys VA ranges created by UvmMapExternalAllocation,
// UvmMapDynamicParallelismRegion, and UvmAllocSemaphorePool *only*. VA ranges
// created by UvmMemMap and UvmAlloc go through mmap/munmap.
static NV_STATUS uvm_free(uvm_va_space_t *va_space, NvU64 base, NvU64 length)
{
    uvm_va_range_t *va_range;
    NV_STATUS status = NV_OK;
    uvm_global_processor_mask_t retained_mask;
    LIST_HEAD(deferred_free_list);

    if (uvm_api_range_invalid_4k(base, length))
        return NV_ERR_INVALID_ADDRESS;

    uvm_va_space_down_write(va_space);

    // Non-managed ranges are defined to not require splitting, so a partial
    // free attempt is an error.
    //
    // TODO: Bug 1763676: The length parameter may be needed for MPS. If not, it
    //       should be removed from the ioctl.
    va_range = uvm_va_range_find(va_space, base);
    if (!va_range                                    ||
        (va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL &&
         va_range->type != UVM_VA_RANGE_TYPE_SKED_REFLECTED &&
         va_range->type != UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) ||
        va_range->node.start != base                 ||
        va_range->node.end != base + length - 1) {
        status = NV_ERR_INVALID_ADDRESS;
        goto out;
    }

    if ((va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) &&
        uvm_mem_mapped_on_cpu_user(va_range->semaphore_pool.mem)) {
        // Semaphore pools must first be unmapped from the CPU with munmap to
        // invalidate the vma.
        status = NV_ERR_INVALID_ARGUMENT;
        goto out;
    }

    if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL) {
        // External ranges may have deferred free work, so the GPUs may have to
        // be retained. Construct the mask of all the GPUs that need to be
        // retained.
        uvm_va_space_global_gpus_in_mask(va_space, &retained_mask, &va_range->external.mapped_gpus);
    }

    uvm_va_range_destroy(va_range, &deferred_free_list);

    // If there is deferred work, retain the required GPUs.
    if (!list_empty(&deferred_free_list))
        uvm_global_mask_retain(&retained_mask);

out:
    uvm_va_space_up_write(va_space);

    if (!list_empty(&deferred_free_list)) {
        UVM_ASSERT(status == NV_OK);
        uvm_deferred_free_object_list(&deferred_free_list);
        uvm_global_mask_release(&retained_mask);
    }

    return status;
}

NV_STATUS uvm_api_free(UVM_FREE_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    return uvm_free(va_space, params->base, params->length);
}