// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched/task.h>
#include "amdgpu_sync.h"
#include "amdgpu_object.h"
#include "amdgpu_vm.h"
#include "amdgpu_mn.h"
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"
#include "kfd_smi_events.h"

#ifdef dev_fmt
#undef dev_fmt
#endif
#define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__

#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1

/* Long enough to ensure no retry fault comes after the svm range is restored
 * and the page table is updated.
 */
#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING	(2UL * NSEC_PER_MSEC)

/* Giant svm ranges are split into smaller ranges based on this limit. It is
 * the minimum across all dGPUs/APUs of 1/32 of the VRAM size, clamped between
 * 2MB and 1GB and rounded down to a power of two.
 */
static uint64_t max_svm_range_pages;

struct criu_svm_metadata {
	struct list_head list;
	struct kfd_criu_svm_range_priv_data data;
};

static void svm_range_evict_svm_bo_worker(struct work_struct *work);
static bool
svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
				    const struct mmu_notifier_range *range,
				    unsigned long cur_seq);
static int
svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
		   uint64_t *bo_s, uint64_t *bo_l);
static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
	.invalidate = svm_range_cpu_invalidate_pagetables,
};

/**
 * svm_range_unlink - unlink svm_range from lists and interval tree
 * @prange: svm range structure to be removed
 *
 * Remove the svm_range from the svms and svm_bo lists and the svms
 * interval tree.
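 * If the range was never inserted into the interval tree (it_node.start and
 * it_node.last are both still zero), only the list removal is performed.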
78 * 79 * Context: The caller must hold svms->lock 80 */ 81 static void svm_range_unlink(struct svm_range *prange) 82 { 83 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 84 prange, prange->start, prange->last); 85 86 if (prange->svm_bo) { 87 spin_lock(&prange->svm_bo->list_lock); 88 list_del(&prange->svm_bo_list); 89 spin_unlock(&prange->svm_bo->list_lock); 90 } 91 92 list_del(&prange->list); 93 if (prange->it_node.start != 0 && prange->it_node.last != 0) 94 interval_tree_remove(&prange->it_node, &prange->svms->objects); 95 } 96 97 static void 98 svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange) 99 { 100 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 101 prange, prange->start, prange->last); 102 103 mmu_interval_notifier_insert_locked(&prange->notifier, mm, 104 prange->start << PAGE_SHIFT, 105 prange->npages << PAGE_SHIFT, 106 &svm_range_mn_ops); 107 } 108 109 /** 110 * svm_range_add_to_svms - add svm range to svms 111 * @prange: svm range structure to be added 112 * 113 * Add the svm range to svms interval tree and link list 114 * 115 * Context: The caller must hold svms->lock 116 */ 117 static void svm_range_add_to_svms(struct svm_range *prange) 118 { 119 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, 120 prange, prange->start, prange->last); 121 122 list_move_tail(&prange->list, &prange->svms->list); 123 prange->it_node.start = prange->start; 124 prange->it_node.last = prange->last; 125 interval_tree_insert(&prange->it_node, &prange->svms->objects); 126 } 127 128 static void svm_range_remove_notifier(struct svm_range *prange) 129 { 130 pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", 131 prange->svms, prange, 132 prange->notifier.interval_tree.start >> PAGE_SHIFT, 133 prange->notifier.interval_tree.last >> PAGE_SHIFT); 134 135 if (prange->notifier.interval_tree.start != 0 && 136 prange->notifier.interval_tree.last != 0) 137 mmu_interval_notifier_remove(&prange->notifier); 138 } 139 140 static bool 141 svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr) 142 { 143 return dma_addr && !dma_mapping_error(dev, dma_addr) && 144 !(dma_addr & SVM_RANGE_VRAM_DOMAIN); 145 } 146 147 static int 148 svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, 149 unsigned long offset, unsigned long npages, 150 unsigned long *hmm_pfns, uint32_t gpuidx) 151 { 152 enum dma_data_direction dir = DMA_BIDIRECTIONAL; 153 dma_addr_t *addr = prange->dma_addr[gpuidx]; 154 struct device *dev = adev->dev; 155 struct page *page; 156 int i, r; 157 158 if (!addr) { 159 addr = kvcalloc(prange->npages, sizeof(*addr), GFP_KERNEL); 160 if (!addr) 161 return -ENOMEM; 162 prange->dma_addr[gpuidx] = addr; 163 } 164 165 addr += offset; 166 for (i = 0; i < npages; i++) { 167 if (svm_is_valid_dma_mapping_addr(dev, addr[i])) 168 dma_unmap_page(dev, addr[i], PAGE_SIZE, dir); 169 170 page = hmm_pfn_to_page(hmm_pfns[i]); 171 if (is_zone_device_page(page)) { 172 struct amdgpu_device *bo_adev = 173 amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 174 175 addr[i] = (hmm_pfns[i] << PAGE_SHIFT) + 176 bo_adev->vm_manager.vram_base_offset - 177 bo_adev->kfd.dev->pgmap.range.start; 178 addr[i] |= SVM_RANGE_VRAM_DOMAIN; 179 pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]); 180 continue; 181 } 182 addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir); 183 r = dma_mapping_error(dev, addr[i]); 184 if (r) { 185 dev_err(dev, "failed %d dma_map_page\n", r); 186 return r; 187 } 188 pr_debug_ratelimited("dma 
mapping 0x%llx for page addr 0x%lx\n", 189 addr[i] >> PAGE_SHIFT, page_to_pfn(page)); 190 } 191 return 0; 192 } 193 194 static int 195 svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, 196 unsigned long offset, unsigned long npages, 197 unsigned long *hmm_pfns) 198 { 199 struct kfd_process *p; 200 uint32_t gpuidx; 201 int r; 202 203 p = container_of(prange->svms, struct kfd_process, svms); 204 205 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 206 struct kfd_process_device *pdd; 207 208 pr_debug("mapping to gpu idx 0x%x\n", gpuidx); 209 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 210 if (!pdd) { 211 pr_debug("failed to find device idx %d\n", gpuidx); 212 return -EINVAL; 213 } 214 215 r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages, 216 hmm_pfns, gpuidx); 217 if (r) 218 break; 219 } 220 221 return r; 222 } 223 224 void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, 225 unsigned long offset, unsigned long npages) 226 { 227 enum dma_data_direction dir = DMA_BIDIRECTIONAL; 228 int i; 229 230 if (!dma_addr) 231 return; 232 233 for (i = offset; i < offset + npages; i++) { 234 if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i])) 235 continue; 236 pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); 237 dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir); 238 dma_addr[i] = 0; 239 } 240 } 241 242 void svm_range_free_dma_mappings(struct svm_range *prange) 243 { 244 struct kfd_process_device *pdd; 245 dma_addr_t *dma_addr; 246 struct device *dev; 247 struct kfd_process *p; 248 uint32_t gpuidx; 249 250 p = container_of(prange->svms, struct kfd_process, svms); 251 252 for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { 253 dma_addr = prange->dma_addr[gpuidx]; 254 if (!dma_addr) 255 continue; 256 257 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 258 if (!pdd) { 259 pr_debug("failed to find device idx %d\n", gpuidx); 260 continue; 261 } 262 dev = &pdd->dev->pdev->dev; 263 svm_range_dma_unmap(dev, dma_addr, 0, prange->npages); 264 kvfree(dma_addr); 265 prange->dma_addr[gpuidx] = NULL; 266 } 267 } 268 269 static void svm_range_free(struct svm_range *prange, bool update_mem_usage) 270 { 271 uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT; 272 struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); 273 274 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange, 275 prange->start, prange->last); 276 277 svm_range_vram_node_free(prange); 278 svm_range_free_dma_mappings(prange); 279 280 if (update_mem_usage && !p->xnack_enabled) { 281 pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size); 282 amdgpu_amdkfd_unreserve_mem_limit(NULL, size, 283 KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); 284 } 285 mutex_destroy(&prange->lock); 286 mutex_destroy(&prange->migrate_mutex); 287 kfree(prange); 288 } 289 290 static void 291 svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, 292 uint8_t *granularity, uint32_t *flags) 293 { 294 *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 295 *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 296 *granularity = 9; 297 *flags = 298 KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; 299 } 300 301 static struct 302 svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, 303 uint64_t last, bool update_mem_usage) 304 { 305 uint64_t size = last - start + 1; 306 struct svm_range *prange; 307 struct kfd_process *p; 308 309 prange = kzalloc(sizeof(*prange), GFP_KERNEL); 310 if (!prange) 311 return NULL; 312 313 p = 
container_of(svms, struct kfd_process, svms); 314 if (!p->xnack_enabled && update_mem_usage && 315 amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT, 316 KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)) { 317 pr_info("SVM mapping failed, exceeds resident system memory limit\n"); 318 kfree(prange); 319 return NULL; 320 } 321 prange->npages = size; 322 prange->svms = svms; 323 prange->start = start; 324 prange->last = last; 325 INIT_LIST_HEAD(&prange->list); 326 INIT_LIST_HEAD(&prange->update_list); 327 INIT_LIST_HEAD(&prange->svm_bo_list); 328 INIT_LIST_HEAD(&prange->deferred_list); 329 INIT_LIST_HEAD(&prange->child_list); 330 atomic_set(&prange->invalid, 0); 331 prange->validate_timestamp = 0; 332 mutex_init(&prange->migrate_mutex); 333 mutex_init(&prange->lock); 334 335 if (p->xnack_enabled) 336 bitmap_copy(prange->bitmap_access, svms->bitmap_supported, 337 MAX_GPU_INSTANCE); 338 339 svm_range_set_default_attributes(&prange->preferred_loc, 340 &prange->prefetch_loc, 341 &prange->granularity, &prange->flags); 342 343 pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last); 344 345 return prange; 346 } 347 348 static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo) 349 { 350 if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref)) 351 return false; 352 353 return true; 354 } 355 356 static void svm_range_bo_release(struct kref *kref) 357 { 358 struct svm_range_bo *svm_bo; 359 360 svm_bo = container_of(kref, struct svm_range_bo, kref); 361 pr_debug("svm_bo 0x%p\n", svm_bo); 362 363 spin_lock(&svm_bo->list_lock); 364 while (!list_empty(&svm_bo->range_list)) { 365 struct svm_range *prange = 366 list_first_entry(&svm_bo->range_list, 367 struct svm_range, svm_bo_list); 368 /* list_del_init tells a concurrent svm_range_vram_node_new when 369 * it's safe to reuse the svm_bo pointer and svm_bo_list head. 370 */ 371 list_del_init(&prange->svm_bo_list); 372 spin_unlock(&svm_bo->list_lock); 373 374 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 375 prange->start, prange->last); 376 mutex_lock(&prange->lock); 377 prange->svm_bo = NULL; 378 mutex_unlock(&prange->lock); 379 380 spin_lock(&svm_bo->list_lock); 381 } 382 spin_unlock(&svm_bo->list_lock); 383 if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) { 384 /* We're not in the eviction worker. 385 * Signal the fence and synchronize with any 386 * pending eviction work. 
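		 * cancel_work_sync() below also guarantees that
		 * svm_range_evict_svm_bo_worker is no longer running before
		 * the fence and BO are released.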
387 */ 388 dma_fence_signal(&svm_bo->eviction_fence->base); 389 cancel_work_sync(&svm_bo->eviction_work); 390 } 391 dma_fence_put(&svm_bo->eviction_fence->base); 392 amdgpu_bo_unref(&svm_bo->bo); 393 kfree(svm_bo); 394 } 395 396 static void svm_range_bo_wq_release(struct work_struct *work) 397 { 398 struct svm_range_bo *svm_bo; 399 400 svm_bo = container_of(work, struct svm_range_bo, release_work); 401 svm_range_bo_release(&svm_bo->kref); 402 } 403 404 static void svm_range_bo_release_async(struct kref *kref) 405 { 406 struct svm_range_bo *svm_bo; 407 408 svm_bo = container_of(kref, struct svm_range_bo, kref); 409 pr_debug("svm_bo 0x%p\n", svm_bo); 410 INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release); 411 schedule_work(&svm_bo->release_work); 412 } 413 414 void svm_range_bo_unref_async(struct svm_range_bo *svm_bo) 415 { 416 kref_put(&svm_bo->kref, svm_range_bo_release_async); 417 } 418 419 static void svm_range_bo_unref(struct svm_range_bo *svm_bo) 420 { 421 if (svm_bo) 422 kref_put(&svm_bo->kref, svm_range_bo_release); 423 } 424 425 static bool 426 svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange) 427 { 428 struct amdgpu_device *bo_adev; 429 430 mutex_lock(&prange->lock); 431 if (!prange->svm_bo) { 432 mutex_unlock(&prange->lock); 433 return false; 434 } 435 if (prange->ttm_res) { 436 /* We still have a reference, all is well */ 437 mutex_unlock(&prange->lock); 438 return true; 439 } 440 if (svm_bo_ref_unless_zero(prange->svm_bo)) { 441 /* 442 * Migrate from GPU to GPU, remove range from source bo_adev 443 * svm_bo range list, and return false to allocate svm_bo from 444 * destination adev. 445 */ 446 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 447 if (bo_adev != adev) { 448 mutex_unlock(&prange->lock); 449 450 spin_lock(&prange->svm_bo->list_lock); 451 list_del_init(&prange->svm_bo_list); 452 spin_unlock(&prange->svm_bo->list_lock); 453 454 svm_range_bo_unref(prange->svm_bo); 455 return false; 456 } 457 if (READ_ONCE(prange->svm_bo->evicting)) { 458 struct dma_fence *f; 459 struct svm_range_bo *svm_bo; 460 /* The BO is getting evicted, 461 * we need to get a new one 462 */ 463 mutex_unlock(&prange->lock); 464 svm_bo = prange->svm_bo; 465 f = dma_fence_get(&svm_bo->eviction_fence->base); 466 svm_range_bo_unref(prange->svm_bo); 467 /* wait for the fence to avoid long spin-loop 468 * at list_empty_careful 469 */ 470 dma_fence_wait(f, false); 471 dma_fence_put(f); 472 } else { 473 /* The BO was still around and we got 474 * a new reference to it 475 */ 476 mutex_unlock(&prange->lock); 477 pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n", 478 prange->svms, prange->start, prange->last); 479 480 prange->ttm_res = prange->svm_bo->bo->tbo.resource; 481 return true; 482 } 483 484 } else { 485 mutex_unlock(&prange->lock); 486 } 487 488 /* We need a new svm_bo. Spin-loop to wait for concurrent 489 * svm_range_bo_release to finish removing this range from 490 * its range list and set prange->svm_bo to null. After this, 491 * it is safe to reuse the svm_bo pointer and svm_bo_list head. 
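	 * svm_range_bo_release clears prange->svm_bo under prange->lock and
	 * removes the range with list_del_init, so both conditions checked
	 * in the loop below eventually become false.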
492 */ 493 while (!list_empty_careful(&prange->svm_bo_list) || prange->svm_bo) 494 cond_resched(); 495 496 return false; 497 } 498 499 static struct svm_range_bo *svm_range_bo_new(void) 500 { 501 struct svm_range_bo *svm_bo; 502 503 svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL); 504 if (!svm_bo) 505 return NULL; 506 507 kref_init(&svm_bo->kref); 508 INIT_LIST_HEAD(&svm_bo->range_list); 509 spin_lock_init(&svm_bo->list_lock); 510 511 return svm_bo; 512 } 513 514 int 515 svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange, 516 bool clear) 517 { 518 struct amdgpu_bo_param bp; 519 struct svm_range_bo *svm_bo; 520 struct amdgpu_bo_user *ubo; 521 struct amdgpu_bo *bo; 522 struct kfd_process *p; 523 struct mm_struct *mm; 524 int r; 525 526 p = container_of(prange->svms, struct kfd_process, svms); 527 pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms, 528 prange->start, prange->last); 529 530 if (svm_range_validate_svm_bo(adev, prange)) 531 return 0; 532 533 svm_bo = svm_range_bo_new(); 534 if (!svm_bo) { 535 pr_debug("failed to alloc svm bo\n"); 536 return -ENOMEM; 537 } 538 mm = get_task_mm(p->lead_thread); 539 if (!mm) { 540 pr_debug("failed to get mm\n"); 541 kfree(svm_bo); 542 return -ESRCH; 543 } 544 svm_bo->eviction_fence = 545 amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), 546 mm, 547 svm_bo); 548 mmput(mm); 549 INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker); 550 svm_bo->evicting = 0; 551 memset(&bp, 0, sizeof(bp)); 552 bp.size = prange->npages * PAGE_SIZE; 553 bp.byte_align = PAGE_SIZE; 554 bp.domain = AMDGPU_GEM_DOMAIN_VRAM; 555 bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 556 bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0; 557 bp.flags |= AMDGPU_GEM_CREATE_DISCARDABLE; 558 bp.type = ttm_bo_type_device; 559 bp.resv = NULL; 560 561 r = amdgpu_bo_create_user(adev, &bp, &ubo); 562 if (r) { 563 pr_debug("failed %d to create bo\n", r); 564 goto create_bo_failed; 565 } 566 bo = &ubo->bo; 567 r = amdgpu_bo_reserve(bo, true); 568 if (r) { 569 pr_debug("failed %d to reserve bo\n", r); 570 goto reserve_bo_failed; 571 } 572 573 if (clear) { 574 r = amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); 575 if (r) { 576 pr_debug("failed %d to sync bo\n", r); 577 amdgpu_bo_unreserve(bo); 578 goto reserve_bo_failed; 579 } 580 } 581 582 r = dma_resv_reserve_fences(bo->tbo.base.resv, 1); 583 if (r) { 584 pr_debug("failed %d to reserve bo\n", r); 585 amdgpu_bo_unreserve(bo); 586 goto reserve_bo_failed; 587 } 588 amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true); 589 590 amdgpu_bo_unreserve(bo); 591 592 svm_bo->bo = bo; 593 prange->svm_bo = svm_bo; 594 prange->ttm_res = bo->tbo.resource; 595 prange->offset = 0; 596 597 spin_lock(&svm_bo->list_lock); 598 list_add(&prange->svm_bo_list, &svm_bo->range_list); 599 spin_unlock(&svm_bo->list_lock); 600 601 return 0; 602 603 reserve_bo_failed: 604 amdgpu_bo_unref(&bo); 605 create_bo_failed: 606 dma_fence_put(&svm_bo->eviction_fence->base); 607 kfree(svm_bo); 608 prange->ttm_res = NULL; 609 610 return r; 611 } 612 613 void svm_range_vram_node_free(struct svm_range *prange) 614 { 615 /* serialize prange->svm_bo unref */ 616 mutex_lock(&prange->lock); 617 /* prange->svm_bo has not been unref */ 618 if (prange->ttm_res) { 619 prange->ttm_res = NULL; 620 mutex_unlock(&prange->lock); 621 svm_range_bo_unref(prange->svm_bo); 622 } else 623 mutex_unlock(&prange->lock); 624 } 625 626 struct amdgpu_device * 627 svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id) 628 { 629 
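	/* gpu_id is the user-visible GPU id from the ioctl; translate it to
	 * the per-process gpuidx before looking up the process device.
	 */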
struct kfd_process_device *pdd; 630 struct kfd_process *p; 631 int32_t gpu_idx; 632 633 p = container_of(prange->svms, struct kfd_process, svms); 634 635 gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id); 636 if (gpu_idx < 0) { 637 pr_debug("failed to get device by id 0x%x\n", gpu_id); 638 return NULL; 639 } 640 pdd = kfd_process_device_from_gpuidx(p, gpu_idx); 641 if (!pdd) { 642 pr_debug("failed to get device by idx 0x%x\n", gpu_idx); 643 return NULL; 644 } 645 646 return pdd->dev->adev; 647 } 648 649 struct kfd_process_device * 650 svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev) 651 { 652 struct kfd_process *p; 653 int32_t gpu_idx, gpuid; 654 int r; 655 656 p = container_of(prange->svms, struct kfd_process, svms); 657 658 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpu_idx); 659 if (r) { 660 pr_debug("failed to get device id by adev %p\n", adev); 661 return NULL; 662 } 663 664 return kfd_process_device_from_gpuidx(p, gpu_idx); 665 } 666 667 static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo) 668 { 669 struct ttm_operation_ctx ctx = { false, false }; 670 671 amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM); 672 673 return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 674 } 675 676 static int 677 svm_range_check_attr(struct kfd_process *p, 678 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 679 { 680 uint32_t i; 681 682 for (i = 0; i < nattr; i++) { 683 uint32_t val = attrs[i].value; 684 int gpuidx = MAX_GPU_INSTANCE; 685 686 switch (attrs[i].type) { 687 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 688 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM && 689 val != KFD_IOCTL_SVM_LOCATION_UNDEFINED) 690 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 691 break; 692 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 693 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM) 694 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 695 break; 696 case KFD_IOCTL_SVM_ATTR_ACCESS: 697 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 698 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 699 gpuidx = kfd_process_gpuidx_from_gpuid(p, val); 700 break; 701 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 702 break; 703 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 704 break; 705 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 706 break; 707 default: 708 pr_debug("unknown attr type 0x%x\n", attrs[i].type); 709 return -EINVAL; 710 } 711 712 if (gpuidx < 0) { 713 pr_debug("no GPU 0x%x found\n", val); 714 return -EINVAL; 715 } else if (gpuidx < MAX_GPU_INSTANCE && 716 !test_bit(gpuidx, p->svms.bitmap_supported)) { 717 pr_debug("GPU 0x%x not supported\n", val); 718 return -EINVAL; 719 } 720 } 721 722 return 0; 723 } 724 725 static void 726 svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, 727 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs, 728 bool *update_mapping) 729 { 730 uint32_t i; 731 int gpuidx; 732 733 for (i = 0; i < nattr; i++) { 734 switch (attrs[i].type) { 735 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 736 prange->preferred_loc = attrs[i].value; 737 break; 738 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 739 prange->prefetch_loc = attrs[i].value; 740 break; 741 case KFD_IOCTL_SVM_ATTR_ACCESS: 742 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 743 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 744 *update_mapping = true; 745 gpuidx = kfd_process_gpuidx_from_gpuid(p, 746 attrs[i].value); 747 if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { 748 bitmap_clear(prange->bitmap_access, gpuidx, 1); 749 bitmap_clear(prange->bitmap_aip, gpuidx, 1); 750 } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { 751 
bitmap_set(prange->bitmap_access, gpuidx, 1); 752 bitmap_clear(prange->bitmap_aip, gpuidx, 1); 753 } else { 754 bitmap_clear(prange->bitmap_access, gpuidx, 1); 755 bitmap_set(prange->bitmap_aip, gpuidx, 1); 756 } 757 break; 758 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 759 *update_mapping = true; 760 prange->flags |= attrs[i].value; 761 break; 762 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 763 *update_mapping = true; 764 prange->flags &= ~attrs[i].value; 765 break; 766 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 767 prange->granularity = min_t(uint32_t, attrs[i].value, 0x3F); 768 break; 769 default: 770 WARN_ONCE(1, "svm_range_check_attrs wasn't called?"); 771 } 772 } 773 } 774 775 static bool 776 svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange, 777 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) 778 { 779 uint32_t i; 780 int gpuidx; 781 782 for (i = 0; i < nattr; i++) { 783 switch (attrs[i].type) { 784 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 785 if (prange->preferred_loc != attrs[i].value) 786 return false; 787 break; 788 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 789 /* Prefetch should always trigger a migration even 790 * if the value of the attribute didn't change. 791 */ 792 return false; 793 case KFD_IOCTL_SVM_ATTR_ACCESS: 794 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 795 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 796 gpuidx = kfd_process_gpuidx_from_gpuid(p, 797 attrs[i].value); 798 if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { 799 if (test_bit(gpuidx, prange->bitmap_access) || 800 test_bit(gpuidx, prange->bitmap_aip)) 801 return false; 802 } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { 803 if (!test_bit(gpuidx, prange->bitmap_access)) 804 return false; 805 } else { 806 if (!test_bit(gpuidx, prange->bitmap_aip)) 807 return false; 808 } 809 break; 810 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 811 if ((prange->flags & attrs[i].value) != attrs[i].value) 812 return false; 813 break; 814 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 815 if ((prange->flags & attrs[i].value) != 0) 816 return false; 817 break; 818 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 819 if (prange->granularity != attrs[i].value) 820 return false; 821 break; 822 default: 823 WARN_ONCE(1, "svm_range_check_attrs wasn't called?"); 824 } 825 } 826 827 return true; 828 } 829 830 /** 831 * svm_range_debug_dump - print all range information from svms 832 * @svms: svm range list header 833 * 834 * debug output svm range start, end, prefetch location from svms 835 * interval tree and link list 836 * 837 * Context: The caller must hold svms->lock 838 */ 839 static void svm_range_debug_dump(struct svm_range_list *svms) 840 { 841 struct interval_tree_node *node; 842 struct svm_range *prange; 843 844 pr_debug("dump svms 0x%p list\n", svms); 845 pr_debug("range\tstart\tpage\tend\t\tlocation\n"); 846 847 list_for_each_entry(prange, &svms->list, list) { 848 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", 849 prange, prange->start, prange->npages, 850 prange->start + prange->npages - 1, 851 prange->actual_loc); 852 } 853 854 pr_debug("dump svms 0x%p interval tree\n", svms); 855 pr_debug("range\tstart\tpage\tend\t\tlocation\n"); 856 node = interval_tree_iter_first(&svms->objects, 0, ~0ULL); 857 while (node) { 858 prange = container_of(node, struct svm_range, it_node); 859 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", 860 prange, prange->start, prange->npages, 861 prange->start + prange->npages - 1, 862 prange->actual_loc); 863 node = interval_tree_iter_next(node, 0, ~0ULL); 864 } 865 } 866 867 static int 868 svm_range_split_array(void *ppnew, void 
*ppold, size_t size, 869 uint64_t old_start, uint64_t old_n, 870 uint64_t new_start, uint64_t new_n) 871 { 872 unsigned char *new, *old, *pold; 873 uint64_t d; 874 875 if (!ppold) 876 return 0; 877 pold = *(unsigned char **)ppold; 878 if (!pold) 879 return 0; 880 881 new = kvmalloc_array(new_n, size, GFP_KERNEL); 882 if (!new) 883 return -ENOMEM; 884 885 d = (new_start - old_start) * size; 886 memcpy(new, pold + d, new_n * size); 887 888 old = kvmalloc_array(old_n, size, GFP_KERNEL); 889 if (!old) { 890 kvfree(new); 891 return -ENOMEM; 892 } 893 894 d = (new_start == old_start) ? new_n * size : 0; 895 memcpy(old, pold + d, old_n * size); 896 897 kvfree(pold); 898 *(void **)ppold = old; 899 *(void **)ppnew = new; 900 901 return 0; 902 } 903 904 static int 905 svm_range_split_pages(struct svm_range *new, struct svm_range *old, 906 uint64_t start, uint64_t last) 907 { 908 uint64_t npages = last - start + 1; 909 int i, r; 910 911 for (i = 0; i < MAX_GPU_INSTANCE; i++) { 912 r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i], 913 sizeof(*old->dma_addr[i]), old->start, 914 npages, new->start, new->npages); 915 if (r) 916 return r; 917 } 918 919 return 0; 920 } 921 922 static int 923 svm_range_split_nodes(struct svm_range *new, struct svm_range *old, 924 uint64_t start, uint64_t last) 925 { 926 uint64_t npages = last - start + 1; 927 928 pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n", 929 new->svms, new, new->start, start, last); 930 931 if (new->start == old->start) { 932 new->offset = old->offset; 933 old->offset += new->npages; 934 } else { 935 new->offset = old->offset + npages; 936 } 937 938 new->svm_bo = svm_range_bo_ref(old->svm_bo); 939 new->ttm_res = old->ttm_res; 940 941 spin_lock(&new->svm_bo->list_lock); 942 list_add(&new->svm_bo_list, &new->svm_bo->range_list); 943 spin_unlock(&new->svm_bo->list_lock); 944 945 return 0; 946 } 947 948 /** 949 * svm_range_split_adjust - split range and adjust 950 * 951 * @new: new range 952 * @old: the old range 953 * @start: the old range adjust to start address in pages 954 * @last: the old range adjust to last address in pages 955 * 956 * Copy system memory dma_addr or vram ttm_res in old range to new 957 * range from new_start up to size new->npages, the remaining old range is from 958 * start to last 959 * 960 * Return: 961 * 0 - OK, -ENOMEM - out of memory 962 */ 963 static int 964 svm_range_split_adjust(struct svm_range *new, struct svm_range *old, 965 uint64_t start, uint64_t last) 966 { 967 int r; 968 969 pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n", 970 new->svms, new->start, old->start, old->last, start, last); 971 972 if (new->start < old->start || 973 new->last > old->last) { 974 WARN_ONCE(1, "invalid new range start or last\n"); 975 return -EINVAL; 976 } 977 978 r = svm_range_split_pages(new, old, start, last); 979 if (r) 980 return r; 981 982 if (old->actual_loc && old->ttm_res) { 983 r = svm_range_split_nodes(new, old, start, last); 984 if (r) 985 return r; 986 } 987 988 old->npages = last - start + 1; 989 old->start = start; 990 old->last = last; 991 new->flags = old->flags; 992 new->preferred_loc = old->preferred_loc; 993 new->prefetch_loc = old->prefetch_loc; 994 new->actual_loc = old->actual_loc; 995 new->granularity = old->granularity; 996 new->mapped_to_gpu = old->mapped_to_gpu; 997 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); 998 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); 999 1000 return 0; 1001 } 1002 1003 /** 1004 * 
svm_range_split - split a range in 2 ranges 1005 * 1006 * @prange: the svm range to split 1007 * @start: the remaining range start address in pages 1008 * @last: the remaining range last address in pages 1009 * @new: the result new range generated 1010 * 1011 * Two cases only: 1012 * case 1: if start == prange->start 1013 * prange ==> prange[start, last] 1014 * new range [last + 1, prange->last] 1015 * 1016 * case 2: if last == prange->last 1017 * prange ==> prange[start, last] 1018 * new range [prange->start, start - 1] 1019 * 1020 * Return: 1021 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last 1022 */ 1023 static int 1024 svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last, 1025 struct svm_range **new) 1026 { 1027 uint64_t old_start = prange->start; 1028 uint64_t old_last = prange->last; 1029 struct svm_range_list *svms; 1030 int r = 0; 1031 1032 pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms, 1033 old_start, old_last, start, last); 1034 1035 if (old_start != start && old_last != last) 1036 return -EINVAL; 1037 if (start < old_start || last > old_last) 1038 return -EINVAL; 1039 1040 svms = prange->svms; 1041 if (old_start == start) 1042 *new = svm_range_new(svms, last + 1, old_last, false); 1043 else 1044 *new = svm_range_new(svms, old_start, start - 1, false); 1045 if (!*new) 1046 return -ENOMEM; 1047 1048 r = svm_range_split_adjust(*new, prange, start, last); 1049 if (r) { 1050 pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", 1051 r, old_start, old_last, start, last); 1052 svm_range_free(*new, false); 1053 *new = NULL; 1054 } 1055 1056 return r; 1057 } 1058 1059 static int 1060 svm_range_split_tail(struct svm_range *prange, 1061 uint64_t new_last, struct list_head *insert_list) 1062 { 1063 struct svm_range *tail; 1064 int r = svm_range_split(prange, prange->start, new_last, &tail); 1065 1066 if (!r) 1067 list_add(&tail->list, insert_list); 1068 return r; 1069 } 1070 1071 static int 1072 svm_range_split_head(struct svm_range *prange, 1073 uint64_t new_start, struct list_head *insert_list) 1074 { 1075 struct svm_range *head; 1076 int r = svm_range_split(prange, new_start, prange->last, &head); 1077 1078 if (!r) 1079 list_add(&head->list, insert_list); 1080 return r; 1081 } 1082 1083 static void 1084 svm_range_add_child(struct svm_range *prange, struct mm_struct *mm, 1085 struct svm_range *pchild, enum svm_work_list_ops op) 1086 { 1087 pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n", 1088 pchild, pchild->start, pchild->last, prange, op); 1089 1090 pchild->work_item.mm = mm; 1091 pchild->work_item.op = op; 1092 list_add_tail(&pchild->child_list, &prange->child_list); 1093 } 1094 1095 /** 1096 * svm_range_split_by_granularity - collect ranges within granularity boundary 1097 * 1098 * @p: the process with svms list 1099 * @mm: mm structure 1100 * @addr: the vm fault address in pages, to split the prange 1101 * @parent: parent range if prange is from child list 1102 * @prange: prange to split 1103 * 1104 * Trims @prange to be a single aligned block of prange->granularity if 1105 * possible. The head and tail are added to the child_list in @parent. 
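 *
 * For example, with the default granularity of 9 (512 pages, i.e. 2MB), a
 * fault at page 0x1234 inside prange [0x1000 0x1fff] trims prange to
 * [0x1200 0x13ff] and adds [0x1000 0x11ff] and [0x1400 0x1fff] to the
 * child_list of @parent (illustrative page numbers).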
 *
 * Context: caller must hold mmap_read_lock and prange->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
int
svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
			       unsigned long addr, struct svm_range *parent,
			       struct svm_range *prange)
{
	struct svm_range *head, *tail;
	unsigned long start, last, size;
	int r;

	/* Align the split range start and size to the granularity size, so a
	 * single PTE can be used for the whole range; this reduces the number
	 * of PTEs updated and the L1 TLB space used for translation.
	 */
	size = 1UL << prange->granularity;
	start = ALIGN_DOWN(addr, size);
	last = ALIGN(addr + 1, size) - 1;

	pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n",
		 prange->svms, prange->start, prange->last, start, last, size);

	if (start > prange->start) {
		r = svm_range_split(prange, start, prange->last, &head);
		if (r)
			return r;
		svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE);
	}

	if (last < prange->last) {
		r = svm_range_split(prange, prange->start, last, &tail);
		if (r)
			return r;
		svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
	}

	/* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
	if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) {
		prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP;
		pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n",
			 prange, prange->start, prange->last,
			 SVM_OP_ADD_RANGE_AND_MAP);
	}
	return 0;
}

static uint64_t
svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange,
			int domain)
{
	struct amdgpu_device *bo_adev;
	uint32_t flags = prange->flags;
	uint32_t mapping_flags = 0;
	uint64_t pte_flags;
	bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
	bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;

	if (domain == SVM_RANGE_VRAM_DOMAIN)
		bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);

	switch (KFD_GC_VERSION(adev->kfd.dev)) {
	case IP_VERSION(9, 4, 1):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_adev == adev) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (amdgpu_xgmi_same_hive(adev, bo_adev))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	case IP_VERSION(9, 4, 2):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_adev == adev) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
				if (adev->gmc.xgmi.connected_to_cpu)
					snoop = true;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (amdgpu_xgmi_same_hive(adev, bo_adev))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	default:
		mapping_flags |= coherent ?
1207 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; 1208 } 1209 1210 mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE; 1211 1212 if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO) 1213 mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE; 1214 if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC) 1215 mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; 1216 1217 pte_flags = AMDGPU_PTE_VALID; 1218 pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM; 1219 pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0; 1220 1221 pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags); 1222 return pte_flags; 1223 } 1224 1225 static int 1226 svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, 1227 uint64_t start, uint64_t last, 1228 struct dma_fence **fence) 1229 { 1230 uint64_t init_pte_value = 0; 1231 1232 pr_debug("[0x%llx 0x%llx]\n", start, last); 1233 1234 return amdgpu_vm_update_range(adev, vm, false, true, true, NULL, start, 1235 last, init_pte_value, 0, 0, NULL, NULL, 1236 fence); 1237 } 1238 1239 static int 1240 svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, 1241 unsigned long last, uint32_t trigger) 1242 { 1243 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); 1244 struct kfd_process_device *pdd; 1245 struct dma_fence *fence = NULL; 1246 struct kfd_process *p; 1247 uint32_t gpuidx; 1248 int r = 0; 1249 1250 if (!prange->mapped_to_gpu) { 1251 pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n", 1252 prange, prange->start, prange->last); 1253 return 0; 1254 } 1255 1256 if (prange->start == start && prange->last == last) { 1257 pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange); 1258 prange->mapped_to_gpu = false; 1259 } 1260 1261 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, 1262 MAX_GPU_INSTANCE); 1263 p = container_of(prange->svms, struct kfd_process, svms); 1264 1265 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 1266 pr_debug("unmap from gpu idx 0x%x\n", gpuidx); 1267 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 1268 if (!pdd) { 1269 pr_debug("failed to find device idx %d\n", gpuidx); 1270 return -EINVAL; 1271 } 1272 1273 kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid, 1274 start, last, trigger); 1275 1276 r = svm_range_unmap_from_gpu(pdd->dev->adev, 1277 drm_priv_to_vm(pdd->drm_priv), 1278 start, last, &fence); 1279 if (r) 1280 break; 1281 1282 if (fence) { 1283 r = dma_fence_wait(fence, false); 1284 dma_fence_put(fence); 1285 fence = NULL; 1286 if (r) 1287 break; 1288 } 1289 kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT); 1290 } 1291 1292 return r; 1293 } 1294 1295 static int 1296 svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange, 1297 unsigned long offset, unsigned long npages, bool readonly, 1298 dma_addr_t *dma_addr, struct amdgpu_device *bo_adev, 1299 struct dma_fence **fence, bool flush_tlb) 1300 { 1301 struct amdgpu_device *adev = pdd->dev->adev; 1302 struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv); 1303 uint64_t pte_flags; 1304 unsigned long last_start; 1305 int last_domain; 1306 int r = 0; 1307 int64_t i, j; 1308 1309 last_start = prange->start + offset; 1310 1311 pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms, 1312 last_start, last_start + npages - 1, readonly); 1313 1314 for (i = offset; i < offset + npages; i++) { 1315 last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN; 1316 dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN; 1317 1318 /* Collect all pages in the same address range and memory domain 1319 * that can be mapped with a single call to update mapping. 
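		 * For example, a run of system-memory pages followed by a run
		 * of VRAM pages results in two amdgpu_vm_update_range() calls
		 * with different PTE flags.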
1320 */ 1321 if (i < offset + npages - 1 && 1322 last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN)) 1323 continue; 1324 1325 pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n", 1326 last_start, prange->start + i, last_domain ? "GPU" : "CPU"); 1327 1328 pte_flags = svm_range_get_pte_flags(adev, prange, last_domain); 1329 if (readonly) 1330 pte_flags &= ~AMDGPU_PTE_WRITEABLE; 1331 1332 pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n", 1333 prange->svms, last_start, prange->start + i, 1334 (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0, 1335 pte_flags); 1336 1337 r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, NULL, 1338 last_start, prange->start + i, 1339 pte_flags, 1340 (last_start - prange->start) << PAGE_SHIFT, 1341 bo_adev ? bo_adev->vm_manager.vram_base_offset : 0, 1342 NULL, dma_addr, &vm->last_update); 1343 1344 for (j = last_start - prange->start; j <= i; j++) 1345 dma_addr[j] |= last_domain; 1346 1347 if (r) { 1348 pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start); 1349 goto out; 1350 } 1351 last_start = prange->start + i + 1; 1352 } 1353 1354 r = amdgpu_vm_update_pdes(adev, vm, false); 1355 if (r) { 1356 pr_debug("failed %d to update directories 0x%lx\n", r, 1357 prange->start); 1358 goto out; 1359 } 1360 1361 if (fence) 1362 *fence = dma_fence_get(vm->last_update); 1363 1364 out: 1365 return r; 1366 } 1367 1368 static int 1369 svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, 1370 unsigned long npages, bool readonly, 1371 unsigned long *bitmap, bool wait, bool flush_tlb) 1372 { 1373 struct kfd_process_device *pdd; 1374 struct amdgpu_device *bo_adev; 1375 struct kfd_process *p; 1376 struct dma_fence *fence = NULL; 1377 uint32_t gpuidx; 1378 int r = 0; 1379 1380 if (prange->svm_bo && prange->ttm_res) 1381 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); 1382 else 1383 bo_adev = NULL; 1384 1385 p = container_of(prange->svms, struct kfd_process, svms); 1386 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 1387 pr_debug("mapping to gpu idx 0x%x\n", gpuidx); 1388 pdd = kfd_process_device_from_gpuidx(p, gpuidx); 1389 if (!pdd) { 1390 pr_debug("failed to find device idx %d\n", gpuidx); 1391 return -EINVAL; 1392 } 1393 1394 pdd = kfd_bind_process_to_device(pdd->dev, p); 1395 if (IS_ERR(pdd)) 1396 return -EINVAL; 1397 1398 if (bo_adev && pdd->dev->adev != bo_adev && 1399 !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) { 1400 pr_debug("cannot map to device idx %d\n", gpuidx); 1401 continue; 1402 } 1403 1404 r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly, 1405 prange->dma_addr[gpuidx], 1406 bo_adev, wait ? 
					 &fence : NULL,
					 flush_tlb);
		if (r)
			break;

		if (fence) {
			r = dma_fence_wait(fence, false);
			dma_fence_put(fence);
			fence = NULL;
			if (r) {
				pr_debug("failed %d to dma fence wait\n", r);
				break;
			}
		}

		kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
	}

	return r;
}

struct svm_validate_context {
	struct kfd_process *process;
	struct svm_range *prange;
	bool intr;
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	struct ttm_validate_buffer tv[MAX_GPU_INSTANCE];
	struct list_head validate_list;
	struct ww_acquire_ctx ticket;
};

static int svm_range_reserve_bos(struct svm_validate_context *ctx)
{
	struct kfd_process_device *pdd;
	struct amdgpu_vm *vm;
	uint32_t gpuidx;
	int r;

	INIT_LIST_HEAD(&ctx->validate_list);
	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}
		vm = drm_priv_to_vm(pdd->drm_priv);

		ctx->tv[gpuidx].bo = &vm->root.bo->tbo;
		ctx->tv[gpuidx].num_shared = 4;
		list_add(&ctx->tv[gpuidx].head, &ctx->validate_list);
	}

	r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list,
				   ctx->intr, NULL);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		return r;
	}

	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			r = -EINVAL;
			goto unreserve_out;
		}

		r = amdgpu_vm_validate_pt_bos(pdd->dev->adev,
					      drm_priv_to_vm(pdd->drm_priv),
					      svm_range_bo_validate, NULL);
		if (r) {
			pr_debug("failed %d validate pt bos\n", r);
			goto unreserve_out;
		}
	}

	return 0;

unreserve_out:
	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
	return r;
}

static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
{
	ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
}

static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
{
	struct kfd_process_device *pdd;

	pdd = kfd_process_device_from_gpuidx(p, gpuidx);

	return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev);
}

/*
 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
 *
 * To prevent concurrent destruction or change of range attributes, the
 * svm_read_lock must be held. The caller must not hold the svm_write_lock
 * because that would block concurrent evictions and lead to deadlocks. To
 * serialize concurrent migrations or validations of the same range, the
 * prange->migrate_mutex must be held.
 *
 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
 * eviction fence).
 *
 * The following sequence ensures race-free validation and GPU mapping:
 *
 * 1. Reserve page table (and SVM BO if range is in VRAM)
 * 2. hmm_range_fault to get page addresses (if system memory)
 * 3. DMA-map pages (if system memory)
 * 4-a. Take notifier lock
 * 4-b. Check that pages are still valid (mmu_interval_read_retry)
 * 4-c. Check that the range was not split or otherwise invalidated
 * 4-d. Update GPU page table
 * 4-e.
Release notifier lock 1525 * 5. Release page table (and SVM BO) reservation 1526 */ 1527 static int svm_range_validate_and_map(struct mm_struct *mm, 1528 struct svm_range *prange, int32_t gpuidx, 1529 bool intr, bool wait, bool flush_tlb) 1530 { 1531 struct svm_validate_context ctx; 1532 unsigned long start, end, addr; 1533 struct kfd_process *p; 1534 void *owner; 1535 int32_t idx; 1536 int r = 0; 1537 1538 ctx.process = container_of(prange->svms, struct kfd_process, svms); 1539 ctx.prange = prange; 1540 ctx.intr = intr; 1541 1542 if (gpuidx < MAX_GPU_INSTANCE) { 1543 bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE); 1544 bitmap_set(ctx.bitmap, gpuidx, 1); 1545 } else if (ctx.process->xnack_enabled) { 1546 bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); 1547 1548 /* If prefetch range to GPU, or GPU retry fault migrate range to 1549 * GPU, which has ACCESS attribute to the range, create mapping 1550 * on that GPU. 1551 */ 1552 if (prange->actual_loc) { 1553 gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process, 1554 prange->actual_loc); 1555 if (gpuidx < 0) { 1556 WARN_ONCE(1, "failed get device by id 0x%x\n", 1557 prange->actual_loc); 1558 return -EINVAL; 1559 } 1560 if (test_bit(gpuidx, prange->bitmap_access)) 1561 bitmap_set(ctx.bitmap, gpuidx, 1); 1562 } 1563 } else { 1564 bitmap_or(ctx.bitmap, prange->bitmap_access, 1565 prange->bitmap_aip, MAX_GPU_INSTANCE); 1566 } 1567 1568 if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE)) { 1569 if (!prange->mapped_to_gpu) 1570 return 0; 1571 1572 bitmap_copy(ctx.bitmap, prange->bitmap_access, MAX_GPU_INSTANCE); 1573 } 1574 1575 if (prange->actual_loc && !prange->ttm_res) { 1576 /* This should never happen. actual_loc gets set by 1577 * svm_migrate_ram_to_vram after allocating a BO. 1578 */ 1579 WARN_ONCE(1, "VRAM BO missing during validation\n"); 1580 return -EINVAL; 1581 } 1582 1583 svm_range_reserve_bos(&ctx); 1584 1585 p = container_of(prange->svms, struct kfd_process, svms); 1586 owner = kfd_svm_page_owner(p, find_first_bit(ctx.bitmap, 1587 MAX_GPU_INSTANCE)); 1588 for_each_set_bit(idx, ctx.bitmap, MAX_GPU_INSTANCE) { 1589 if (kfd_svm_page_owner(p, idx) != owner) { 1590 owner = NULL; 1591 break; 1592 } 1593 } 1594 1595 start = prange->start << PAGE_SHIFT; 1596 end = (prange->last + 1) << PAGE_SHIFT; 1597 for (addr = start; addr < end && !r; ) { 1598 struct hmm_range *hmm_range; 1599 struct vm_area_struct *vma; 1600 unsigned long next; 1601 unsigned long offset; 1602 unsigned long npages; 1603 bool readonly; 1604 1605 vma = find_vma(mm, addr); 1606 if (!vma || addr < vma->vm_start) { 1607 r = -EFAULT; 1608 goto unreserve_out; 1609 } 1610 readonly = !(vma->vm_flags & VM_WRITE); 1611 1612 next = min(vma->vm_end, end); 1613 npages = (next - addr) >> PAGE_SHIFT; 1614 WRITE_ONCE(p->svms.faulting_task, current); 1615 r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL, 1616 addr, npages, &hmm_range, 1617 readonly, true, owner); 1618 WRITE_ONCE(p->svms.faulting_task, NULL); 1619 if (r) { 1620 pr_debug("failed %d to get svm range pages\n", r); 1621 goto unreserve_out; 1622 } 1623 1624 offset = (addr - start) >> PAGE_SHIFT; 1625 r = svm_range_dma_map(prange, ctx.bitmap, offset, npages, 1626 hmm_range->hmm_pfns); 1627 if (r) { 1628 pr_debug("failed %d to dma map range\n", r); 1629 goto unreserve_out; 1630 } 1631 1632 svm_range_lock(prange); 1633 if (amdgpu_hmm_range_get_pages_done(hmm_range)) { 1634 pr_debug("hmm update the range, need validate again\n"); 1635 r = -EAGAIN; 1636 goto unlock_out; 1637 } 1638 if (!list_empty(&prange->child_list)) { 
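			/* A concurrent unmap split this range while it was
			 * unlocked for hmm_range_fault; return -EAGAIN so the
			 * caller can retry after the split is processed.
			 */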
1639 pr_debug("range split by unmap in parallel, validate again\n"); 1640 r = -EAGAIN; 1641 goto unlock_out; 1642 } 1643 1644 r = svm_range_map_to_gpus(prange, offset, npages, readonly, 1645 ctx.bitmap, wait, flush_tlb); 1646 1647 unlock_out: 1648 svm_range_unlock(prange); 1649 1650 addr = next; 1651 } 1652 1653 if (addr == end) { 1654 prange->validated_once = true; 1655 prange->mapped_to_gpu = true; 1656 } 1657 1658 unreserve_out: 1659 svm_range_unreserve_bos(&ctx); 1660 1661 if (!r) 1662 prange->validate_timestamp = ktime_get_boottime(); 1663 1664 return r; 1665 } 1666 1667 /** 1668 * svm_range_list_lock_and_flush_work - flush pending deferred work 1669 * 1670 * @svms: the svm range list 1671 * @mm: the mm structure 1672 * 1673 * Context: Returns with mmap write lock held, pending deferred work flushed 1674 * 1675 */ 1676 void 1677 svm_range_list_lock_and_flush_work(struct svm_range_list *svms, 1678 struct mm_struct *mm) 1679 { 1680 retry_flush_work: 1681 flush_work(&svms->deferred_list_work); 1682 mmap_write_lock(mm); 1683 1684 if (list_empty(&svms->deferred_range_list)) 1685 return; 1686 mmap_write_unlock(mm); 1687 pr_debug("retry flush\n"); 1688 goto retry_flush_work; 1689 } 1690 1691 static void svm_range_restore_work(struct work_struct *work) 1692 { 1693 struct delayed_work *dwork = to_delayed_work(work); 1694 struct amdkfd_process_info *process_info; 1695 struct svm_range_list *svms; 1696 struct svm_range *prange; 1697 struct kfd_process *p; 1698 struct mm_struct *mm; 1699 int evicted_ranges; 1700 int invalid; 1701 int r; 1702 1703 svms = container_of(dwork, struct svm_range_list, restore_work); 1704 evicted_ranges = atomic_read(&svms->evicted_ranges); 1705 if (!evicted_ranges) 1706 return; 1707 1708 pr_debug("restore svm ranges\n"); 1709 1710 p = container_of(svms, struct kfd_process, svms); 1711 process_info = p->kgd_process_info; 1712 1713 /* Keep mm reference when svm_range_validate_and_map ranges */ 1714 mm = get_task_mm(p->lead_thread); 1715 if (!mm) { 1716 pr_debug("svms 0x%p process mm gone\n", svms); 1717 return; 1718 } 1719 1720 mutex_lock(&process_info->lock); 1721 svm_range_list_lock_and_flush_work(svms, mm); 1722 mutex_lock(&svms->lock); 1723 1724 evicted_ranges = atomic_read(&svms->evicted_ranges); 1725 1726 list_for_each_entry(prange, &svms->list, list) { 1727 invalid = atomic_read(&prange->invalid); 1728 if (!invalid) 1729 continue; 1730 1731 pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n", 1732 prange->svms, prange, prange->start, prange->last, 1733 invalid); 1734 1735 /* 1736 * If range is migrating, wait for migration is done. 1737 */ 1738 mutex_lock(&prange->migrate_mutex); 1739 1740 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 1741 false, true, false); 1742 if (r) 1743 pr_debug("failed %d to map 0x%lx to gpus\n", r, 1744 prange->start); 1745 1746 mutex_unlock(&prange->migrate_mutex); 1747 if (r) 1748 goto out_reschedule; 1749 1750 if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid) 1751 goto out_reschedule; 1752 } 1753 1754 if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) != 1755 evicted_ranges) 1756 goto out_reschedule; 1757 1758 evicted_ranges = 0; 1759 1760 r = kgd2kfd_resume_mm(mm); 1761 if (r) { 1762 /* No recovery from this failure. Probably the CP is 1763 * hanging. No point trying again. 
		 */
		pr_debug("failed %d to resume KFD\n", r);
	}

	pr_debug("restore svm ranges successfully\n");

out_reschedule:
	mutex_unlock(&svms->lock);
	mmap_write_unlock(mm);
	mutex_unlock(&process_info->lock);

	/* If validation failed, reschedule another attempt */
	if (evicted_ranges) {
		pr_debug("reschedule to restore svm range\n");
		schedule_delayed_work(&svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));

		kfd_smi_event_queue_restore_rescheduled(mm);
	}
	mmput(mm);
}

/**
 * svm_range_evict - evict svm range
 * @prange: svm range structure
 * @mm: current process mm_struct
 * @start: first page of the invalidated address range, in pages
 * @last: last page of the invalidated address range, in pages
 * @event: mmu notifier event that triggered the eviction
 *
 * Stop all queues of the process to make sure the GPU no longer accesses the
 * memory, then return to let the CPU evict the buffer and proceed with the
 * CPU page table update.
 *
 * No lock is needed to synchronize the CPU page table invalidation with GPU
 * execution. If an invalidation happens while the restore work is running,
 * the restore work restarts to pick up the latest CPU page mapping before
 * starting the queues.
 */
static int
svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
		unsigned long start, unsigned long last,
		enum mmu_notifier_event event)
{
	struct svm_range_list *svms = prange->svms;
	struct svm_range *pchild;
	struct kfd_process *p;
	int r = 0;

	p = container_of(svms, struct kfd_process, svms);

	pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
		 svms, prange->start, prange->last, start, last);

	if (!p->xnack_enabled ||
	    (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) {
		int evicted_ranges;
		bool mapped = prange->mapped_to_gpu;

		list_for_each_entry(pchild, &prange->child_list, child_list) {
			if (!pchild->mapped_to_gpu)
				continue;
			mapped = true;
			mutex_lock_nested(&pchild->lock, 1);
			if (pchild->start <= last && pchild->last >= start) {
				pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
					 pchild->start, pchild->last);
				atomic_inc(&pchild->invalid);
			}
			mutex_unlock(&pchild->lock);
		}

		if (!mapped)
			return r;

		if (prange->start <= last && prange->last >= start)
			atomic_inc(&prange->invalid);

		evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
		if (evicted_ranges != 1)
			return r;

		pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
			 prange->svms, prange->start, prange->last);

		/* First eviction, stop the queues */
		r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
		if (r)
			pr_debug("failed to quiesce KFD\n");

		pr_debug("schedule to restore svm %p ranges\n", svms);
		schedule_delayed_work(&svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
	} else {
		unsigned long s, l;
		uint32_t trigger;

		if (event == MMU_NOTIFY_MIGRATE)
			trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE;
		else
			trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY;

		pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
			 prange->svms, start, last);
		list_for_each_entry(pchild, &prange->child_list, child_list) {
			mutex_lock_nested(&pchild->lock, 1);
			s = max(start, pchild->start);
			l = min(last, pchild->last);
			if (l >= s)
svm_range_unmap_from_gpus(pchild, s, l, trigger); 1872 mutex_unlock(&pchild->lock); 1873 } 1874 s = max(start, prange->start); 1875 l = min(last, prange->last); 1876 if (l >= s) 1877 svm_range_unmap_from_gpus(prange, s, l, trigger); 1878 } 1879 1880 return r; 1881 } 1882 1883 static struct svm_range *svm_range_clone(struct svm_range *old) 1884 { 1885 struct svm_range *new; 1886 1887 new = svm_range_new(old->svms, old->start, old->last, false); 1888 if (!new) 1889 return NULL; 1890 1891 if (old->svm_bo) { 1892 new->ttm_res = old->ttm_res; 1893 new->offset = old->offset; 1894 new->svm_bo = svm_range_bo_ref(old->svm_bo); 1895 spin_lock(&new->svm_bo->list_lock); 1896 list_add(&new->svm_bo_list, &new->svm_bo->range_list); 1897 spin_unlock(&new->svm_bo->list_lock); 1898 } 1899 new->flags = old->flags; 1900 new->preferred_loc = old->preferred_loc; 1901 new->prefetch_loc = old->prefetch_loc; 1902 new->actual_loc = old->actual_loc; 1903 new->granularity = old->granularity; 1904 new->mapped_to_gpu = old->mapped_to_gpu; 1905 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); 1906 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); 1907 1908 return new; 1909 } 1910 1911 void svm_range_set_max_pages(struct amdgpu_device *adev) 1912 { 1913 uint64_t max_pages; 1914 uint64_t pages, _pages; 1915 1916 /* 1/32 VRAM size in pages */ 1917 pages = adev->gmc.real_vram_size >> 17; 1918 pages = clamp(pages, 1ULL << 9, 1ULL << 18); 1919 pages = rounddown_pow_of_two(pages); 1920 do { 1921 max_pages = READ_ONCE(max_svm_range_pages); 1922 _pages = min_not_zero(max_pages, pages); 1923 } while (cmpxchg(&max_svm_range_pages, max_pages, _pages) != max_pages); 1924 } 1925 1926 static int 1927 svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last, 1928 uint64_t max_pages, struct list_head *insert_list, 1929 struct list_head *update_list) 1930 { 1931 struct svm_range *prange; 1932 uint64_t l; 1933 1934 pr_debug("max_svm_range_pages 0x%llx adding [0x%llx 0x%llx]\n", 1935 max_pages, start, last); 1936 1937 while (last >= start) { 1938 l = min(last, ALIGN_DOWN(start + max_pages, max_pages) - 1); 1939 1940 prange = svm_range_new(svms, start, l, true); 1941 if (!prange) 1942 return -ENOMEM; 1943 list_add(&prange->list, insert_list); 1944 list_add(&prange->update_list, update_list); 1945 1946 start = l + 1; 1947 } 1948 return 0; 1949 } 1950 1951 /** 1952 * svm_range_add - add svm range and handle overlap 1953 * @p: the range add to this process svms 1954 * @start: page size aligned 1955 * @size: page size aligned 1956 * @nattr: number of attributes 1957 * @attrs: array of attributes 1958 * @update_list: output, the ranges need validate and update GPU mapping 1959 * @insert_list: output, the ranges need insert to svms 1960 * @remove_list: output, the ranges are replaced and need remove from svms 1961 * 1962 * Check if the virtual address range has overlap with any existing ranges, 1963 * split partly overlapping ranges and add new ranges in the gaps. All changes 1964 * should be applied to the range_list and interval tree transactionally. If 1965 * any range split or allocation fails, the entire update fails. Therefore any 1966 * existing overlapping svm_ranges are cloned and the original svm_ranges left 1967 * unchanged. 1968 * 1969 * If the transaction succeeds, the caller can update and insert clones and 1970 * new ranges, then free the originals. 1971 * 1972 * Otherwise the caller can free the clones and new ranges, while the old 1973 * svm_ranges remain unchanged. 
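 *
 * For example (illustrative page numbers, assuming the attributes differ and
 * max_svm_range_pages is large enough): registering [0x400 0x7ff] over an
 * existing range [0x000 0x5ff] clones [0x000 0x5ff], splits the clone so that
 * only [0x400 0x5ff] gets the new attributes while [0x000 0x3ff] keeps the
 * old ones, puts the original range on @remove_list, and creates a new range
 * [0x600 0x7ff] for the part not previously covered.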
1974 * 1975 * Context: Process context, caller must hold svms->lock 1976 * 1977 * Return: 1978 * 0 - OK, otherwise error code 1979 */ 1980 static int 1981 svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, 1982 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs, 1983 struct list_head *update_list, struct list_head *insert_list, 1984 struct list_head *remove_list) 1985 { 1986 unsigned long last = start + size - 1UL; 1987 struct svm_range_list *svms = &p->svms; 1988 struct interval_tree_node *node; 1989 struct svm_range *prange; 1990 struct svm_range *tmp; 1991 struct list_head new_list; 1992 int r = 0; 1993 1994 pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last); 1995 1996 INIT_LIST_HEAD(update_list); 1997 INIT_LIST_HEAD(insert_list); 1998 INIT_LIST_HEAD(remove_list); 1999 INIT_LIST_HEAD(&new_list); 2000 2001 node = interval_tree_iter_first(&svms->objects, start, last); 2002 while (node) { 2003 struct interval_tree_node *next; 2004 unsigned long next_start; 2005 2006 pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start, 2007 node->last); 2008 2009 prange = container_of(node, struct svm_range, it_node); 2010 next = interval_tree_iter_next(node, start, last); 2011 next_start = min(node->last, last) + 1; 2012 2013 if (svm_range_is_same_attrs(p, prange, nattr, attrs)) { 2014 /* nothing to do */ 2015 } else if (node->start < start || node->last > last) { 2016 /* node intersects the update range and its attributes 2017 * will change. Clone and split it, apply updates only 2018 * to the overlapping part 2019 */ 2020 struct svm_range *old = prange; 2021 2022 prange = svm_range_clone(old); 2023 if (!prange) { 2024 r = -ENOMEM; 2025 goto out; 2026 } 2027 2028 list_add(&old->update_list, remove_list); 2029 list_add(&prange->list, insert_list); 2030 list_add(&prange->update_list, update_list); 2031 2032 if (node->start < start) { 2033 pr_debug("change old range start\n"); 2034 r = svm_range_split_head(prange, start, 2035 insert_list); 2036 if (r) 2037 goto out; 2038 } 2039 if (node->last > last) { 2040 pr_debug("change old range last\n"); 2041 r = svm_range_split_tail(prange, last, 2042 insert_list); 2043 if (r) 2044 goto out; 2045 } 2046 } else { 2047 /* The node is contained within start..last, 2048 * just update it 2049 */ 2050 list_add(&prange->update_list, update_list); 2051 } 2052 2053 /* insert a new node if needed */ 2054 if (node->start > start) { 2055 r = svm_range_split_new(svms, start, node->start - 1, 2056 READ_ONCE(max_svm_range_pages), 2057 &new_list, update_list); 2058 if (r) 2059 goto out; 2060 } 2061 2062 node = next; 2063 start = next_start; 2064 } 2065 2066 /* add a final range at the end if needed */ 2067 if (start <= last) 2068 r = svm_range_split_new(svms, start, last, 2069 READ_ONCE(max_svm_range_pages), 2070 &new_list, update_list); 2071 2072 out: 2073 if (r) { 2074 list_for_each_entry_safe(prange, tmp, insert_list, list) 2075 svm_range_free(prange, false); 2076 list_for_each_entry_safe(prange, tmp, &new_list, list) 2077 svm_range_free(prange, true); 2078 } else { 2079 list_splice(&new_list, insert_list); 2080 } 2081 2082 return r; 2083 } 2084 2085 static void 2086 svm_range_update_notifier_and_interval_tree(struct mm_struct *mm, 2087 struct svm_range *prange) 2088 { 2089 unsigned long start; 2090 unsigned long last; 2091 2092 start = prange->notifier.interval_tree.start >> PAGE_SHIFT; 2093 last = prange->notifier.interval_tree.last >> PAGE_SHIFT; 2094 2095 if (prange->start == start && prange->last == last) 2096 return; 2097 2098 
pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", 2099 prange->svms, prange, start, last, prange->start, 2100 prange->last); 2101 2102 if (start != 0 && last != 0) { 2103 interval_tree_remove(&prange->it_node, &prange->svms->objects); 2104 svm_range_remove_notifier(prange); 2105 } 2106 prange->it_node.start = prange->start; 2107 prange->it_node.last = prange->last; 2108 2109 interval_tree_insert(&prange->it_node, &prange->svms->objects); 2110 svm_range_add_notifier_locked(mm, prange); 2111 } 2112 2113 static void 2114 svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange, 2115 struct mm_struct *mm) 2116 { 2117 switch (prange->work_item.op) { 2118 case SVM_OP_NULL: 2119 pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n", 2120 svms, prange, prange->start, prange->last); 2121 break; 2122 case SVM_OP_UNMAP_RANGE: 2123 pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n", 2124 svms, prange, prange->start, prange->last); 2125 svm_range_unlink(prange); 2126 svm_range_remove_notifier(prange); 2127 svm_range_free(prange, true); 2128 break; 2129 case SVM_OP_UPDATE_RANGE_NOTIFIER: 2130 pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n", 2131 svms, prange, prange->start, prange->last); 2132 svm_range_update_notifier_and_interval_tree(mm, prange); 2133 break; 2134 case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP: 2135 pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", 2136 svms, prange, prange->start, prange->last); 2137 svm_range_update_notifier_and_interval_tree(mm, prange); 2138 /* TODO: implement deferred validation and mapping */ 2139 break; 2140 case SVM_OP_ADD_RANGE: 2141 pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, 2142 prange->start, prange->last); 2143 svm_range_add_to_svms(prange); 2144 svm_range_add_notifier_locked(mm, prange); 2145 break; 2146 case SVM_OP_ADD_RANGE_AND_MAP: 2147 pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, 2148 prange, prange->start, prange->last); 2149 svm_range_add_to_svms(prange); 2150 svm_range_add_notifier_locked(mm, prange); 2151 /* TODO: implement deferred validation and mapping */ 2152 break; 2153 default: 2154 WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange, 2155 prange->work_item.op); 2156 } 2157 } 2158 2159 static void svm_range_drain_retry_fault(struct svm_range_list *svms) 2160 { 2161 struct kfd_process_device *pdd; 2162 struct kfd_process *p; 2163 int drain; 2164 uint32_t i; 2165 2166 p = container_of(svms, struct kfd_process, svms); 2167 2168 restart: 2169 drain = atomic_read(&svms->drain_pagefaults); 2170 if (!drain) 2171 return; 2172 2173 for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { 2174 pdd = p->pdds[i]; 2175 if (!pdd) 2176 continue; 2177 2178 pr_debug("drain retry fault gpu %d svms %p\n", i, svms); 2179 2180 amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, 2181 &pdd->dev->adev->irq.ih1); 2182 pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); 2183 } 2184 if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain) 2185 goto restart; 2186 } 2187 2188 static void svm_range_deferred_list_work(struct work_struct *work) 2189 { 2190 struct svm_range_list *svms; 2191 struct svm_range *prange; 2192 struct mm_struct *mm; 2193 2194 svms = container_of(work, struct svm_range_list, deferred_list_work); 2195 pr_debug("enter svms 0x%p\n", svms); 2196 2197 spin_lock(&svms->deferred_list_lock); 2198 while (!list_empty(&svms->deferred_range_list)) { 2199 prange = list_first_entry(&svms->deferred_range_list, 2200 struct svm_range, 
deferred_list); 2201 spin_unlock(&svms->deferred_list_lock); 2202 2203 pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange, 2204 prange->start, prange->last, prange->work_item.op); 2205 2206 mm = prange->work_item.mm; 2207 retry: 2208 mmap_write_lock(mm); 2209 2210 /* Checking for the need to drain retry faults must be inside 2211 * mmap write lock to serialize with munmap notifiers. 2212 */ 2213 if (unlikely(atomic_read(&svms->drain_pagefaults))) { 2214 mmap_write_unlock(mm); 2215 svm_range_drain_retry_fault(svms); 2216 goto retry; 2217 } 2218 2219 /* Remove from deferred_list must be inside mmap write lock, for 2220 * two race cases: 2221 * 1. unmap_from_cpu may change work_item.op and add the range 2222 * to deferred_list again, cause use after free bug. 2223 * 2. svm_range_list_lock_and_flush_work may hold mmap write 2224 * lock and continue because deferred_list is empty, but 2225 * deferred_list work is actually waiting for mmap lock. 2226 */ 2227 spin_lock(&svms->deferred_list_lock); 2228 list_del_init(&prange->deferred_list); 2229 spin_unlock(&svms->deferred_list_lock); 2230 2231 mutex_lock(&svms->lock); 2232 mutex_lock(&prange->migrate_mutex); 2233 while (!list_empty(&prange->child_list)) { 2234 struct svm_range *pchild; 2235 2236 pchild = list_first_entry(&prange->child_list, 2237 struct svm_range, child_list); 2238 pr_debug("child prange 0x%p op %d\n", pchild, 2239 pchild->work_item.op); 2240 list_del_init(&pchild->child_list); 2241 svm_range_handle_list_op(svms, pchild, mm); 2242 } 2243 mutex_unlock(&prange->migrate_mutex); 2244 2245 svm_range_handle_list_op(svms, prange, mm); 2246 mutex_unlock(&svms->lock); 2247 mmap_write_unlock(mm); 2248 2249 /* Pairs with mmget in svm_range_add_list_work */ 2250 mmput(mm); 2251 2252 spin_lock(&svms->deferred_list_lock); 2253 } 2254 spin_unlock(&svms->deferred_list_lock); 2255 pr_debug("exit svms 0x%p\n", svms); 2256 } 2257 2258 void 2259 svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, 2260 struct mm_struct *mm, enum svm_work_list_ops op) 2261 { 2262 spin_lock(&svms->deferred_list_lock); 2263 /* if prange is on the deferred list */ 2264 if (!list_empty(&prange->deferred_list)) { 2265 pr_debug("update exist prange 0x%p work op %d\n", prange, op); 2266 WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n"); 2267 if (op != SVM_OP_NULL && 2268 prange->work_item.op != SVM_OP_UNMAP_RANGE) 2269 prange->work_item.op = op; 2270 } else { 2271 prange->work_item.op = op; 2272 2273 /* Pairs with mmput in deferred_list_work */ 2274 mmget(mm); 2275 prange->work_item.mm = mm; 2276 list_add_tail(&prange->deferred_list, 2277 &prange->svms->deferred_range_list); 2278 pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", 2279 prange, prange->start, prange->last, op); 2280 } 2281 spin_unlock(&svms->deferred_list_lock); 2282 } 2283 2284 void schedule_deferred_list_work(struct svm_range_list *svms) 2285 { 2286 spin_lock(&svms->deferred_list_lock); 2287 if (!list_empty(&svms->deferred_range_list)) 2288 schedule_work(&svms->deferred_list_work); 2289 spin_unlock(&svms->deferred_list_lock); 2290 } 2291 2292 static void 2293 svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, 2294 struct svm_range *prange, unsigned long start, 2295 unsigned long last) 2296 { 2297 struct svm_range *head; 2298 struct svm_range *tail; 2299 2300 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 2301 pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange, 2302 prange->start, prange->last); 2303 return; 2304 } 2305 if (start 
> prange->last || last < prange->start) 2306 return; 2307 2308 head = tail = prange; 2309 if (start > prange->start) 2310 svm_range_split(prange, prange->start, start - 1, &tail); 2311 if (last < tail->last) 2312 svm_range_split(tail, last + 1, tail->last, &head); 2313 2314 if (head != prange && tail != prange) { 2315 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 2316 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 2317 } else if (tail != prange) { 2318 svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); 2319 } else if (head != prange) { 2320 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 2321 } else if (parent != prange) { 2322 prange->work_item.op = SVM_OP_UNMAP_RANGE; 2323 } 2324 } 2325 2326 static void 2327 svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, 2328 unsigned long start, unsigned long last) 2329 { 2330 uint32_t trigger = KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU; 2331 struct svm_range_list *svms; 2332 struct svm_range *pchild; 2333 struct kfd_process *p; 2334 unsigned long s, l; 2335 bool unmap_parent; 2336 2337 p = kfd_lookup_process_by_mm(mm); 2338 if (!p) 2339 return; 2340 svms = &p->svms; 2341 2342 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms, 2343 prange, prange->start, prange->last, start, last); 2344 2345 /* Make sure pending page faults are drained in the deferred worker 2346 * before the range is freed to avoid straggler interrupts on 2347 * unmapped memory causing "phantom faults". 2348 */ 2349 atomic_inc(&svms->drain_pagefaults); 2350 2351 unmap_parent = start <= prange->start && last >= prange->last; 2352 2353 list_for_each_entry(pchild, &prange->child_list, child_list) { 2354 mutex_lock_nested(&pchild->lock, 1); 2355 s = max(start, pchild->start); 2356 l = min(last, pchild->last); 2357 if (l >= s) 2358 svm_range_unmap_from_gpus(pchild, s, l, trigger); 2359 svm_range_unmap_split(mm, prange, pchild, start, last); 2360 mutex_unlock(&pchild->lock); 2361 } 2362 s = max(start, prange->start); 2363 l = min(last, prange->last); 2364 if (l >= s) 2365 svm_range_unmap_from_gpus(prange, s, l, trigger); 2366 svm_range_unmap_split(mm, prange, prange, start, last); 2367 2368 if (unmap_parent) 2369 svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); 2370 else 2371 svm_range_add_list_work(svms, prange, mm, 2372 SVM_OP_UPDATE_RANGE_NOTIFIER); 2373 schedule_deferred_list_work(svms); 2374 2375 kfd_unref_process(p); 2376 } 2377 2378 /** 2379 * svm_range_cpu_invalidate_pagetables - interval notifier callback 2380 * @mni: mmu_interval_notifier struct 2381 * @range: mmu_notifier_range struct 2382 * @cur_seq: value to pass to mmu_interval_set_seq() 2383 * 2384 * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it 2385 * is from migration, or CPU page invalidation callback. 2386 * 2387 * For unmap event, unmap range from GPUs, remove prange from svms in a delayed 2388 * work thread, and split prange if only part of prange is unmapped. 2389 * 2390 * For invalidation event, if GPU retry fault is not enabled, evict the queues, 2391 * then schedule svm_range_restore_work to update GPU mapping and resume queues. 2392 * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will 2393 * update GPU mapping to recover. 
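 *
 * MMU_NOTIFY_RELEASE events are ignored here, and the callback also returns
 * early if the mm is already going away (mmget_not_zero fails). In all other
 * cases the notifier sequence is advanced under the prange lock before the
 * unmap or eviction path runs.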
2394 * 2395 * Context: mmap lock, notifier_invalidate_start lock are held 2396 * for invalidate event, prange lock is held if this is from migration 2397 */ 2398 static bool 2399 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 2400 const struct mmu_notifier_range *range, 2401 unsigned long cur_seq) 2402 { 2403 struct svm_range *prange; 2404 unsigned long start; 2405 unsigned long last; 2406 2407 if (range->event == MMU_NOTIFY_RELEASE) 2408 return true; 2409 if (!mmget_not_zero(mni->mm)) 2410 return true; 2411 2412 start = mni->interval_tree.start; 2413 last = mni->interval_tree.last; 2414 start = max(start, range->start) >> PAGE_SHIFT; 2415 last = min(last, range->end - 1) >> PAGE_SHIFT; 2416 pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n", 2417 start, last, range->start >> PAGE_SHIFT, 2418 (range->end - 1) >> PAGE_SHIFT, 2419 mni->interval_tree.start >> PAGE_SHIFT, 2420 mni->interval_tree.last >> PAGE_SHIFT, range->event); 2421 2422 prange = container_of(mni, struct svm_range, notifier); 2423 2424 svm_range_lock(prange); 2425 mmu_interval_set_seq(mni, cur_seq); 2426 2427 switch (range->event) { 2428 case MMU_NOTIFY_UNMAP: 2429 svm_range_unmap_from_cpu(mni->mm, prange, start, last); 2430 break; 2431 default: 2432 svm_range_evict(prange, mni->mm, start, last, range->event); 2433 break; 2434 } 2435 2436 svm_range_unlock(prange); 2437 mmput(mni->mm); 2438 2439 return true; 2440 } 2441 2442 /** 2443 * svm_range_from_addr - find svm range from fault address 2444 * @svms: svm range list header 2445 * @addr: address to search range interval tree, in pages 2446 * @parent: parent range if range is on child list 2447 * 2448 * Context: The caller must hold svms->lock 2449 * 2450 * Return: the svm_range found or NULL 2451 */ 2452 struct svm_range * 2453 svm_range_from_addr(struct svm_range_list *svms, unsigned long addr, 2454 struct svm_range **parent) 2455 { 2456 struct interval_tree_node *node; 2457 struct svm_range *prange; 2458 struct svm_range *pchild; 2459 2460 node = interval_tree_iter_first(&svms->objects, addr, addr); 2461 if (!node) 2462 return NULL; 2463 2464 prange = container_of(node, struct svm_range, it_node); 2465 pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n", 2466 addr, prange->start, prange->last, node->start, node->last); 2467 2468 if (addr >= prange->start && addr <= prange->last) { 2469 if (parent) 2470 *parent = prange; 2471 return prange; 2472 } 2473 list_for_each_entry(pchild, &prange->child_list, child_list) 2474 if (addr >= pchild->start && addr <= pchild->last) { 2475 pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n", 2476 addr, pchild->start, pchild->last); 2477 if (parent) 2478 *parent = prange; 2479 return pchild; 2480 } 2481 2482 return NULL; 2483 } 2484 2485 /* svm_range_best_restore_location - decide the best fault restore location 2486 * @prange: svm range structure 2487 * @adev: the GPU on which vm fault happened 2488 * 2489 * This is only called when xnack is on, to decide the best location to restore 2490 * the range mapping after GPU vm fault. Caller uses the best location to do 2491 * migration if actual loc is not best location, then update GPU page table 2492 * mapping to the best location. 2493 * 2494 * If the preferred loc is accessible by faulting GPU, use preferred loc. 
2495 * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu 2496 * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then 2497 * if range actual loc is cpu, best_loc is cpu 2498 * if vm fault gpu is on xgmi same hive of range actual loc gpu, best_loc is 2499 * range actual loc. 2500 * Otherwise, GPU no access, best_loc is -1. 2501 * 2502 * Return: 2503 * -1 means vm fault GPU no access 2504 * 0 for CPU or GPU id 2505 */ 2506 static int32_t 2507 svm_range_best_restore_location(struct svm_range *prange, 2508 struct amdgpu_device *adev, 2509 int32_t *gpuidx) 2510 { 2511 struct amdgpu_device *bo_adev, *preferred_adev; 2512 struct kfd_process *p; 2513 uint32_t gpuid; 2514 int r; 2515 2516 p = container_of(prange->svms, struct kfd_process, svms); 2517 2518 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, gpuidx); 2519 if (r < 0) { 2520 pr_debug("failed to get gpuid from kgd\n"); 2521 return -1; 2522 } 2523 2524 if (prange->preferred_loc == gpuid || 2525 prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) { 2526 return prange->preferred_loc; 2527 } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) { 2528 preferred_adev = svm_range_get_adev_by_id(prange, 2529 prange->preferred_loc); 2530 if (amdgpu_xgmi_same_hive(adev, preferred_adev)) 2531 return prange->preferred_loc; 2532 /* fall through */ 2533 } 2534 2535 if (test_bit(*gpuidx, prange->bitmap_access)) 2536 return gpuid; 2537 2538 if (test_bit(*gpuidx, prange->bitmap_aip)) { 2539 if (!prange->actual_loc) 2540 return 0; 2541 2542 bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc); 2543 if (amdgpu_xgmi_same_hive(adev, bo_adev)) 2544 return prange->actual_loc; 2545 else 2546 return 0; 2547 } 2548 2549 return -1; 2550 } 2551 2552 static int 2553 svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, 2554 unsigned long *start, unsigned long *last, 2555 bool *is_heap_stack) 2556 { 2557 struct vm_area_struct *vma; 2558 struct interval_tree_node *node; 2559 unsigned long start_limit, end_limit; 2560 2561 vma = find_vma(p->mm, addr << PAGE_SHIFT); 2562 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { 2563 pr_debug("VMA does not exist in address [0x%llx]\n", addr); 2564 return -EFAULT; 2565 } 2566 2567 *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk && 2568 vma->vm_end >= vma->vm_mm->start_brk) || 2569 (vma->vm_start <= vma->vm_mm->start_stack && 2570 vma->vm_end >= vma->vm_mm->start_stack); 2571 2572 start_limit = max(vma->vm_start >> PAGE_SHIFT, 2573 (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); 2574 end_limit = min(vma->vm_end >> PAGE_SHIFT, 2575 (unsigned long)ALIGN(addr + 1, 2UL << 8)); 2576 /* First range that starts after the fault address */ 2577 node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX); 2578 if (node) { 2579 end_limit = min(end_limit, node->start); 2580 /* Last range that ends before the fault address */ 2581 node = container_of(rb_prev(&node->rb), 2582 struct interval_tree_node, rb); 2583 } else { 2584 /* Last range must end before addr because 2585 * there was no range after addr 2586 */ 2587 node = container_of(rb_last(&p->svms.objects.rb_root), 2588 struct interval_tree_node, rb); 2589 } 2590 if (node) { 2591 if (node->last >= addr) { 2592 WARN(1, "Overlap with prev node and page fault addr\n"); 2593 return -EFAULT; 2594 } 2595 start_limit = max(start_limit, node->last + 1); 2596 } 2597 2598 *start = start_limit; 2599 *last = end_limit - 1; 2600 2601 pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n", 2602 
vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT, 2603 *start, *last, *is_heap_stack); 2604 2605 return 0; 2606 } 2607 2608 static int 2609 svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last, 2610 uint64_t *bo_s, uint64_t *bo_l) 2611 { 2612 struct amdgpu_bo_va_mapping *mapping; 2613 struct interval_tree_node *node; 2614 struct amdgpu_bo *bo = NULL; 2615 unsigned long userptr; 2616 uint32_t i; 2617 int r; 2618 2619 for (i = 0; i < p->n_pdds; i++) { 2620 struct amdgpu_vm *vm; 2621 2622 if (!p->pdds[i]->drm_priv) 2623 continue; 2624 2625 vm = drm_priv_to_vm(p->pdds[i]->drm_priv); 2626 r = amdgpu_bo_reserve(vm->root.bo, false); 2627 if (r) 2628 return r; 2629 2630 /* Check userptr by searching entire vm->va interval tree */ 2631 node = interval_tree_iter_first(&vm->va, 0, ~0ULL); 2632 while (node) { 2633 mapping = container_of((struct rb_node *)node, 2634 struct amdgpu_bo_va_mapping, rb); 2635 bo = mapping->bo_va->base.bo; 2636 2637 if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, 2638 start << PAGE_SHIFT, 2639 last << PAGE_SHIFT, 2640 &userptr)) { 2641 node = interval_tree_iter_next(node, 0, ~0ULL); 2642 continue; 2643 } 2644 2645 pr_debug("[0x%llx 0x%llx] already userptr mapped\n", 2646 start, last); 2647 if (bo_s && bo_l) { 2648 *bo_s = userptr >> PAGE_SHIFT; 2649 *bo_l = *bo_s + bo->tbo.ttm->num_pages - 1; 2650 } 2651 amdgpu_bo_unreserve(vm->root.bo); 2652 return -EADDRINUSE; 2653 } 2654 amdgpu_bo_unreserve(vm->root.bo); 2655 } 2656 return 0; 2657 } 2658 2659 static struct 2660 svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev, 2661 struct kfd_process *p, 2662 struct mm_struct *mm, 2663 int64_t addr) 2664 { 2665 struct svm_range *prange = NULL; 2666 unsigned long start, last; 2667 uint32_t gpuid, gpuidx; 2668 bool is_heap_stack; 2669 uint64_t bo_s = 0; 2670 uint64_t bo_l = 0; 2671 int r; 2672 2673 if (svm_range_get_range_boundaries(p, addr, &start, &last, 2674 &is_heap_stack)) 2675 return NULL; 2676 2677 r = svm_range_check_vm(p, start, last, &bo_s, &bo_l); 2678 if (r != -EADDRINUSE) 2679 r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l); 2680 2681 if (r == -EADDRINUSE) { 2682 if (addr >= bo_s && addr <= bo_l) 2683 return NULL; 2684 2685 /* Create one page svm range if 2MB range overlapping */ 2686 start = addr; 2687 last = addr; 2688 } 2689 2690 prange = svm_range_new(&p->svms, start, last, true); 2691 if (!prange) { 2692 pr_debug("Failed to create prange in address [0x%llx]\n", addr); 2693 return NULL; 2694 } 2695 if (kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx)) { 2696 pr_debug("failed to get gpuid from kgd\n"); 2697 svm_range_free(prange, true); 2698 return NULL; 2699 } 2700 2701 if (is_heap_stack) 2702 prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM; 2703 2704 svm_range_add_to_svms(prange); 2705 svm_range_add_notifier_locked(mm, prange); 2706 2707 return prange; 2708 } 2709 2710 /* svm_range_skip_recover - decide if prange can be recovered 2711 * @prange: svm range structure 2712 * 2713 * GPU vm retry fault handle skip recover the range for cases: 2714 * 1. prange is on deferred list to be removed after unmap, it is stale fault, 2715 * deferred list work will drain the stale fault before free the prange. 2716 * 2. prange is on deferred list to add interval notifier after split, or 2717 * 3. prange is child range, it is split from parent prange, recover later 2718 * after interval notifier is added. 
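 *
 * In all of these cases the fault is simply dropped here: stale faults are
 * drained by the deferred list work before the range is freed, and faults on
 * not-yet-added ranges are expected to recur and be recovered once the range
 * is in place.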
2719 *
2720 * Return: true to skip recover, false to recover
2721 */
2722 static bool svm_range_skip_recover(struct svm_range *prange)
2723 {
2724 struct svm_range_list *svms = prange->svms;
2725
2726 spin_lock(&svms->deferred_list_lock);
2727 if (list_empty(&prange->deferred_list) &&
2728 list_empty(&prange->child_list)) {
2729 spin_unlock(&svms->deferred_list_lock);
2730 return false;
2731 }
2732 spin_unlock(&svms->deferred_list_lock);
2733
2734 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2735 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
2736 svms, prange, prange->start, prange->last);
2737 return true;
2738 }
2739 if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
2740 prange->work_item.op == SVM_OP_ADD_RANGE) {
2741 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
2742 svms, prange, prange->start, prange->last);
2743 return true;
2744 }
2745 return false;
2746 }
2747
2748 static void
2749 svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
2750 int32_t gpuidx)
2751 {
2752 struct kfd_process_device *pdd;
2753
2754 /* fault is on different page of same range
2755 * or fault is skipped to recover later
2756 * or fault is on invalid virtual address
2757 */
2758 if (gpuidx == MAX_GPU_INSTANCE) {
2759 uint32_t gpuid;
2760 int r;
2761
2762 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx);
2763 if (r < 0)
2764 return;
2765 }
2766
2767 /* fault was recovered,
2768 * or it cannot be recovered because the GPU has no access to the range
2769 */
2770 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
2771 if (pdd)
2772 WRITE_ONCE(pdd->faults, pdd->faults + 1);
2773 }
2774
2775 static bool
2776 svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
2777 {
2778 unsigned long requested = VM_READ;
2779
2780 if (write_fault)
2781 requested |= VM_WRITE;
2782
2783 pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
2784 vma->vm_flags);
2785 return (vma->vm_flags & requested) == requested;
2786 }
2787
2788 int
2789 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
2790 uint64_t addr, bool write_fault)
2791 {
2792 struct mm_struct *mm = NULL;
2793 struct svm_range_list *svms;
2794 struct svm_range *prange;
2795 struct kfd_process *p;
2796 ktime_t timestamp = ktime_get_boottime();
2797 int32_t best_loc;
2798 int32_t gpuidx = MAX_GPU_INSTANCE;
2799 bool write_locked = false;
2800 struct vm_area_struct *vma;
2801 bool migration = false;
2802 int r = 0;
2803
2804 if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
2805 pr_debug("device does not support SVM\n");
2806 return -EFAULT;
2807 }
2808
2809 p = kfd_lookup_process_by_pasid(pasid);
2810 if (!p) {
2811 pr_debug("kfd process not found, pasid 0x%x\n", pasid);
2812 return 0;
2813 }
2814 svms = &p->svms;
2815
2816 pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
2817
2818 if (atomic_read(&svms->drain_pagefaults)) {
2819 pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
2820 r = 0;
2821 goto out;
2822 }
2823
2824 if (!p->xnack_enabled) {
2825 pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
2826 r = -EFAULT;
2827 goto out;
2828 }
2829
2830 /* p->lead_thread is available as kfd_process_wq_release flushes the work
2831 * before releasing the task ref.
2832 */ 2833 mm = get_task_mm(p->lead_thread); 2834 if (!mm) { 2835 pr_debug("svms 0x%p failed to get mm\n", svms); 2836 r = 0; 2837 goto out; 2838 } 2839 2840 mmap_read_lock(mm); 2841 retry_write_locked: 2842 mutex_lock(&svms->lock); 2843 prange = svm_range_from_addr(svms, addr, NULL); 2844 if (!prange) { 2845 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n", 2846 svms, addr); 2847 if (!write_locked) { 2848 /* Need the write lock to create new range with MMU notifier. 2849 * Also flush pending deferred work to make sure the interval 2850 * tree is up to date before we add a new range 2851 */ 2852 mutex_unlock(&svms->lock); 2853 mmap_read_unlock(mm); 2854 mmap_write_lock(mm); 2855 write_locked = true; 2856 goto retry_write_locked; 2857 } 2858 prange = svm_range_create_unregistered_range(adev, p, mm, addr); 2859 if (!prange) { 2860 pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n", 2861 svms, addr); 2862 mmap_write_downgrade(mm); 2863 r = -EFAULT; 2864 goto out_unlock_svms; 2865 } 2866 } 2867 if (write_locked) 2868 mmap_write_downgrade(mm); 2869 2870 mutex_lock(&prange->migrate_mutex); 2871 2872 if (svm_range_skip_recover(prange)) { 2873 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2874 r = 0; 2875 goto out_unlock_range; 2876 } 2877 2878 /* skip duplicate vm fault on different pages of same range */ 2879 if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp, 2880 AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) { 2881 pr_debug("svms 0x%p [0x%lx %lx] already restored\n", 2882 svms, prange->start, prange->last); 2883 r = 0; 2884 goto out_unlock_range; 2885 } 2886 2887 /* __do_munmap removed VMA, return success as we are handling stale 2888 * retry fault. 2889 */ 2890 vma = find_vma(mm, addr << PAGE_SHIFT); 2891 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { 2892 pr_debug("address 0x%llx VMA is removed\n", addr); 2893 r = 0; 2894 goto out_unlock_range; 2895 } 2896 2897 if (!svm_fault_allowed(vma, write_fault)) { 2898 pr_debug("fault addr 0x%llx no %s permission\n", addr, 2899 write_fault ? 
"write" : "read"); 2900 r = -EPERM; 2901 goto out_unlock_range; 2902 } 2903 2904 best_loc = svm_range_best_restore_location(prange, adev, &gpuidx); 2905 if (best_loc == -1) { 2906 pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n", 2907 svms, prange->start, prange->last); 2908 r = -EACCES; 2909 goto out_unlock_range; 2910 } 2911 2912 pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n", 2913 svms, prange->start, prange->last, best_loc, 2914 prange->actual_loc); 2915 2916 kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr, 2917 write_fault, timestamp); 2918 2919 if (prange->actual_loc != best_loc) { 2920 migration = true; 2921 if (best_loc) { 2922 r = svm_migrate_to_vram(prange, best_loc, mm, 2923 KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); 2924 if (r) { 2925 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n", 2926 r, addr); 2927 /* Fallback to system memory if migration to 2928 * VRAM failed 2929 */ 2930 if (prange->actual_loc) 2931 r = svm_migrate_vram_to_ram(prange, mm, 2932 KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, 2933 NULL); 2934 else 2935 r = 0; 2936 } 2937 } else { 2938 r = svm_migrate_vram_to_ram(prange, mm, 2939 KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, 2940 NULL); 2941 } 2942 if (r) { 2943 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", 2944 r, svms, prange->start, prange->last); 2945 goto out_unlock_range; 2946 } 2947 } 2948 2949 r = svm_range_validate_and_map(mm, prange, gpuidx, false, false, false); 2950 if (r) 2951 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", 2952 r, svms, prange->start, prange->last); 2953 2954 kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr, 2955 migration); 2956 2957 out_unlock_range: 2958 mutex_unlock(&prange->migrate_mutex); 2959 out_unlock_svms: 2960 mutex_unlock(&svms->lock); 2961 mmap_read_unlock(mm); 2962 2963 svm_range_count_fault(adev, p, gpuidx); 2964 2965 mmput(mm); 2966 out: 2967 kfd_unref_process(p); 2968 2969 if (r == -EAGAIN) { 2970 pr_debug("recover vm fault later\n"); 2971 amdgpu_gmc_filter_faults_remove(adev, addr, pasid); 2972 r = 0; 2973 } 2974 return r; 2975 } 2976 2977 int 2978 svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled) 2979 { 2980 struct svm_range *prange, *pchild; 2981 uint64_t reserved_size = 0; 2982 uint64_t size; 2983 int r = 0; 2984 2985 pr_debug("switching xnack from %d to %d\n", p->xnack_enabled, xnack_enabled); 2986 2987 mutex_lock(&p->svms.lock); 2988 2989 list_for_each_entry(prange, &p->svms.list, list) { 2990 svm_range_lock(prange); 2991 list_for_each_entry(pchild, &prange->child_list, child_list) { 2992 size = (pchild->last - pchild->start + 1) << PAGE_SHIFT; 2993 if (xnack_enabled) { 2994 amdgpu_amdkfd_unreserve_mem_limit(NULL, size, 2995 KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); 2996 } else { 2997 r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, 2998 KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); 2999 if (r) 3000 goto out_unlock; 3001 reserved_size += size; 3002 } 3003 } 3004 3005 size = (prange->last - prange->start + 1) << PAGE_SHIFT; 3006 if (xnack_enabled) { 3007 amdgpu_amdkfd_unreserve_mem_limit(NULL, size, 3008 KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); 3009 } else { 3010 r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, 3011 KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); 3012 if (r) 3013 goto out_unlock; 3014 reserved_size += size; 3015 } 3016 out_unlock: 3017 svm_range_unlock(prange); 3018 if (r) 3019 break; 3020 } 3021 3022 if (r) 3023 amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size, 3024 
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); 3025 else 3026 /* Change xnack mode must be inside svms lock, to avoid race with 3027 * svm_range_deferred_list_work unreserve memory in parallel. 3028 */ 3029 p->xnack_enabled = xnack_enabled; 3030 3031 mutex_unlock(&p->svms.lock); 3032 return r; 3033 } 3034 3035 void svm_range_list_fini(struct kfd_process *p) 3036 { 3037 struct svm_range *prange; 3038 struct svm_range *next; 3039 3040 pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); 3041 3042 cancel_delayed_work_sync(&p->svms.restore_work); 3043 3044 /* Ensure list work is finished before process is destroyed */ 3045 flush_work(&p->svms.deferred_list_work); 3046 3047 /* 3048 * Ensure no retry fault comes in afterwards, as page fault handler will 3049 * not find kfd process and take mm lock to recover fault. 3050 */ 3051 atomic_inc(&p->svms.drain_pagefaults); 3052 svm_range_drain_retry_fault(&p->svms); 3053 3054 list_for_each_entry_safe(prange, next, &p->svms.list, list) { 3055 svm_range_unlink(prange); 3056 svm_range_remove_notifier(prange); 3057 svm_range_free(prange, true); 3058 } 3059 3060 mutex_destroy(&p->svms.lock); 3061 3062 pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms); 3063 } 3064 3065 int svm_range_list_init(struct kfd_process *p) 3066 { 3067 struct svm_range_list *svms = &p->svms; 3068 int i; 3069 3070 svms->objects = RB_ROOT_CACHED; 3071 mutex_init(&svms->lock); 3072 INIT_LIST_HEAD(&svms->list); 3073 atomic_set(&svms->evicted_ranges, 0); 3074 atomic_set(&svms->drain_pagefaults, 0); 3075 INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work); 3076 INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work); 3077 INIT_LIST_HEAD(&svms->deferred_range_list); 3078 INIT_LIST_HEAD(&svms->criu_svm_metadata_list); 3079 spin_lock_init(&svms->deferred_list_lock); 3080 3081 for (i = 0; i < p->n_pdds; i++) 3082 if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev)) 3083 bitmap_set(svms->bitmap_supported, i, 1); 3084 3085 return 0; 3086 } 3087 3088 /** 3089 * svm_range_check_vm - check if virtual address range mapped already 3090 * @p: current kfd_process 3091 * @start: range start address, in pages 3092 * @last: range last address, in pages 3093 * @bo_s: mapping start address in pages if address range already mapped 3094 * @bo_l: mapping last address in pages if address range already mapped 3095 * 3096 * The purpose is to avoid virtual address ranges already allocated by 3097 * kfd_ioctl_alloc_memory_of_gpu ioctl. 3098 * It looks for each pdd in the kfd_process. 3099 * 3100 * Context: Process context 3101 * 3102 * Return 0 - OK, if the range is not mapped. 3103 * Otherwise error code: 3104 * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu 3105 * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by 3106 * a signal. Release all buffer reservations and return to user-space. 
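 *
 * Each per-device GPUVM is checked by reserving its root BO and searching the
 * vm->va interval tree; the bounds of the first conflicting mapping are
 * returned through @bo_s and @bo_l.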
3107 */
3108 static int
3109 svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
3110 uint64_t *bo_s, uint64_t *bo_l)
3111 {
3112 struct amdgpu_bo_va_mapping *mapping;
3113 struct interval_tree_node *node;
3114 uint32_t i;
3115 int r;
3116
3117 for (i = 0; i < p->n_pdds; i++) {
3118 struct amdgpu_vm *vm;
3119
3120 if (!p->pdds[i]->drm_priv)
3121 continue;
3122
3123 vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
3124 r = amdgpu_bo_reserve(vm->root.bo, false);
3125 if (r)
3126 return r;
3127
3128 node = interval_tree_iter_first(&vm->va, start, last);
3129 if (node) {
3130 pr_debug("range [0x%llx 0x%llx] already TTM mapped\n",
3131 start, last);
3132 mapping = container_of((struct rb_node *)node,
3133 struct amdgpu_bo_va_mapping, rb);
3134 if (bo_s && bo_l) {
3135 *bo_s = mapping->start;
3136 *bo_l = mapping->last;
3137 }
3138 amdgpu_bo_unreserve(vm->root.bo);
3139 return -EADDRINUSE;
3140 }
3141 amdgpu_bo_unreserve(vm->root.bo);
3142 }
3143
3144 return 0;
3145 }
3146
3147 /**
3148 * svm_range_is_valid - check if virtual address range is valid
3149 * @p: current kfd_process
3150 * @start: range start address, in pages
3151 * @size: range size, in pages
3152 *
3153 * A valid virtual address range must be fully covered by VMAs that are not device mappings and must not overlap existing GPUVM mappings
3154 *
3155 * Context: Process context
3156 *
3157 * Return:
3158 * 0 - OK, otherwise error code
3159 */
3160 static int
3161 svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size)
3162 {
3163 const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
3164 struct vm_area_struct *vma;
3165 unsigned long end;
3166 unsigned long start_unchg = start;
3167
3168 start <<= PAGE_SHIFT;
3169 end = start + (size << PAGE_SHIFT);
3170 do {
3171 vma = find_vma(p->mm, start);
3172 if (!vma || start < vma->vm_start ||
3173 (vma->vm_flags & device_vma))
3174 return -EFAULT;
3175 start = min(end, vma->vm_end);
3176 } while (start < end);
3177
3178 return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL,
3179 NULL);
3180 }
3181
3182 /**
3183 * svm_range_best_prefetch_location - decide the best prefetch location
3184 * @prange: svm range structure
3185 *
3186 * For xnack off:
3187 * If the range maps to a single GPU, the best prefetch location is prefetch_loc,
3188 * which can be the CPU or a GPU.
3189 *
3190 * If the range is ACCESS or ACCESS_IN_PLACE by multiple GPUs, the best prefetch
3191 * location is the prefetch_loc GPU only if those GPUs are all in the same XGMI hive,
3192 * otherwise the best prefetch location is always the CPU, because a GPU cannot
3193 * coherently map the VRAM of other GPUs, even with a large-BAR PCIe connection.
3194 *
3195 * For xnack on:
3196 * If the range is not ACCESS_IN_PLACE by multiple GPUs, the best prefetch location
3197 * is prefetch_loc; access from another GPU will generate a vm fault and trigger migration.
3198 *
3199 * If the range is ACCESS_IN_PLACE by multiple GPUs, the best prefetch location is the
3200 * prefetch_loc GPU only if those GPUs are all in the same XGMI hive, otherwise the best
3201 * prefetch location is always the CPU.
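 *
 * For example (hypothetical topology): with xnack off, a range that is ACCESS
 * on two GPUs connected over PCIe rather than XGMI gets the CPU (0) as best
 * prefetch location, regardless of prefetch_loc.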
3202 *
3203 * Context: Process context
3204 *
3205 * Return:
3206 * 0 for CPU, otherwise the GPU id of the best prefetch location
3207 */
3208 static uint32_t
3209 svm_range_best_prefetch_location(struct svm_range *prange)
3210 {
3211 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
3212 uint32_t best_loc = prange->prefetch_loc;
3213 struct kfd_process_device *pdd;
3214 struct amdgpu_device *bo_adev;
3215 struct kfd_process *p;
3216 uint32_t gpuidx;
3217
3218 p = container_of(prange->svms, struct kfd_process, svms);
3219
3220 if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
3221 goto out;
3222
3223 bo_adev = svm_range_get_adev_by_id(prange, best_loc);
3224 if (!bo_adev) {
3225 WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc);
3226 best_loc = 0;
3227 goto out;
3228 }
3229
3230 if (p->xnack_enabled)
3231 bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
3232 else
3233 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
3234 MAX_GPU_INSTANCE);
3235
3236 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
3237 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
3238 if (!pdd) {
3239 pr_debug("failed to get device by idx 0x%x\n", gpuidx);
3240 continue;
3241 }
3242
3243 if (pdd->dev->adev == bo_adev)
3244 continue;
3245
3246 if (!amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
3247 best_loc = 0;
3248 break;
3249 }
3250 }
3251
3252 out:
3253 pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
3254 p->xnack_enabled, &p->svms, prange->start, prange->last,
3255 best_loc);
3256
3257 return best_loc;
3258 }
3259
3260 /* svm_range_trigger_migration - start page migration if prefetch loc changed
3261 * @mm: current process mm_struct
3262 * @prange: svm range structure
3263 * @migrated: output, true if migration is triggered
3264 *
3265 * If the range prefetch_loc is a GPU and the actual loc is CPU (0), migrate the range
3266 * from ram to vram.
3267 * If the range prefetch_loc is CPU (0) and the actual loc is a GPU, migrate the range
3268 * from vram to ram.
3269 *
3270 * If GPU vm fault retry is not enabled, migration interacts with the MMU notifier
3271 * and the restore work:
3272 * 1. migrate_vma_setup invalidates pages, the MMU notifier callback svm_range_evict
3273 * stops all queues and schedules the restore work
3274 * 2. svm_range_restore_work waits for the migration to finish because
3275 * a. svm_range_validate_vram takes prange->migrate_mutex
3276 * b. svm_range_validate_ram HMM get pages waits for the CPU fault handler to return
3277 * 3. the restore work updates the GPU mappings and resumes all queues.
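 *
 * *migrated is set only when a migration call actually succeeded;
 * svm_range_set_attr uses it to decide whether to validate and map
 * immediately or to leave the remapping to the restore work.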
3278 * 3279 * Context: Process context 3280 * 3281 * Return: 3282 * 0 - OK, otherwise - error code of migration 3283 */ 3284 static int 3285 svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, 3286 bool *migrated) 3287 { 3288 uint32_t best_loc; 3289 int r = 0; 3290 3291 *migrated = false; 3292 best_loc = svm_range_best_prefetch_location(prange); 3293 3294 if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3295 best_loc == prange->actual_loc) 3296 return 0; 3297 3298 if (!best_loc) { 3299 r = svm_migrate_vram_to_ram(prange, mm, 3300 KFD_MIGRATE_TRIGGER_PREFETCH, NULL); 3301 *migrated = !r; 3302 return r; 3303 } 3304 3305 r = svm_migrate_to_vram(prange, best_loc, mm, KFD_MIGRATE_TRIGGER_PREFETCH); 3306 *migrated = !r; 3307 3308 return r; 3309 } 3310 3311 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) 3312 { 3313 if (!fence) 3314 return -EINVAL; 3315 3316 if (dma_fence_is_signaled(&fence->base)) 3317 return 0; 3318 3319 if (fence->svm_bo) { 3320 WRITE_ONCE(fence->svm_bo->evicting, 1); 3321 schedule_work(&fence->svm_bo->eviction_work); 3322 } 3323 3324 return 0; 3325 } 3326 3327 static void svm_range_evict_svm_bo_worker(struct work_struct *work) 3328 { 3329 struct svm_range_bo *svm_bo; 3330 struct mm_struct *mm; 3331 int r = 0; 3332 3333 svm_bo = container_of(work, struct svm_range_bo, eviction_work); 3334 if (!svm_bo_ref_unless_zero(svm_bo)) 3335 return; /* svm_bo was freed while eviction was pending */ 3336 3337 if (mmget_not_zero(svm_bo->eviction_fence->mm)) { 3338 mm = svm_bo->eviction_fence->mm; 3339 } else { 3340 svm_range_bo_unref(svm_bo); 3341 return; 3342 } 3343 3344 mmap_read_lock(mm); 3345 spin_lock(&svm_bo->list_lock); 3346 while (!list_empty(&svm_bo->range_list) && !r) { 3347 struct svm_range *prange = 3348 list_first_entry(&svm_bo->range_list, 3349 struct svm_range, svm_bo_list); 3350 int retries = 3; 3351 3352 list_del_init(&prange->svm_bo_list); 3353 spin_unlock(&svm_bo->list_lock); 3354 3355 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 3356 prange->start, prange->last); 3357 3358 mutex_lock(&prange->migrate_mutex); 3359 do { 3360 r = svm_migrate_vram_to_ram(prange, mm, 3361 KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL); 3362 } while (!r && prange->actual_loc && --retries); 3363 3364 if (!r && prange->actual_loc) 3365 pr_info_once("Migration failed during eviction"); 3366 3367 if (!prange->actual_loc) { 3368 mutex_lock(&prange->lock); 3369 prange->svm_bo = NULL; 3370 mutex_unlock(&prange->lock); 3371 } 3372 mutex_unlock(&prange->migrate_mutex); 3373 3374 spin_lock(&svm_bo->list_lock); 3375 } 3376 spin_unlock(&svm_bo->list_lock); 3377 mmap_read_unlock(mm); 3378 mmput(mm); 3379 3380 dma_fence_signal(&svm_bo->eviction_fence->base); 3381 3382 /* This is the last reference to svm_bo, after svm_range_vram_node_free 3383 * has been called in svm_migrate_vram_to_ram 3384 */ 3385 WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n"); 3386 svm_range_bo_unref(svm_bo); 3387 } 3388 3389 static int 3390 svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, 3391 uint64_t start, uint64_t size, uint32_t nattr, 3392 struct kfd_ioctl_svm_attribute *attrs) 3393 { 3394 struct amdkfd_process_info *process_info = p->kgd_process_info; 3395 struct list_head update_list; 3396 struct list_head insert_list; 3397 struct list_head remove_list; 3398 struct svm_range_list *svms; 3399 struct svm_range *prange; 3400 struct svm_range *next; 3401 bool update_mapping = false; 3402 bool flush_tlb; 3403 int r = 0; 3404 3405 
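/*
 * Locking below: process_info->lock is taken first, then the mmap
 * write lock (acquired inside svm_range_list_lock_and_flush_work),
 * then svms->lock. The mmap lock is downgraded to a read lock before
 * migrations and GPU mappings are triggered.
 */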
pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", 3406 p->pasid, &p->svms, start, start + size - 1, size); 3407 3408 r = svm_range_check_attr(p, nattr, attrs); 3409 if (r) 3410 return r; 3411 3412 svms = &p->svms; 3413 3414 mutex_lock(&process_info->lock); 3415 3416 svm_range_list_lock_and_flush_work(svms, mm); 3417 3418 r = svm_range_is_valid(p, start, size); 3419 if (r) { 3420 pr_debug("invalid range r=%d\n", r); 3421 mmap_write_unlock(mm); 3422 goto out; 3423 } 3424 3425 mutex_lock(&svms->lock); 3426 3427 /* Add new range and split existing ranges as needed */ 3428 r = svm_range_add(p, start, size, nattr, attrs, &update_list, 3429 &insert_list, &remove_list); 3430 if (r) { 3431 mutex_unlock(&svms->lock); 3432 mmap_write_unlock(mm); 3433 goto out; 3434 } 3435 /* Apply changes as a transaction */ 3436 list_for_each_entry_safe(prange, next, &insert_list, list) { 3437 svm_range_add_to_svms(prange); 3438 svm_range_add_notifier_locked(mm, prange); 3439 } 3440 list_for_each_entry(prange, &update_list, update_list) { 3441 svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping); 3442 /* TODO: unmap ranges from GPU that lost access */ 3443 } 3444 list_for_each_entry_safe(prange, next, &remove_list, update_list) { 3445 pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", 3446 prange->svms, prange, prange->start, 3447 prange->last); 3448 svm_range_unlink(prange); 3449 svm_range_remove_notifier(prange); 3450 svm_range_free(prange, false); 3451 } 3452 3453 mmap_write_downgrade(mm); 3454 /* Trigger migrations and revalidate and map to GPUs as needed. If 3455 * this fails we may be left with partially completed actions. There 3456 * is no clean way of rolling back to the previous state in such a 3457 * case because the rollback wouldn't be guaranteed to work either. 
3458 */ 3459 list_for_each_entry(prange, &update_list, update_list) { 3460 bool migrated; 3461 3462 mutex_lock(&prange->migrate_mutex); 3463 3464 r = svm_range_trigger_migration(mm, prange, &migrated); 3465 if (r) 3466 goto out_unlock_range; 3467 3468 if (migrated && (!p->xnack_enabled || 3469 (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) && 3470 prange->mapped_to_gpu) { 3471 pr_debug("restore_work will update mappings of GPUs\n"); 3472 mutex_unlock(&prange->migrate_mutex); 3473 continue; 3474 } 3475 3476 if (!migrated && !update_mapping) { 3477 mutex_unlock(&prange->migrate_mutex); 3478 continue; 3479 } 3480 3481 flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu; 3482 3483 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 3484 true, true, flush_tlb); 3485 if (r) 3486 pr_debug("failed %d to map svm range\n", r); 3487 3488 out_unlock_range: 3489 mutex_unlock(&prange->migrate_mutex); 3490 if (r) 3491 break; 3492 } 3493 3494 svm_range_debug_dump(svms); 3495 3496 mutex_unlock(&svms->lock); 3497 mmap_read_unlock(mm); 3498 out: 3499 mutex_unlock(&process_info->lock); 3500 3501 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid, 3502 &p->svms, start, start + size - 1, r); 3503 3504 return r; 3505 } 3506 3507 static int 3508 svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm, 3509 uint64_t start, uint64_t size, uint32_t nattr, 3510 struct kfd_ioctl_svm_attribute *attrs) 3511 { 3512 DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); 3513 DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); 3514 bool get_preferred_loc = false; 3515 bool get_prefetch_loc = false; 3516 bool get_granularity = false; 3517 bool get_accessible = false; 3518 bool get_flags = false; 3519 uint64_t last = start + size - 1UL; 3520 uint8_t granularity = 0xff; 3521 struct interval_tree_node *node; 3522 struct svm_range_list *svms; 3523 struct svm_range *prange; 3524 uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3525 uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3526 uint32_t flags_and = 0xffffffff; 3527 uint32_t flags_or = 0; 3528 int gpuidx; 3529 uint32_t i; 3530 int r = 0; 3531 3532 pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start, 3533 start + size - 1, nattr); 3534 3535 /* Flush pending deferred work to avoid racing with deferred actions from 3536 * previous memory map changes (e.g. munmap). Concurrent memory map changes 3537 * can still race with get_attr because we don't hold the mmap lock. But that 3538 * would be a race condition in the application anyway, and undefined 3539 * behaviour is acceptable in that case. 
3540 */ 3541 flush_work(&p->svms.deferred_list_work); 3542 3543 mmap_read_lock(mm); 3544 r = svm_range_is_valid(p, start, size); 3545 mmap_read_unlock(mm); 3546 if (r) { 3547 pr_debug("invalid range r=%d\n", r); 3548 return r; 3549 } 3550 3551 for (i = 0; i < nattr; i++) { 3552 switch (attrs[i].type) { 3553 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 3554 get_preferred_loc = true; 3555 break; 3556 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 3557 get_prefetch_loc = true; 3558 break; 3559 case KFD_IOCTL_SVM_ATTR_ACCESS: 3560 get_accessible = true; 3561 break; 3562 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 3563 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 3564 get_flags = true; 3565 break; 3566 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 3567 get_granularity = true; 3568 break; 3569 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 3570 case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 3571 fallthrough; 3572 default: 3573 pr_debug("get invalid attr type 0x%x\n", attrs[i].type); 3574 return -EINVAL; 3575 } 3576 } 3577 3578 svms = &p->svms; 3579 3580 mutex_lock(&svms->lock); 3581 3582 node = interval_tree_iter_first(&svms->objects, start, last); 3583 if (!node) { 3584 pr_debug("range attrs not found return default values\n"); 3585 svm_range_set_default_attributes(&location, &prefetch_loc, 3586 &granularity, &flags_and); 3587 flags_or = flags_and; 3588 if (p->xnack_enabled) 3589 bitmap_copy(bitmap_access, svms->bitmap_supported, 3590 MAX_GPU_INSTANCE); 3591 else 3592 bitmap_zero(bitmap_access, MAX_GPU_INSTANCE); 3593 bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE); 3594 goto fill_values; 3595 } 3596 bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); 3597 bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE); 3598 3599 while (node) { 3600 struct interval_tree_node *next; 3601 3602 prange = container_of(node, struct svm_range, it_node); 3603 next = interval_tree_iter_next(node, start, last); 3604 3605 if (get_preferred_loc) { 3606 if (prange->preferred_loc == 3607 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3608 (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3609 location != prange->preferred_loc)) { 3610 location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3611 get_preferred_loc = false; 3612 } else { 3613 location = prange->preferred_loc; 3614 } 3615 } 3616 if (get_prefetch_loc) { 3617 if (prange->prefetch_loc == 3618 KFD_IOCTL_SVM_LOCATION_UNDEFINED || 3619 (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 3620 prefetch_loc != prange->prefetch_loc)) { 3621 prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 3622 get_prefetch_loc = false; 3623 } else { 3624 prefetch_loc = prange->prefetch_loc; 3625 } 3626 } 3627 if (get_accessible) { 3628 bitmap_and(bitmap_access, bitmap_access, 3629 prange->bitmap_access, MAX_GPU_INSTANCE); 3630 bitmap_and(bitmap_aip, bitmap_aip, 3631 prange->bitmap_aip, MAX_GPU_INSTANCE); 3632 } 3633 if (get_flags) { 3634 flags_and &= prange->flags; 3635 flags_or |= prange->flags; 3636 } 3637 3638 if (get_granularity && prange->granularity < granularity) 3639 granularity = prange->granularity; 3640 3641 node = next; 3642 } 3643 fill_values: 3644 mutex_unlock(&svms->lock); 3645 3646 for (i = 0; i < nattr; i++) { 3647 switch (attrs[i].type) { 3648 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 3649 attrs[i].value = location; 3650 break; 3651 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 3652 attrs[i].value = prefetch_loc; 3653 break; 3654 case KFD_IOCTL_SVM_ATTR_ACCESS: 3655 gpuidx = kfd_process_gpuidx_from_gpuid(p, 3656 attrs[i].value); 3657 if (gpuidx < 0) { 3658 pr_debug("invalid gpuid %x\n", attrs[i].value); 3659 return -EINVAL; 3660 } 
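/* Report the effective access for this GPU by rewriting the
 * attribute type in place (ACCESS, ACCESS_IN_PLACE or NO_ACCESS),
 * based on the bitmaps accumulated above.
 */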
3661 if (test_bit(gpuidx, bitmap_access)) 3662 attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS; 3663 else if (test_bit(gpuidx, bitmap_aip)) 3664 attrs[i].type = 3665 KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE; 3666 else 3667 attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS; 3668 break; 3669 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 3670 attrs[i].value = flags_and; 3671 break; 3672 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 3673 attrs[i].value = ~flags_or; 3674 break; 3675 case KFD_IOCTL_SVM_ATTR_GRANULARITY: 3676 attrs[i].value = (uint32_t)granularity; 3677 break; 3678 } 3679 } 3680 3681 return 0; 3682 } 3683 3684 int kfd_criu_resume_svm(struct kfd_process *p) 3685 { 3686 struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL; 3687 int nattr_common = 4, nattr_accessibility = 1; 3688 struct criu_svm_metadata *criu_svm_md = NULL; 3689 struct svm_range_list *svms = &p->svms; 3690 struct criu_svm_metadata *next = NULL; 3691 uint32_t set_flags = 0xffffffff; 3692 int i, j, num_attrs, ret = 0; 3693 uint64_t set_attr_size; 3694 struct mm_struct *mm; 3695 3696 if (list_empty(&svms->criu_svm_metadata_list)) { 3697 pr_debug("No SVM data from CRIU restore stage 2\n"); 3698 return ret; 3699 } 3700 3701 mm = get_task_mm(p->lead_thread); 3702 if (!mm) { 3703 pr_err("failed to get mm for the target process\n"); 3704 return -ESRCH; 3705 } 3706 3707 num_attrs = nattr_common + (nattr_accessibility * p->n_pdds); 3708 3709 i = j = 0; 3710 list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) { 3711 pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n", 3712 i, criu_svm_md->data.start_addr, criu_svm_md->data.size); 3713 3714 for (j = 0; j < num_attrs; j++) { 3715 pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x\ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n", 3716 i, j, criu_svm_md->data.attrs[j].type, 3717 i, j, criu_svm_md->data.attrs[j].value); 3718 switch (criu_svm_md->data.attrs[j].type) { 3719 /* During Checkpoint operation, the query for 3720 * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC attribute might 3721 * return KFD_IOCTL_SVM_LOCATION_UNDEFINED if they were 3722 * not used by the range which was checkpointed. Care 3723 * must be taken to not restore with an invalid value 3724 * otherwise the gpuidx value will be invalid and 3725 * set_attr would eventually fail so just replace those 3726 * with another dummy attribute such as 3727 * KFD_IOCTL_SVM_ATTR_SET_FLAGS. 
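 * The replacement uses value 0, so as a dummy attribute it is not
 * expected to alter any flags on the restored range.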
3728 */ 3729 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 3730 if (criu_svm_md->data.attrs[j].value == 3731 KFD_IOCTL_SVM_LOCATION_UNDEFINED) { 3732 criu_svm_md->data.attrs[j].type = 3733 KFD_IOCTL_SVM_ATTR_SET_FLAGS; 3734 criu_svm_md->data.attrs[j].value = 0; 3735 } 3736 break; 3737 case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 3738 set_flags = criu_svm_md->data.attrs[j].value; 3739 break; 3740 default: 3741 break; 3742 } 3743 } 3744 3745 /* CLR_FLAGS is not available via get_attr during checkpoint but 3746 * it needs to be inserted before restoring the ranges so 3747 * allocate extra space for it before calling set_attr 3748 */ 3749 set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) * 3750 (num_attrs + 1); 3751 set_attr_new = krealloc(set_attr, set_attr_size, 3752 GFP_KERNEL); 3753 if (!set_attr_new) { 3754 ret = -ENOMEM; 3755 goto exit; 3756 } 3757 set_attr = set_attr_new; 3758 3759 memcpy(set_attr, criu_svm_md->data.attrs, num_attrs * 3760 sizeof(struct kfd_ioctl_svm_attribute)); 3761 set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS; 3762 set_attr[num_attrs].value = ~set_flags; 3763 3764 ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr, 3765 criu_svm_md->data.size, num_attrs + 1, 3766 set_attr); 3767 if (ret) { 3768 pr_err("CRIU: failed to set range attributes\n"); 3769 goto exit; 3770 } 3771 3772 i++; 3773 } 3774 exit: 3775 kfree(set_attr); 3776 list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) { 3777 pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n", 3778 criu_svm_md->data.start_addr); 3779 kfree(criu_svm_md); 3780 } 3781 3782 mmput(mm); 3783 return ret; 3784 3785 } 3786 3787 int kfd_criu_restore_svm(struct kfd_process *p, 3788 uint8_t __user *user_priv_ptr, 3789 uint64_t *priv_data_offset, 3790 uint64_t max_priv_data_size) 3791 { 3792 uint64_t svm_priv_data_size, svm_object_md_size, svm_attrs_size; 3793 int nattr_common = 4, nattr_accessibility = 1; 3794 struct criu_svm_metadata *criu_svm_md = NULL; 3795 struct svm_range_list *svms = &p->svms; 3796 uint32_t num_devices; 3797 int ret = 0; 3798 3799 num_devices = p->n_pdds; 3800 /* Handle one SVM range object at a time, also the number of gpus are 3801 * assumed to be same on the restore node, checking must be done while 3802 * evaluating the topology earlier 3803 */ 3804 3805 svm_attrs_size = sizeof(struct kfd_ioctl_svm_attribute) * 3806 (nattr_common + nattr_accessibility * num_devices); 3807 svm_object_md_size = sizeof(struct criu_svm_metadata) + svm_attrs_size; 3808 3809 svm_priv_data_size = sizeof(struct kfd_criu_svm_range_priv_data) + 3810 svm_attrs_size; 3811 3812 criu_svm_md = kzalloc(svm_object_md_size, GFP_KERNEL); 3813 if (!criu_svm_md) { 3814 pr_err("failed to allocate memory to store svm metadata\n"); 3815 return -ENOMEM; 3816 } 3817 if (*priv_data_offset + svm_priv_data_size > max_priv_data_size) { 3818 ret = -EINVAL; 3819 goto exit; 3820 } 3821 3822 ret = copy_from_user(&criu_svm_md->data, user_priv_ptr + *priv_data_offset, 3823 svm_priv_data_size); 3824 if (ret) { 3825 ret = -EFAULT; 3826 goto exit; 3827 } 3828 *priv_data_offset += svm_priv_data_size; 3829 3830 list_add_tail(&criu_svm_md->list, &svms->criu_svm_metadata_list); 3831 3832 return 0; 3833 3834 3835 exit: 3836 kfree(criu_svm_md); 3837 return ret; 3838 } 3839 3840 int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, 3841 uint64_t *svm_priv_data_size) 3842 { 3843 uint64_t total_size, accessibility_size, common_attr_size; 3844 int nattr_common = 4, nattr_accessibility = 1; 3845 int num_devices = 
int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
		       uint64_t *svm_priv_data_size)
{
	uint64_t total_size, accessibility_size, common_attr_size;
	int nattr_common = 4, nattr_accessibility = 1;
	int num_devices = p->n_pdds;
	struct svm_range_list *svms;
	struct svm_range *prange;
	uint32_t count = 0;

	*svm_priv_data_size = 0;

	svms = &p->svms;
	if (!svms)
		return -EINVAL;

	mutex_lock(&svms->lock);
	list_for_each_entry(prange, &svms->list, list) {
		pr_debug("prange: 0x%p start: 0x%lx\t npages: 0x%llx\t end: 0x%llx\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1);
		count++;
	}
	mutex_unlock(&svms->lock);

	*num_svm_ranges = count;
	/* Only the accessibility attributes need to be queried for each GPU
	 * individually; the remaining attributes span the entire process
	 * regardless of the GPU nodes. Of those remaining attributes,
	 * KFD_IOCTL_SVM_ATTR_CLR_FLAGS need not be saved.
	 *
	 * KFD_IOCTL_SVM_ATTR_PREFERRED_LOC
	 * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC
	 * KFD_IOCTL_SVM_ATTR_SET_FLAGS
	 * KFD_IOCTL_SVM_ATTR_GRANULARITY
	 *
	 * ** ACCESSIBILITY ATTRIBUTES **
	 * (Considered as one, type is altered during query, value is gpuid)
	 * KFD_IOCTL_SVM_ATTR_ACCESS
	 * KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE
	 * KFD_IOCTL_SVM_ATTR_NO_ACCESS
	 */
	if (*num_svm_ranges > 0) {
		common_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
			nattr_common;
		accessibility_size = sizeof(struct kfd_ioctl_svm_attribute) *
			nattr_accessibility * num_devices;

		total_size = sizeof(struct kfd_criu_svm_range_priv_data) +
			common_attr_size + accessibility_size;

		*svm_priv_data_size = *num_svm_ranges * total_size;
	}

	pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges,
		 *svm_priv_data_size);
	return 0;
}
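
/**
 * kfd_criu_checkpoint_svm - dump SVM range private data for CRIU checkpoint
 * @p: the kfd_process being checkpointed
 * @user_priv_data: user buffer receiving one record per SVM range
 * @priv_data_offset: current offset into @user_priv_data, advanced per range
 *
 * For each SVM range, query the common attributes and the per-GPU
 * accessibility attributes with svm_range_get_attr() and copy the resulting
 * kfd_criu_svm_range_priv_data record to user space.
 *
 * Return: 0 on success, negative errno on failure.
 */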
int kfd_criu_checkpoint_svm(struct kfd_process *p,
			    uint8_t __user *user_priv_data,
			    uint64_t *priv_data_offset)
{
	struct kfd_criu_svm_range_priv_data *svm_priv = NULL;
	struct kfd_ioctl_svm_attribute *query_attr = NULL;
	uint64_t svm_priv_data_size, query_attr_size = 0;
	int index, nattr_common = 4, ret = 0;
	struct svm_range_list *svms;
	int num_devices = p->n_pdds;
	struct svm_range *prange;
	struct mm_struct *mm;

	svms = &p->svms;
	if (!svms)
		return -EINVAL;

	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_err("failed to get mm for the target process\n");
		return -ESRCH;
	}

	query_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
				(nattr_common + num_devices);

	query_attr = kzalloc(query_attr_size, GFP_KERNEL);
	if (!query_attr) {
		ret = -ENOMEM;
		goto exit;
	}

	query_attr[0].type = KFD_IOCTL_SVM_ATTR_PREFERRED_LOC;
	query_attr[1].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC;
	query_attr[2].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS;
	query_attr[3].type = KFD_IOCTL_SVM_ATTR_GRANULARITY;

	for (index = 0; index < num_devices; index++) {
		struct kfd_process_device *pdd = p->pdds[index];

		query_attr[index + nattr_common].type =
			KFD_IOCTL_SVM_ATTR_ACCESS;
		query_attr[index + nattr_common].value = pdd->user_gpu_id;
	}

	svm_priv_data_size = sizeof(*svm_priv) + query_attr_size;

	svm_priv = kzalloc(svm_priv_data_size, GFP_KERNEL);
	if (!svm_priv) {
		ret = -ENOMEM;
		goto exit_query;
	}

	index = 0;
	list_for_each_entry(prange, &svms->list, list) {
		svm_priv->object_type = KFD_CRIU_OBJECT_TYPE_SVM_RANGE;
		svm_priv->start_addr = prange->start;
		svm_priv->size = prange->npages;
		memcpy(&svm_priv->attrs, query_attr, query_attr_size);
		pr_debug("CRIU: prange: 0x%p start: 0x%lx\t npages: 0x%llx end: 0x%llx\t size: 0x%llx\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1,
			 prange->npages * PAGE_SIZE);

		ret = svm_range_get_attr(p, mm, svm_priv->start_addr,
					 svm_priv->size,
					 (nattr_common + num_devices),
					 svm_priv->attrs);
		if (ret) {
			pr_err("CRIU: failed to obtain range attributes\n");
			goto exit_priv;
		}

		if (copy_to_user(user_priv_data + *priv_data_offset, svm_priv,
				 svm_priv_data_size)) {
			pr_err("Failed to copy svm priv to user\n");
			ret = -EFAULT;
			goto exit_priv;
		}

		*priv_data_offset += svm_priv_data_size;
	}

exit_priv:
	kfree(svm_priv);
exit_query:
	kfree(query_attr);
exit:
	mmput(mm);
	return ret;
}

/**
 * svm_ioctl - entry point for the KFD SVM attribute ioctl operations
 * @p: the kfd_process issuing the ioctl
 * @op: KFD_IOCTL_SVM_OP_SET_ATTR or KFD_IOCTL_SVM_OP_GET_ATTR
 * @start: start address of the range in bytes, converted to pages here
 * @size: size of the range in bytes, converted to pages here
 * @nattrs: number of entries in @attrs
 * @attrs: attributes to set or query
 *
 * Return: 0 on success, negative errno on failure.
 */
int
svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
	  uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
{
	struct mm_struct *mm = current->mm;
	int r;

	start >>= PAGE_SHIFT;
	size >>= PAGE_SHIFT;

	switch (op) {
	case KFD_IOCTL_SVM_OP_SET_ATTR:
		r = svm_range_set_attr(p, mm, start, size, nattrs, attrs);
		break;
	case KFD_IOCTL_SVM_OP_GET_ATTR:
		r = svm_range_get_attr(p, mm, start, size, nattrs, attrs);
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}
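
/*
 * Illustrative sketch (not part of the driver): user space reaches
 * svm_ioctl() through the AMDKFD character device. Assuming the uAPI in
 * include/uapi/linux/kfd_ioctl.h (struct kfd_ioctl_svm_args with a variable
 * length attrs[] array and the AMDKFD_IOC_SVM request), a SET_ATTR call
 * looks roughly like:
 *
 *	int kfd_fd = open("/dev/kfd", O_RDWR);
 *	size_t sz = sizeof(struct kfd_ioctl_svm_args) +
 *		    sizeof(struct kfd_ioctl_svm_attribute);
 *	struct kfd_ioctl_svm_args *args = calloc(1, sz);
 *
 *	args->start_addr = (uint64_t)buf;	// page-aligned VA of the range
 *	args->size = buf_size;			// range size in bytes
 *	args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
 *	args->nattr = 1;
 *	args->attrs[0].type = KFD_IOCTL_SVM_ATTR_GRANULARITY;
 *	args->attrs[0].value = 9;		// log2(pages) per migration granule
 *	ioctl(kfd_fd, AMDKFD_IOC_SVM, args);
 *
 * svm_ioctl() converts start/size from bytes to pages before dispatching to
 * svm_range_set_attr() or svm_range_get_attr(), as shown above.
 */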