1 /******************************************************************************* 2 Copyright (c) 2015-2022 NVIDIA Corporation 3 4 Permission is hereby granted, free of charge, to any person obtaining a copy 5 of this software and associated documentation files (the "Software"), to 6 deal in the Software without restriction, including without limitation the 7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 sell copies of the Software, and to permit persons to whom the Software is 9 furnished to do so, subject to the following conditions: 10 11 The above copyright notice and this permission notice shall be 12 included in all copies or substantial portions of the Software. 13 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 DEALINGS IN THE SOFTWARE. 21 22 *******************************************************************************/ 23 24 #include "uvm_global.h" 25 #include "uvm_channel.h" 26 #include "uvm_hal.h" 27 #include "uvm_push.h" 28 #include "uvm_test.h" 29 #include "uvm_test_rng.h" 30 #include "uvm_va_space.h" 31 #include "uvm_tracker.h" 32 #include "uvm_thread_context.h" 33 #include "uvm_gpu_semaphore.h" 34 #include "uvm_kvmalloc.h" 35 36 #define TEST_ORDERING_ITERS_PER_CHANNEL_TYPE_PER_GPU 1024 37 #define TEST_ORDERING_ITERS_PER_CHANNEL_TYPE_PER_GPU_EMU 64 38 39 // Schedule pushes one after another on all GPUs and channel types that copy and 40 // increment a counter into an adjacent memory location in a buffer. And then 41 // verify that all the values are correct on the CPU. 
static NV_STATUS test_ordering(uvm_va_space_t *va_space)
{
    NV_STATUS status;
    uvm_gpu_t *gpu;
    bool exclude_proxy_channel_type;
    NvU32 i, j;
    uvm_rm_mem_t *mem = NULL;
    NvU32 *host_mem;
    uvm_push_t push;
    NvU64 gpu_va;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    NvU32 value = 0;
    // Use a reduced iteration count when simulated/emulated devices are
    // present, where each push takes far longer to complete.
    const NvU32 iters_per_channel_type_per_gpu = g_uvm_global.num_simulated_devices > 0 ?
                                                     TEST_ORDERING_ITERS_PER_CHANNEL_TYPE_PER_GPU_EMU :
                                                     TEST_ORDERING_ITERS_PER_CHANNEL_TYPE_PER_GPU;
    const NvU32 values_count = iters_per_channel_type_per_gpu;
    const size_t buffer_size = sizeof(NvU32) * values_count;

    gpu = uvm_va_space_find_first_gpu(va_space);
    TEST_CHECK_RET(gpu != NULL);

    // TODO: Bug 3839176: the test is waived on Confidential Computing because
    // it assumes that GPU can access system memory without using encryption.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    status = uvm_rm_mem_alloc_and_map_all(gpu, UVM_RM_MEM_TYPE_SYS, buffer_size, 0, &mem);
    TEST_CHECK_GOTO(status == NV_OK, done);

    host_mem = (NvU32*)uvm_rm_mem_get_cpu_va(mem);
    memset(host_mem, 0, buffer_size);

    status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Initial memset");
    TEST_CHECK_GOTO(status == NV_OK, done);

    gpu_va = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(push.channel)).address;

    // Semaphore release as part of uvm_push_end() will do the membar
    uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    gpu->parent->ce_hal->memset_v_4(&push, gpu_va, 0, buffer_size);

    uvm_push_end(&push);

    TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);

    exclude_proxy_channel_type = uvm_gpu_uses_proxy_channel_pool(gpu);

    for (i = 0; i < iters_per_channel_type_per_gpu; ++i) {
        for (j = 0; j < UVM_CHANNEL_TYPE_CE_COUNT; ++j) {
            uvm_channel_type_t channel_type = j;

            // Proxy channels don't support the virtual memcopies that are about
            // to be pushed, so don't test the proxy channel type in any of the
            // GPUs.
            if (exclude_proxy_channel_type && (channel_type == uvm_channel_proxy_channel_type()))
                continue;

            for_each_va_space_gpu(gpu, va_space) {
                NvU64 gpu_va_base;
                NvU64 gpu_va_src;
                NvU64 gpu_va_dst;

                // Each push acquires the tracker holding the previous push so
                // the whole chain executes in order across GPUs and channels.
                status = uvm_push_begin_acquire(gpu->channel_manager,
                                                channel_type,
                                                &tracker,
                                                &push,
                                                "memcpy and inc to %u",
                                                value + 1);
                TEST_CHECK_GOTO(status == NV_OK, done);

                gpu_va_base = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(push.channel)).address;
                gpu_va_src = gpu_va_base + (value % values_count) * sizeof(NvU32);
                gpu_va_dst = gpu_va_base + ((value + 1) % values_count) * sizeof(NvU32);

                // The semaphore reduction will do a membar before the reduction
                uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
                gpu->parent->ce_hal->memcopy_v_to_v(&push, gpu_va_dst, gpu_va_src, sizeof(NvU32));

                // The following reduction is done from the same GPU, but the
                // previous memcpy is to uncached sysmem and that bypasses L2
                // and hence requires a SYSMEMBAR to be ordered.
                gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va_dst, ++value);

                uvm_push_end(&push);

                // The tracker only needs to hold the most recent push, since
                // every push acquires its predecessor.
                uvm_tracker_clear(&tracker);
                TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);
            }
        }
    }
    status = uvm_tracker_wait(&tracker);
    TEST_CHECK_GOTO(status == NV_OK, done);

    // At this moment, this should hold:
    //  mem[value % values_count] == value
    //  mem[(value + 1) % values_count] == value + 1 - values_count
    // And in general, for i=[0, values_count):
    //  mem[(value + 1 + i) % values_count] == value + 1 - values_count + i
    // Verify that

    for (i = 0; i < values_count; ++i) {
        NvU32 index = (value + 1 + i) % values_count;
        NvU32 expected = (value + 1 + i) - values_count;
        if (host_mem[index] != expected) {
            UVM_TEST_PRINT("Bad value at host_mem[%u] = %u instead of %u\n", index, host_mem[index], expected);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }

done:
    uvm_tracker_wait(&tracker);
    uvm_rm_mem_free(mem);

    return status;
}

// Verify that a tracking semaphore payload that runs ahead of the channel's
// completed value is reported as a global fatal error, and that the channel
// manager can be destroyed and re-created afterwards to recover.
static NV_STATUS test_unexpected_completed_values(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_t *channel;
        NvU64 completed_value;

        // The GPU channel manager is destroyed and then re-created after
        // the test, so this test requires exclusive access to the GPU.
        TEST_CHECK_RET(uvm_gpu_retained_count(gpu) == 1);

        // Bump the tracking semaphore payload of the first channel past its
        // current completed value to simulate a bogus GPU-side write.
        channel = &gpu->channel_manager->channel_pools[0].channels[0];
        completed_value = uvm_channel_update_completed_value(channel);
        uvm_gpu_semaphore_set_payload(&channel->tracking_sem.semaphore, (NvU32)completed_value + 1);

        // There must be no pre-existing global error, and updating channel
        // progress must then detect the unexpected payload as fatal.
        TEST_NV_CHECK_RET(uvm_global_get_status());
        uvm_channel_update_progress_all(channel);
        TEST_CHECK_RET(uvm_global_reset_fatal_error() == NV_ERR_INVALID_STATE);

        uvm_channel_manager_destroy(gpu->channel_manager);

        // Destruction will hit the error again, so clear one more time.
        uvm_global_reset_fatal_error();

        TEST_NV_CHECK_RET(uvm_channel_manager_create(gpu, &gpu->channel_manager));
    }

    return NV_OK;
}

// Trigger an RC (robust channel) error on the given GPU with a deliberately
// faulting push, then verify the error is surfaced through the channel
// status, the fatal GPFIFO entry's push info, the tracker entries, and the
// global UVM status. Leaves the channel manager in a fatal state; the caller
// is responsible for re-creating it.
static NV_STATUS uvm_test_rc_for_gpu(uvm_gpu_t *gpu)
{
    uvm_push_t push;
    uvm_channel_pool_t *pool;
    uvm_gpfifo_entry_t *fatal_entry;
    uvm_push_info_t *push_info;
    int fatal_line;
    uvm_tracker_entry_t tracker_entry;
    NV_STATUS status;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    uvm_channel_manager_t *manager = gpu->channel_manager;

    // Submit a bunch of successful pushes on each channel first so that the
    // fatal one is behind a bunch of work (notably more than
    // uvm_channel_update_progress() completes by default).
    uvm_for_each_pool(pool, manager) {
        uvm_channel_t *channel;

        // Skip LCIC channels as those can't accept any pushes
        if (uvm_channel_pool_is_lcic(pool))
            continue;
        uvm_for_each_channel_in_pool(channel, pool) {
            NvU32 i;
            for (i = 0; i < 512; ++i) {
                status = uvm_push_begin_on_channel(channel, &push, "Non-faulting push");
                TEST_CHECK_RET(status == NV_OK);

                uvm_push_end(&push);
            }
        }
    }

    // Check RC on a proxy channel (SR-IOV heavy) or internal channel (any other
    // mode). It is not allowed to use a virtual address in a memset pushed to
    // a proxy channel, so we use a physical address instead.
    if (uvm_gpu_uses_proxy_channel_pool(gpu)) {
        uvm_gpu_address_t dst_address;

        // Save the line number the push that's supposed to fail was started on
        fatal_line = __LINE__ + 1;
        TEST_NV_CHECK_RET(uvm_push_begin(manager, uvm_channel_proxy_channel_type(), &push, "Fatal push 0x%X", 0xBAD));

        // Memset targeting a physical address beyond the vidmem size. The
        // passed physical address is not the vidmem size reported by RM
        // because the reported size can be smaller than the actual physical
        // size, such that accessing a GPA at the reported size may be allowed
        // by VMMU.
        //
        // GA100 GPUs have way less than UVM_GPU_MAX_PHYS_MEM vidmem, so using
        // that value as physical address should result on an error
        dst_address = uvm_gpu_address_physical(UVM_APERTURE_VID, UVM_GPU_MAX_PHYS_MEM - 8);
        gpu->parent->ce_hal->memset_8(&push, dst_address, 0, 8);
    }
    else {
        fatal_line = __LINE__ + 1;
        TEST_NV_CHECK_RET(uvm_push_begin(manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Fatal push 0x%X", 0xBAD));

        // Memset that should fault on 0xFFFFFFFF
        gpu->parent->ce_hal->memset_v_4(&push, 0xFFFFFFFF, 0, 4);
    }

    uvm_push_end(&push);

    uvm_push_get_tracker_entry(&push, &tracker_entry);
    uvm_tracker_overwrite_with_push(&tracker, &push);

    // Waiting on the faulting push must report the RC error everywhere.
    status = uvm_channel_manager_wait(manager);
    TEST_CHECK_RET(status == NV_ERR_RC_ERROR);

    TEST_CHECK_RET(uvm_channel_get_status(push.channel) == NV_ERR_RC_ERROR);
    fatal_entry = uvm_channel_get_fatal_entry(push.channel);
    TEST_CHECK_RET(fatal_entry != NULL);

    // The fatal GPFIFO entry must point back at the push started on
    // fatal_line above.
    push_info = fatal_entry->push_info;
    TEST_CHECK_RET(push_info != NULL);
    TEST_CHECK_RET(push_info->line == fatal_line);
    TEST_CHECK_RET(strcmp(push_info->function, __FUNCTION__) == 0);
    TEST_CHECK_RET(strcmp(push_info->filename, kbasename(__FILE__)) == 0);
    if (uvm_push_info_is_tracking_descriptions())
        TEST_CHECK_RET(strcmp(push_info->description, "Fatal push 0xBAD") == 0);

    TEST_CHECK_RET(uvm_global_get_status() == NV_ERR_RC_ERROR);

    // Check that waiting for an entry after a global fatal error makes the
    // entry completed.
    TEST_CHECK_RET(!uvm_tracker_is_entry_completed(&tracker_entry));
    TEST_CHECK_RET(uvm_tracker_wait_for_entry(&tracker_entry) == NV_ERR_RC_ERROR);
    TEST_CHECK_RET(uvm_tracker_is_entry_completed(&tracker_entry));

    // Check that waiting for a tracker after a global fatal error, clears all
    // the entries from the tracker.
    TEST_CHECK_RET(!uvm_tracker_is_empty(&tracker));
    TEST_CHECK_RET(uvm_tracker_wait(&tracker) == NV_ERR_RC_ERROR);
    TEST_CHECK_RET(uvm_tracker_is_empty(&tracker));

    TEST_CHECK_RET(uvm_global_reset_fatal_error() == NV_ERR_RC_ERROR);

    return NV_OK;
}

// Run the RC error test on every GPU in the VA space, re-creating each GPU's
// channel manager afterwards to recover from the injected error.
static NV_STATUS test_rc(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    uvm_assert_mutex_locked(&g_uvm_global.global_lock);

    for_each_va_space_gpu(gpu, va_space) {
        NV_STATUS test_status, create_status;

        // The GPU channel manager is destroyed and then re-created after
        // testing RC, so this test requires exclusive access to the GPU.
        TEST_CHECK_RET(uvm_gpu_retained_count(gpu) == 1);

        // The RC test deliberately triggers fatal errors; suppress the
        // corresponding assert while it runs.
        g_uvm_global.disable_fatal_error_assert = true;
        test_status = uvm_test_rc_for_gpu(gpu);
        g_uvm_global.disable_fatal_error_assert = false;

        uvm_channel_manager_destroy(gpu->channel_manager);
        create_status = uvm_channel_manager_create(gpu, &gpu->channel_manager);

        TEST_NV_CHECK_RET(test_status);
        TEST_NV_CHECK_RET(create_status);
    }

    return NV_OK;
}

// Per-stream state for the stress test below. A stream is an ordered chain of
// pushes linked via its tracker; each iteration memsets the stream's counter
// buffer and snapshots the previous iteration's counters for CPU-side
// verification.
typedef struct
{
    uvm_push_t push;                               // Most recent push of the stream
    uvm_tracker_t tracker;                         // Tracks the most recent push
    uvm_gpu_semaphore_t semaphore;
    NvU32 queued_counter_value;                    // Last value queued with set_counter()
    NvU32 queued_counter_repeat;                   // Number of counters set by the last set_counter()
    uvm_rm_mem_t *counter_mem;
    uvm_rm_mem_t *counter_snapshots_mem;
    uvm_rm_mem_t *other_stream_counter_snapshots_mem;
    NvU32 *counter_snapshots;
    NvU32 *other_stream_counter_snapshots;
    NvU32 *other_stream_counter_expected;
} uvm_test_stream_t;

#define MAX_COUNTER_REPEAT_COUNT 10 * 1024
// For each iter, snapshot the first and last counter value
#define TEST_SNAPSHOT_SIZE(it) (2 * it * sizeof(NvU32))

// Copy the first and last of the counters_count counters in counter_mem into
// snapshot slot 'index' of snapshot_mem (two NvU32s per slot). No-op when
// counters_count is 0.
static void snapshot_counter(uvm_push_t *push,
                             uvm_rm_mem_t *counter_mem,
                             uvm_rm_mem_t *snapshot_mem,
                             NvU32 index,
                             NvU32 counters_count)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU64 counter_gpu_va;
    NvU64 snapshot_gpu_va;
    bool is_proxy_channel;
    NvU32 last_counter_offset = (counters_count - 1) * sizeof(NvU32);

    if (counters_count == 0)
        return;

    is_proxy_channel = uvm_channel_is_proxy(push->channel);
    counter_gpu_va = uvm_rm_mem_get_gpu_va(counter_mem, gpu, is_proxy_channel).address;
    snapshot_gpu_va = uvm_rm_mem_get_gpu_va(snapshot_mem, gpu, is_proxy_channel).address + index * 2 * sizeof(NvU32);

    // Copy the last and first counter to a snapshot for later verification.

    // Membar will be done by uvm_push_end()
    uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
    gpu->parent->ce_hal->memcopy_v_to_v(push,
                                        snapshot_gpu_va + sizeof(NvU32),
                                        counter_gpu_va + last_counter_offset,
                                        sizeof(NvU32));

    // Membar will be done by uvm_push_end()
    uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
    gpu->parent->ce_hal->memcopy_v_to_v(push, snapshot_gpu_va, counter_gpu_va, sizeof(NvU32));
}

// Memset 'count' NvU32 counters in counter_mem to 'value' from the pushing
// GPU.
static void set_counter(uvm_push_t *push, uvm_rm_mem_t *counter_mem, NvU32 value, NvU32 count)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU64 counter_gpu_va;
    bool is_proxy_channel;

    is_proxy_channel = uvm_channel_is_proxy(push->channel);
    counter_gpu_va = uvm_rm_mem_get_gpu_va(counter_mem, gpu, is_proxy_channel).address;

    gpu->parent->ce_hal->memset_v_4(push, counter_gpu_va, value, count * sizeof(NvU32));
}

// Pick a CE channel type uniformly at random.
static uvm_channel_type_t random_ce_channel_type(uvm_test_rng_t *rng)
{
    return (uvm_channel_type_t)uvm_test_rng_range_32(rng, 0, UVM_CHANNEL_TYPE_CE_COUNT - 1);
}

// Pick a random CE channel type, excluding 'exception'.
static uvm_channel_type_t random_ce_channel_type_except(uvm_test_rng_t *rng, uvm_channel_type_t exception)
{
    uvm_channel_type_t channel_type;

    UVM_ASSERT(exception < UVM_CHANNEL_TYPE_CE_COUNT);

    // Draw from a range one smaller than the full set, then shift results at
    // or above the excluded type up by one so the distribution stays uniform.
    channel_type = (uvm_channel_type_t)uvm_test_rng_range_32(rng, 0, UVM_CHANNEL_TYPE_CE_COUNT - 2);

    if (channel_type >= exception)
        channel_type++;

    UVM_ASSERT(channel_type < UVM_CHANNEL_TYPE_CE_COUNT);

    return channel_type;
}

// Pick a random CE channel type backed by an internal (non-proxy) channel on
// the given GPU: in SR-IOV heavy mode the proxy channel type is excluded.
static uvm_channel_type_t gpu_random_internal_ce_channel_type(uvm_gpu_t *gpu, uvm_test_rng_t *rng)
{
    if (uvm_gpu_uses_proxy_channel_pool(gpu))
        return random_ce_channel_type_except(rng, uvm_channel_proxy_channel_type());

    return random_ce_channel_type(rng);
}
411 412 static uvm_gpu_t *random_va_space_gpu(uvm_test_rng_t *rng, uvm_va_space_t *va_space) 413 { 414 uvm_gpu_t *gpu; 415 NvU32 gpu_count = uvm_processor_mask_get_gpu_count(&va_space->registered_gpus); 416 NvU32 gpu_index = uvm_test_rng_range_32(rng, 0, gpu_count - 1); 417 418 UVM_ASSERT(gpu_count > 0); 419 420 for_each_va_space_gpu(gpu, va_space) { 421 if (gpu_index-- == 0) 422 return gpu; 423 } 424 425 UVM_ASSERT(0); 426 return NULL; 427 } 428 429 430 static void test_memset_rm_mem(uvm_push_t *push, uvm_rm_mem_t *rm_mem, NvU32 value) 431 { 432 uvm_gpu_t *gpu; 433 NvU64 gpu_va; 434 435 UVM_ASSERT(rm_mem->size % 4 == 0); 436 437 gpu = uvm_push_get_gpu(push); 438 gpu_va = uvm_rm_mem_get_gpu_va(rm_mem, gpu, uvm_channel_is_proxy(push->channel)).address; 439 440 gpu->parent->ce_hal->memset_v_4(push, gpu_va, value, rm_mem->size); 441 } 442 443 // This test schedules a randomly sized memset on a random channel and GPU in a 444 // "stream" that has operations ordered by acquiring the tracker of the previous 445 // operation. It also snapshots the memset done by the previous operation in the 446 // stream to verify it later on the CPU. Each iteration also optionally acquires 447 // a different stream and snapshots its memset. 448 // The test ioctl is expected to be called at the same time from multiple 449 // threads and contains some schedule() calls to help get as many threads 450 // through the init phase before other threads continue. It also has a random 451 // schedule() call in the main loop scheduling GPU work. 
static NV_STATUS stress_test_all_gpus_in_va(uvm_va_space_t *va_space,
                                            NvU32 num_streams,
                                            NvU32 iterations_per_stream,
                                            NvU32 seed,
                                            NvU32 verbose)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu;
    NvU32 i, j;
    uvm_test_stream_t *streams;
    uvm_test_rng_t rng;

    uvm_test_rng_init(&rng, seed);

    gpu = uvm_va_space_find_first_gpu(va_space);
    TEST_CHECK_RET(gpu != NULL);

    streams = uvm_kvmalloc_zero(sizeof(*streams) * num_streams);
    TEST_CHECK_RET(streams != NULL);

    // Initialize all the trackers first so that clean up on error can always
    // wait for them.
    for (i = 0; i < num_streams; ++i) {
        uvm_test_stream_t *stream = &streams[i];
        uvm_tracker_init(&stream->tracker);
    }

    // Per-stream setup: allocate the counter and snapshot buffers and zero
    // them from the GPU.
    for (i = 0; i < num_streams; ++i) {
        uvm_test_stream_t *stream = &streams[i];

        status = uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &stream->semaphore);
        if (status != NV_OK)
            goto done;

        stream->queued_counter_value = 0;

        status = uvm_rm_mem_alloc_and_map_all(gpu,
                                              UVM_RM_MEM_TYPE_SYS,
                                              MAX_COUNTER_REPEAT_COUNT * sizeof(NvU32),
                                              0,
                                              &stream->counter_mem);
        TEST_CHECK_GOTO(status == NV_OK, done);

        status = uvm_rm_mem_alloc_and_map_all(gpu,
                                              UVM_RM_MEM_TYPE_SYS,
                                              TEST_SNAPSHOT_SIZE(iterations_per_stream),
                                              0,
                                              &stream->counter_snapshots_mem);
        TEST_CHECK_GOTO(status == NV_OK, done);

        stream->counter_snapshots = (NvU32*)uvm_rm_mem_get_cpu_va(stream->counter_snapshots_mem);

        status = uvm_rm_mem_alloc_and_map_all(gpu,
                                              UVM_RM_MEM_TYPE_SYS,
                                              TEST_SNAPSHOT_SIZE(iterations_per_stream),
                                              0,
                                              &stream->other_stream_counter_snapshots_mem);
        TEST_CHECK_GOTO(status == NV_OK, done);

        stream->other_stream_counter_snapshots = (NvU32*)uvm_rm_mem_get_cpu_va(stream->other_stream_counter_snapshots_mem);

        stream->other_stream_counter_expected = uvm_kvmalloc_zero(sizeof(NvU32) * iterations_per_stream);
        if (stream->other_stream_counter_expected == NULL) {
            status = NV_ERR_NO_MEMORY;
            goto done;
        }

        status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, &stream->push, "stream %u init", i);
        TEST_CHECK_GOTO(status == NV_OK, done);

        test_memset_rm_mem(&stream->push, stream->counter_mem, 0);
        test_memset_rm_mem(&stream->push, stream->counter_snapshots_mem, 0);
        test_memset_rm_mem(&stream->push, stream->other_stream_counter_snapshots_mem, 0);

        status = uvm_push_end_and_wait(&stream->push);
        TEST_CHECK_GOTO(status == NV_OK, done);

        if (fatal_signal_pending(current)) {
            status = NV_ERR_SIGNAL_PENDING;
            goto done;
        }

        // Let other threads run
        schedule();
    }

    if (verbose > 0) {
        UVM_TEST_PRINT("Init done, seed %u, GPUs:\n", seed);
        for_each_va_space_gpu(gpu, va_space) {
            UVM_TEST_PRINT(" GPU %s\n", uvm_gpu_name(gpu));
        }
    }

    // Main loop: each iteration extends every stream by one push on a random
    // GPU and channel type, ordered behind the stream's previous push via its
    // tracker.
    for (i = 0; i < iterations_per_stream; ++i) {
        for (j = 0; j < num_streams; ++j) {
            uvm_test_stream_t *stream = &streams[j];
            uvm_channel_type_t channel_type;
            gpu = random_va_space_gpu(&rng, va_space);

            if (fatal_signal_pending(current)) {
                status = NV_ERR_SIGNAL_PENDING;
                goto done;
            }

            // Select a random channel type. In SR-IOV heavy the selection has
            // to exclude the type associated with proxy channels, because they
            // do not support the virtual memcopies/memsets pushed by
            // snapshot_counter and set_counter
            channel_type = gpu_random_internal_ce_channel_type(gpu, &rng);

            status = uvm_push_begin_acquire(gpu->channel_manager,
                                            channel_type,
                                            &stream->tracker,
                                            &stream->push,
                                            "stream %u payload %u gid %u channel_type %u",
                                            j,
                                            stream->queued_counter_value,
                                            uvm_id_value(gpu->id),
                                            channel_type);
            TEST_CHECK_GOTO(status == NV_OK, done);

            // Snapshot the counters written by the previous iteration of this
            // stream. On the first iteration queued_counter_repeat is still 0
            // and snapshot_counter() is a no-op.
            snapshot_counter(&stream->push,
                             stream->counter_mem,
                             stream->counter_snapshots_mem,
                             i,
                             stream->queued_counter_repeat);

            // Set a random number [2, MAX_COUNTER_REPEAT_COUNT] of counters
            stream->queued_counter_repeat = uvm_test_rng_range_32(&rng, 2, MAX_COUNTER_REPEAT_COUNT);
            set_counter(&stream->push,
                        stream->counter_mem,
                        ++stream->queued_counter_value,
                        stream->queued_counter_repeat);

            // Half of the time, also acquire a random stream's tracker and
            // snapshot that stream's counters for the cross-stream check.
            if (uvm_test_rng_range_32(&rng, 0, 1) == 0) {
                NvU32 random_stream_index = uvm_test_rng_range_32(&rng, 0, num_streams - 1);
                uvm_test_stream_t *random_stream = &streams[random_stream_index];
                uvm_push_acquire_tracker(&stream->push, &random_stream->tracker);
                snapshot_counter(&stream->push,
                                 random_stream->counter_mem,
                                 stream->other_stream_counter_snapshots_mem,
                                 i,
                                 random_stream->queued_counter_repeat);
            }

            uvm_push_end(&stream->push);
            uvm_tracker_clear(&stream->tracker);
            TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&stream->tracker, &stream->push), done);
        }

        // Randomly schedule other threads
        if (uvm_test_rng_range_32(&rng, 0, 9) == 0)
            schedule();
    }

    if (verbose > 0)
        UVM_TEST_PRINT("All work scheduled\n");

    // Let other threads run
    schedule();

    // Wait for each stream and verify its snapshots on the CPU.
    for (i = 0; i < num_streams; ++i) {
        uvm_test_stream_t *stream = &streams[i];
        status = uvm_tracker_wait(&stream->tracker);
        if (status != NV_OK) {
            UVM_TEST_PRINT("Failed to wait for the tracker for stream %u: %s\n", i, nvstatusToString(status));
            goto done;
        }
        // NOTE(review): snapshot_counter() writes the FIRST counter at offset
        // 0 and the LAST counter at offset 1 of each pair, so these two local
        // names appear swapped; both are compared against the same value, so
        // the check's outcome is unaffected.
        for (j = 0; j < iterations_per_stream; ++j) {
            NvU32 snapshot_last = stream->counter_snapshots[j * 2];
            NvU32 snapshot_first = stream->counter_snapshots[j * 2 + 1];
            if (snapshot_last != j || snapshot_first != j) {
                UVM_TEST_PRINT("Stream %u counter snapshot[%u] = %u,%u instead of %u,%u\n",
                               i,
                               j,
                               snapshot_last,
                               snapshot_first,
                               j,
                               j);
                status = NV_ERR_INVALID_STATE;
                goto done;
            }
        }
        // NOTE(review): other_stream_counter_expected is zero-initialized and
        // never written in the code visible here, so this check only verifies
        // the (unsigned) snapshots are >= 0 — confirm against the full file.
        for (j = 0; j < iterations_per_stream; ++j) {
            NvU32 snapshot_last = stream->other_stream_counter_snapshots[j * 2];
            NvU32 snapshot_first = stream->other_stream_counter_snapshots[j * 2 + 1];
            NvU32 expected = stream->other_stream_counter_expected[j];
            if (snapshot_last < expected || snapshot_first < expected) {
                UVM_TEST_PRINT("Stream %u other_counter snapshot[%u] = %u,%u which is < of %u,%u\n",
                               i,
                               j,
                               snapshot_last,
                               snapshot_first,
                               expected,
                               expected);
                status = NV_ERR_INVALID_STATE;
                goto done;
            }
        }
    }

    if (verbose > 0)
        UVM_TEST_PRINT("Verification done\n");

    schedule();

done:
    // Wait for all the trackers first before freeing up memory as streams
    // references each other's buffers.
    for (i = 0; i < num_streams; ++i) {
        uvm_test_stream_t *stream = &streams[i];
        uvm_tracker_wait(&stream->tracker);
    }

    for (i = 0; i < num_streams; ++i) {
        uvm_test_stream_t *stream = &streams[i];
        uvm_gpu_semaphore_free(&stream->semaphore);
        uvm_rm_mem_free(stream->other_stream_counter_snapshots_mem);
        uvm_rm_mem_free(stream->counter_snapshots_mem);
        uvm_rm_mem_free(stream->counter_mem);
        uvm_tracker_deinit(&stream->tracker);
        uvm_kvfree(stream->other_stream_counter_expected);
    }
    uvm_kvfree(streams);

    if (verbose > 0)
        UVM_TEST_PRINT("Cleanup done\n");

    return status;
}

// The following test is inspired by uvm_push_test.c:test_concurrent_pushes.
// This test verifies that concurrent pushes using the same channel pool
// select different channels, when the Confidential Computing feature is
// enabled.
NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
{
    NV_STATUS status = NV_OK;
    uvm_channel_pool_t *pool;
    uvm_push_t *pushes;
    uvm_gpu_t *gpu;
    NvU32 i;
    NvU32 num_pushes;

    gpu = uvm_va_space_find_first_gpu(va_space);

    if (!uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    // The test begins multiple pushes without ending them, which the lock
    // tracking would otherwise flag.
    uvm_thread_context_lock_disable_tracking();

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_type_t channel_type;

        for (channel_type = 0; channel_type < UVM_CHANNEL_TYPE_COUNT; channel_type++) {
            pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
            TEST_CHECK_RET(pool != NULL);

            // Skip LCIC channels as those can't accept any pushes
            if (uvm_channel_pool_is_lcic(pool))
                continue;

            if (pool->num_channels < 2)
                continue;

            num_pushes = min(pool->num_channels, (NvU32)UVM_PUSH_MAX_CONCURRENT_PUSHES);

            pushes = uvm_kvmalloc_zero(sizeof(*pushes) * num_pushes);
            TEST_CHECK_RET(pushes != NULL);

            // Keep num_pushes pushes open at once; each new push must land on
            // a channel different from the previous one.
            for (i = 0; i < num_pushes; i++) {
                uvm_push_t *push = &pushes[i];
                status = uvm_push_begin(gpu->channel_manager, channel_type, push, "concurrent push %u", i);
                TEST_NV_CHECK_GOTO(status, error);
                if (i > 0)
                    TEST_CHECK_GOTO(pushes[i-1].channel != push->channel, error);
            }
            for (i = 0; i < num_pushes; i++) {
                uvm_push_t *push = &pushes[i];
                status = uvm_push_end_and_wait(push);
                TEST_NV_CHECK_GOTO(status, error);
            }

            uvm_kvfree(pushes);
        }
    }

    uvm_thread_context_lock_enable_tracking();

    return status;
error:
    uvm_thread_context_lock_enable_tracking();
    uvm_kvfree(pushes);

    return status;
}

// Fill each channel's GPFIFO several times over with no-op control entries to
// exercise control GPFIFO submission and wrap-around.
NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_manager_t *manager = gpu->channel_manager;
        uvm_channel_pool_t *pool;

        uvm_for_each_pool(pool, manager) {
            uvm_channel_t *channel;

            // Skip LCIC channels as those can't accept any pushes
            if (uvm_channel_pool_is_lcic(pool))
                continue;

            // Skip WLC channels as those can't accept ctrl gpfifos
            // after their schedule is set up
            if (uvm_channel_pool_is_wlc(pool))
                continue;
            uvm_for_each_channel_in_pool(channel, pool) {
                NvU32 i;

                if (uvm_channel_is_proxy(channel))
                    continue;

                // We submit 8x the channel's GPFIFO entries to force a few
                // complete loops in the GPFIFO circular buffer.
                for (i = 0; i < 8 * channel->num_gpfifo_entries; i++) {
                    NvU64 entry;
                    gpu->parent->host_hal->set_gpfifo_noop(&entry);
                    TEST_NV_CHECK_RET(uvm_channel_write_ctrl_gpfifo(channel, entry));
                }
            }
        }
    }

    return NV_OK;
}

// Interleave no-op control GPFIFO entries with regular pushes on each channel
// to exercise mixing the two submission paths.
NV_STATUS test_write_ctrl_gpfifo_and_pushes(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_manager_t *manager = gpu->channel_manager;
        uvm_channel_pool_t *pool;

        uvm_for_each_pool(pool, manager) {
            uvm_channel_t *channel;

            // Skip LCIC channels as those can't accept any pushes
            if (uvm_channel_pool_is_lcic(pool))
                continue;

            // Skip WLC channels as those can't accept ctrl gpfifos
            // after their schedule is set up
            if (uvm_channel_pool_is_wlc(pool))
                continue;
            uvm_for_each_channel_in_pool(channel, pool) {
                NvU32 i;
                uvm_push_t push;

                if (uvm_channel_is_proxy(channel))
                    continue;

                // We submit 8x the channel's GPFIFO entries to force a few
                // complete loops in the GPFIFO circular buffer.
                for (i = 0; i < 8 * channel->num_gpfifo_entries; i++) {
                    if (i % 2 == 0) {
                        NvU64 entry;
                        gpu->parent->host_hal->set_gpfifo_noop(&entry);
                        TEST_NV_CHECK_RET(uvm_channel_write_ctrl_gpfifo(channel, entry));
                    }
                    else {
                        TEST_NV_CHECK_RET(uvm_push_begin_on_channel(channel, &push, "gpfifo ctrl and push test"));
                        uvm_push_end(&push);
                    }
                }

                // Waiting on the last push waits for everything submitted
                // before it on the channel.
                TEST_NV_CHECK_RET(uvm_push_wait(&push));
            }
        }
    }

    return NV_OK;
}

// Fill a channel's GPFIFO almost to capacity, then verify a control GPFIFO
// entry can still be written when only the minimum free entries remain.
NV_STATUS test_write_ctrl_gpfifo_tight(uvm_va_space_t *va_space)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu;
    uvm_channel_t *channel;
    uvm_rm_mem_t *mem;
    NvU32 *cpu_ptr;
    NvU64 gpu_va;
    NvU32 i;
    NvU64 entry;
    uvm_push_t push;

    gpu = uvm_va_space_find_first_gpu(va_space);

    // TODO: Bug 3839176: the test is waived on Confidential Computing because
    // it assumes that GPU can access system memory without using encryption.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_manager_t *manager = gpu->channel_manager;

        TEST_NV_CHECK_RET(uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, sizeof(*cpu_ptr), 0, &mem));
        cpu_ptr = uvm_rm_mem_get_cpu_va(mem);
        gpu_va = uvm_rm_mem_get_gpu_uvm_va(mem, gpu);

        *cpu_ptr = 0;

        // This semaphore acquire takes 1 GPFIFO entries.
        TEST_NV_CHECK_GOTO(uvm_push_begin(manager, UVM_CHANNEL_TYPE_GPU_TO_GPU, &push, "gpfifo ctrl tight test acq"),
                           error);

        channel = push.channel;
        UVM_ASSERT(!uvm_channel_is_proxy(channel));

        // The channel stalls on this acquire until the CPU writes 1 below.
        gpu->parent->host_hal->semaphore_acquire(&push, gpu_va, 1);
        uvm_push_end(&push);

        // Flush all completed entries from the GPFIFO ring buffer. This test
        // requires this flush because we verify (below with
        // uvm_channel_get_available_gpfifo_entries) the number of free entries
        // in the channel.
        uvm_channel_update_progress_all(channel);

        // Populate the remaining GPFIFO entries, leaving 2 slots available.
        // 2 available entries + 1 semaphore acquire (above) + 1 spare entry to
        // indicate a terminal condition for the GPFIFO ringbuffer, therefore we
        // push num_gpfifo_entries-4.
        for (i = 0; i < channel->num_gpfifo_entries - 4; i++) {
            TEST_NV_CHECK_GOTO(uvm_push_begin_on_channel(channel, &push, "gpfifo ctrl tight test populate"), error);
            uvm_push_end(&push);
        }

        TEST_CHECK_GOTO(uvm_channel_get_available_gpfifo_entries(channel) == 2, error);

        // We should have room for the control GPFIFO and the subsequent
        // semaphore release.
        gpu->parent->host_hal->set_gpfifo_noop(&entry);
        TEST_NV_CHECK_GOTO(uvm_channel_write_ctrl_gpfifo(channel, entry), error);

        // Release the semaphore.
        UVM_WRITE_ONCE(*cpu_ptr, 1);

        TEST_NV_CHECK_GOTO(uvm_push_wait(&push), error);

        uvm_rm_mem_free(mem);
    }

    return NV_OK;

error:
    uvm_rm_mem_free(mem);

    return status;
}

// This test is inspired by the test_rc (above).
// The test recreates the GPU's channel manager forcing its pushbuffer to be
// mapped on a non-zero 1TB segment. This exercises work submission from
// pushbuffers whose VAs are greater than 1TB.
static NV_STATUS test_channel_pushbuffer_extension_base(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;
    NV_STATUS status = NV_OK;

    uvm_assert_mutex_locked(&g_uvm_global.global_lock);

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_manager_t *manager;
        uvm_channel_pool_t *pool;

        if (!uvm_gpu_has_pushbuffer_segments(gpu))
            continue;

        // The GPU channel manager pushbuffer is destroyed and then re-created
        // after testing a non-zero pushbuffer extension base, so this test
        // requires exclusive access to the GPU.
        TEST_CHECK_RET(uvm_gpu_retained_count(gpu) == 1);

        // Force the re-created channel manager to place its pushbuffer in an
        // upper segment.
        gpu->uvm_test_force_upper_pushbuffer_segment = 1;
        uvm_channel_manager_destroy(gpu->channel_manager);
        TEST_NV_CHECK_GOTO(uvm_channel_manager_create(gpu, &gpu->channel_manager), error);
        gpu->uvm_test_force_upper_pushbuffer_segment = 0;

        manager = gpu->channel_manager;
        TEST_CHECK_GOTO(uvm_pushbuffer_get_gpu_va_base(manager->pushbuffer) >= (1ull << 40), error);

        // Submit a few pushes with the recently allocated
        // channel_manager->pushbuffer.
        uvm_for_each_pool(pool, manager) {
            uvm_channel_t *channel;

            // Skip LCIC channels as those can't accept any pushes
            if (uvm_channel_pool_is_lcic(pool))
                continue;
            uvm_for_each_channel_in_pool(channel, pool) {
                NvU32 i;
                uvm_push_t push;

                for (i = 0; i < channel->num_gpfifo_entries; i++) {
                    TEST_NV_CHECK_GOTO(uvm_push_begin_on_channel(channel, &push, "pushbuffer extension push test"),
                                       error);
                    uvm_push_end(&push);
                }

                TEST_NV_CHECK_GOTO(uvm_push_wait(&push), error);
            }
        }
    }

    return NV_OK;

error:
    gpu->uvm_test_force_upper_pushbuffer_segment = 0;

    return status;
}

// Ioctl entry point: run the channel sanity tests under the global lock and
// the VA space RM read lock.
NV_STATUS uvm_test_channel_sanity(UVM_TEST_CHANNEL_SANITY_PARAMS *params, struct file *filp)
{
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    uvm_mutex_lock(&g_uvm_global.global_lock);
    uvm_va_space_down_read_rm(va_space);

    status = test_ordering(va_space);
    if (status != NV_OK)
        goto done;

    status = test_write_ctrl_gpfifo_noop(va_space);
    if (status != NV_OK)
        goto done;

    status = test_write_ctrl_gpfifo_and_pushes(va_space);
    if (status != NV_OK)
        goto done;

    status = test_write_ctrl_gpfifo_tight(va_space);
    if (status != NV_OK)
        goto done;

    status = test_conf_computing_channel_selection(va_space);
    if (status != NV_OK)
        goto done;

    // The following tests
have side effects, they reset the GPU's 1003 // channel_manager. 1004 status = test_channel_pushbuffer_extension_base(va_space); 1005 if (status != NV_OK) 1006 goto done; 1007 1008 g_uvm_global.disable_fatal_error_assert = true; 1009 uvm_release_asserts_set_global_error_for_tests = true; 1010 status = test_unexpected_completed_values(va_space); 1011 uvm_release_asserts_set_global_error_for_tests = false; 1012 g_uvm_global.disable_fatal_error_assert = false; 1013 if (status != NV_OK) 1014 goto done; 1015 1016 if (g_uvm_global.num_simulated_devices == 0) { 1017 status = test_rc(va_space); 1018 if (status != NV_OK) 1019 goto done; 1020 } 1021 1022 done: 1023 uvm_va_space_up_read_rm(va_space); 1024 uvm_mutex_unlock(&g_uvm_global.global_lock); 1025 1026 return status; 1027 } 1028 1029 static NV_STATUS uvm_test_channel_stress_stream(uvm_va_space_t *va_space, 1030 const UVM_TEST_CHANNEL_STRESS_PARAMS *params) 1031 { 1032 NV_STATUS status = NV_OK; 1033 1034 if (params->iterations == 0 || params->num_streams == 0) 1035 return NV_ERR_INVALID_PARAMETER; 1036 1037 // TODO: Bug 1764963: Rework the test to not rely on the global lock as that 1038 // serializes all the threads calling this at the same time. 1039 uvm_mutex_lock(&g_uvm_global.global_lock); 1040 uvm_va_space_down_read_rm(va_space); 1041 1042 // TODO: Bug 3839176: the test is waived on Confidential Computing because 1043 // it assumes that GPU can access system memory without using encryption. 
1044 if (uvm_conf_computing_mode_enabled(uvm_va_space_find_first_gpu(va_space))) 1045 goto done; 1046 1047 status = stress_test_all_gpus_in_va(va_space, 1048 params->num_streams, 1049 params->iterations, 1050 params->seed, 1051 params->verbose); 1052 1053 done: 1054 uvm_va_space_up_read_rm(va_space); 1055 uvm_mutex_unlock(&g_uvm_global.global_lock); 1056 1057 return status; 1058 } 1059 1060 static NV_STATUS uvm_test_channel_stress_update_channels(uvm_va_space_t *va_space, 1061 const UVM_TEST_CHANNEL_STRESS_PARAMS *params) 1062 { 1063 NV_STATUS status = NV_OK; 1064 uvm_test_rng_t rng; 1065 NvU32 i; 1066 1067 uvm_test_rng_init(&rng, params->seed); 1068 1069 uvm_va_space_down_read(va_space); 1070 1071 for (i = 0; i < params->iterations; ++i) { 1072 uvm_gpu_t *gpu = random_va_space_gpu(&rng, va_space); 1073 uvm_channel_manager_update_progress(gpu->channel_manager); 1074 1075 if (fatal_signal_pending(current)) { 1076 status = NV_ERR_SIGNAL_PENDING; 1077 goto done; 1078 } 1079 } 1080 1081 done: 1082 uvm_va_space_up_read(va_space); 1083 1084 return status; 1085 } 1086 1087 static NV_STATUS uvm_test_channel_noop_push(uvm_va_space_t *va_space, 1088 const UVM_TEST_CHANNEL_STRESS_PARAMS *params) 1089 { 1090 NV_STATUS status = NV_OK; 1091 uvm_push_t push; 1092 uvm_test_rng_t rng; 1093 uvm_gpu_t *gpu; 1094 NvU32 i; 1095 1096 uvm_test_rng_init(&rng, params->seed); 1097 1098 uvm_va_space_down_read(va_space); 1099 1100 for (i = 0; i < params->iterations; ++i) { 1101 uvm_channel_type_t channel_type = random_ce_channel_type(&rng); 1102 gpu = random_va_space_gpu(&rng, va_space); 1103 1104 status = uvm_push_begin(gpu->channel_manager, channel_type, &push, "noop push"); 1105 if (status != NV_OK) 1106 goto done; 1107 1108 // Push an actual noop method so that the push doesn't get optimized 1109 // away if we ever detect empty pushes. 
1110 gpu->parent->host_hal->noop(&push, UVM_METHOD_SIZE); 1111 1112 uvm_push_end(&push); 1113 1114 if (fatal_signal_pending(current)) { 1115 status = NV_ERR_SIGNAL_PENDING; 1116 goto done; 1117 } 1118 } 1119 if (params->verbose > 0) 1120 UVM_TEST_PRINT("Noop pushes: completed %u pushes seed: %u\n", i, params->seed); 1121 1122 for_each_va_space_gpu_in_mask(gpu, va_space, &va_space->registered_gpu_va_spaces) { 1123 NV_STATUS wait_status = uvm_channel_manager_wait(gpu->channel_manager); 1124 if (status == NV_OK) 1125 status = wait_status; 1126 } 1127 1128 done: 1129 uvm_va_space_up_read(va_space); 1130 1131 return status; 1132 } 1133 1134 NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct file *filp) 1135 { 1136 uvm_va_space_t *va_space = uvm_va_space_get(filp); 1137 1138 switch (params->mode) { 1139 case UVM_TEST_CHANNEL_STRESS_MODE_STREAM: 1140 return uvm_test_channel_stress_stream(va_space, params); 1141 case UVM_TEST_CHANNEL_STRESS_MODE_UPDATE_CHANNELS: 1142 return uvm_test_channel_stress_update_channels(va_space, params); 1143 case UVM_TEST_CHANNEL_STRESS_MODE_NOOP_PUSH: 1144 return uvm_test_channel_noop_push(va_space, params); 1145 default: 1146 return NV_ERR_INVALID_PARAMETER; 1147 } 1148 } 1149