/*******************************************************************************
    Copyright (c) 2015-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_global.h"
#include "uvm_channel.h"
#include "uvm_hal.h"
#include "uvm_push.h"
#include "uvm_test.h"
#include "uvm_test_rng.h"
#include "uvm_va_space.h"
#include "uvm_tracker.h"
#include "uvm_thread_context.h"
#include "uvm_gpu_semaphore.h"
#include "uvm_kvmalloc.h"

#define TEST_ORDERING_ITERS_PER_CHANNEL_TYPE_PER_GPU     1024
#define TEST_ORDERING_ITERS_PER_CHANNEL_TYPE_PER_GPU_EMU 64

// Schedule pushes one after another on all GPUs and channel types. Each push
// copies the current counter value to the adjacent memory location in a buffer
// and increments it there. Then verify on the CPU that all the final values
// are correct.
static NV_STATUS test_ordering(uvm_va_space_t *va_space)
{
    NV_STATUS status;
    uvm_gpu_t *gpu;
    bool exclude_proxy_channel_type;
    NvU32 i, j;
    uvm_rm_mem_t *mem = NULL;
    NvU32 *host_mem;
    uvm_push_t push;
    NvU64 gpu_va;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    NvU32 value = 0;
    const NvU32 iters_per_channel_type_per_gpu = g_uvm_global.num_simulated_devices > 0 ?
                                                     TEST_ORDERING_ITERS_PER_CHANNEL_TYPE_PER_GPU_EMU :
                                                     TEST_ORDERING_ITERS_PER_CHANNEL_TYPE_PER_GPU;
    const NvU32 values_count = iters_per_channel_type_per_gpu;
    const size_t buffer_size = sizeof(NvU32) * values_count;

    gpu = uvm_va_space_find_first_gpu(va_space);
    TEST_CHECK_RET(gpu != NULL);

    // TODO: Bug 3839176: the test is waived on Confidential Computing because
    // it assumes that the GPU can access system memory without using
    // encryption.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    status = uvm_rm_mem_alloc_and_map_all(gpu, UVM_RM_MEM_TYPE_SYS, buffer_size, 0, &mem);
    TEST_CHECK_GOTO(status == NV_OK, done);

    host_mem = (NvU32*)uvm_rm_mem_get_cpu_va(mem);
    memset(host_mem, 0, buffer_size);

    status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Initial memset");
    TEST_CHECK_GOTO(status == NV_OK, done);

    gpu_va = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(push.channel)).address;

    // Semaphore release as part of uvm_push_end() will do the membar
    uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    gpu->parent->ce_hal->memset_v_4(&push, gpu_va, 0, buffer_size);

    uvm_push_end(&push);

    TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);

    exclude_proxy_channel_type = uvm_gpu_uses_proxy_channel_pool(gpu);

    for (i = 0; i < iters_per_channel_type_per_gpu; ++i) {
        for (j = 0; j < UVM_CHANNEL_TYPE_CE_COUNT; ++j) {
            uvm_channel_type_t channel_type = j;

            // Proxy channels don't support the virtual memcopies that are
            // about to be pushed, so don't test the proxy channel type on any
            // of the GPUs.
            if (exclude_proxy_channel_type && (channel_type == uvm_channel_proxy_channel_type()))
                continue;

            for_each_va_space_gpu(gpu, va_space) {
                NvU64 gpu_va_base;
                NvU64 gpu_va_src;
                NvU64 gpu_va_dst;

                status = uvm_push_begin_acquire(gpu->channel_manager,
                                                channel_type,
                                                &tracker,
                                                &push,
                                                "memcpy and inc to %u",
                                                value + 1);
                TEST_CHECK_GOTO(status == NV_OK, done);

                gpu_va_base = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(push.channel)).address;
                gpu_va_src = gpu_va_base + (value % values_count) * sizeof(NvU32);
                gpu_va_dst = gpu_va_base + ((value + 1) % values_count) * sizeof(NvU32);

                // The semaphore reduction will do a membar before the
                // reduction.
                uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
                gpu->parent->ce_hal->memcopy_v_to_v(&push, gpu_va_dst, gpu_va_src, sizeof(NvU32));

                // The following reduction is done from the same GPU, but the
                // previous memcpy is to uncached sysmem, which bypasses L2 and
                // hence requires a SYSMEMBAR to be ordered.
                gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va_dst, ++value);

                uvm_push_end(&push);

                uvm_tracker_clear(&tracker);
                TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);
            }
        }
    }
    status = uvm_tracker_wait(&tracker);
    TEST_CHECK_GOTO(status == NV_OK, done);

    // At this point the following should hold:
    //  mem[value % values_count]       == value
    //  mem[(value + 1) % values_count] == value + 1 - values_count
    // And in general, for i in [0, values_count):
    //  mem[(value + 1 + i) % values_count] == value + 1 - values_count + i
    // Verify that.
    for (i = 0; i < values_count; ++i) {
        NvU32 index = (value + 1 + i) % values_count;
        NvU32 expected = (value + 1 + i) - values_count;
        if (host_mem[index] != expected) {
            UVM_TEST_PRINT("Bad value at host_mem[%u] = %u instead of %u\n", index, host_mem[index], expected);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }

done:
    uvm_tracker_wait(&tracker);
    uvm_rm_mem_free(mem);

    return status;
}

static NV_STATUS test_unexpected_completed_values(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_t *channel;
        NvU64 completed_value;

        // The GPU channel manager is destroyed and then re-created after the
        // test, so this test requires exclusive access to the GPU.
        TEST_CHECK_RET(uvm_gpu_retained_count(gpu) == 1);

        channel = &gpu->channel_manager->channel_pools[0].channels[0];
        completed_value = uvm_channel_update_completed_value(channel);

        // Bump the tracking semaphore payload past the last completed value.
        // This is an unexpected state that channel progress tracking should
        // report as a global fatal error.
        uvm_gpu_semaphore_set_payload(&channel->tracking_sem.semaphore, (NvU32)completed_value + 1);

        TEST_NV_CHECK_RET(uvm_global_get_status());
        uvm_channel_update_progress_all(channel);
        TEST_CHECK_RET(uvm_global_reset_fatal_error() == NV_ERR_INVALID_STATE);

        uvm_channel_manager_destroy(gpu->channel_manager);

        // Destruction will hit the error again, so clear one more time.
        uvm_global_reset_fatal_error();

        TEST_NV_CHECK_RET(uvm_channel_manager_create(gpu, &gpu->channel_manager));
    }

    return NV_OK;
}

static NV_STATUS uvm_test_rc_for_gpu(uvm_gpu_t *gpu)
{
    uvm_push_t push;
    uvm_channel_pool_t *pool;
    uvm_gpfifo_entry_t *fatal_entry;
    uvm_push_info_t *push_info;
    int fatal_line;
    uvm_tracker_entry_t tracker_entry;
    NV_STATUS status;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    uvm_channel_manager_t *manager = gpu->channel_manager;

    // Submit a bunch of successful pushes on each channel first so that the
    // fatal one is behind a bunch of work (notably more than
    // uvm_channel_update_progress() completes by default).
    uvm_for_each_pool(pool, manager) {
        uvm_channel_t *channel;

        // Skip LCIC channels as those can't accept any pushes
        if (uvm_channel_pool_is_lcic(pool))
            continue;

        uvm_for_each_channel_in_pool(channel, pool) {
            NvU32 i;
            for (i = 0; i < 512; ++i) {
                status = uvm_push_begin_on_channel(channel, &push, "Non-faulting push");
                TEST_CHECK_RET(status == NV_OK);

                uvm_push_end(&push);
            }
        }
    }

    // Check RC on a proxy channel (SR-IOV heavy) or an internal channel (any
    // other mode). It is not allowed to use a virtual address in a memset
    // pushed to a proxy channel, so we use a physical address instead.
    if (uvm_gpu_uses_proxy_channel_pool(gpu)) {
        uvm_gpu_address_t dst_address;

        // Save the line number the push that's supposed to fail was started on
        fatal_line = __LINE__ + 1;
        TEST_NV_CHECK_RET(uvm_push_begin(manager, uvm_channel_proxy_channel_type(), &push, "Fatal push 0x%X", 0xBAD));

        // Memset targeting a physical address beyond the vidmem size. The
        // passed physical address is not the vidmem size reported by RM,
        // because the reported size can be smaller than the actual physical
        // size, such that accessing a GPA at the reported size may be allowed
        // by VMMU.
        //
        // GA100 GPUs have far less than UVM_GPU_MAX_PHYS_MEM vidmem, so using
        // that value as the physical address should result in an error.
        dst_address = uvm_gpu_address_physical(UVM_APERTURE_VID, UVM_GPU_MAX_PHYS_MEM - 8);
        gpu->parent->ce_hal->memset_8(&push, dst_address, 0, 8);
    }
    else {
        fatal_line = __LINE__ + 1;
        TEST_NV_CHECK_RET(uvm_push_begin(manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Fatal push 0x%X", 0xBAD));

        // Memset that should fault on 0xFFFFFFFF
        gpu->parent->ce_hal->memset_v_4(&push, 0xFFFFFFFF, 0, 4);
    }

    uvm_push_end(&push);

    uvm_push_get_tracker_entry(&push, &tracker_entry);
    uvm_tracker_overwrite_with_push(&tracker, &push);

    status = uvm_channel_manager_wait(manager);
    TEST_CHECK_RET(status == NV_ERR_RC_ERROR);

    TEST_CHECK_RET(uvm_channel_get_status(push.channel) == NV_ERR_RC_ERROR);
    fatal_entry = uvm_channel_get_fatal_entry(push.channel);
    TEST_CHECK_RET(fatal_entry != NULL);

    push_info = fatal_entry->push_info;
    TEST_CHECK_RET(push_info != NULL);
    TEST_CHECK_RET(push_info->line == fatal_line);
    TEST_CHECK_RET(strcmp(push_info->function, __FUNCTION__) == 0);
    TEST_CHECK_RET(strcmp(push_info->filename, kbasename(__FILE__)) == 0);
    if (uvm_push_info_is_tracking_descriptions())
        TEST_CHECK_RET(strcmp(push_info->description, "Fatal push 0xBAD") == 0);

    TEST_CHECK_RET(uvm_global_get_status() == NV_ERR_RC_ERROR);

    // Check that waiting for an entry after a global fatal error makes the
    // entry completed.
    TEST_CHECK_RET(!uvm_tracker_is_entry_completed(&tracker_entry));
    TEST_CHECK_RET(uvm_tracker_wait_for_entry(&tracker_entry) == NV_ERR_RC_ERROR);
    TEST_CHECK_RET(uvm_tracker_is_entry_completed(&tracker_entry));

    // Check that waiting for a tracker after a global fatal error clears all
    // the entries from the tracker.
    TEST_CHECK_RET(!uvm_tracker_is_empty(&tracker));
    TEST_CHECK_RET(uvm_tracker_wait(&tracker) == NV_ERR_RC_ERROR);
    TEST_CHECK_RET(uvm_tracker_is_empty(&tracker));

    TEST_CHECK_RET(uvm_global_reset_fatal_error() == NV_ERR_RC_ERROR);

    return NV_OK;
}

static NV_STATUS test_rc(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    uvm_assert_mutex_locked(&g_uvm_global.global_lock);

    for_each_va_space_gpu(gpu, va_space) {
        NV_STATUS test_status, create_status;

        // The GPU channel manager is destroyed and then re-created after
        // testing RC, so this test requires exclusive access to the GPU.
        TEST_CHECK_RET(uvm_gpu_retained_count(gpu) == 1);

        g_uvm_global.disable_fatal_error_assert = true;
        test_status = uvm_test_rc_for_gpu(gpu);
        g_uvm_global.disable_fatal_error_assert = false;

        uvm_channel_manager_destroy(gpu->channel_manager);
        create_status = uvm_channel_manager_create(gpu, &gpu->channel_manager);

        TEST_NV_CHECK_RET(test_status);
        TEST_NV_CHECK_RET(create_status);
    }

    return NV_OK;
}

typedef struct
{
    uvm_push_t push;
    uvm_tracker_t tracker;
    uvm_gpu_semaphore_t semaphore;
    NvU32 queued_counter_value;
    NvU32 queued_counter_repeat;
    uvm_rm_mem_t *counter_mem;
    uvm_rm_mem_t *counter_snapshots_mem;
    uvm_rm_mem_t *other_stream_counter_snapshots_mem;
    NvU32 *counter_snapshots;
    NvU32 *other_stream_counter_snapshots;
    NvU32 *other_stream_counter_expected;
} uvm_test_stream_t;

#define MAX_COUNTER_REPEAT_COUNT (10 * 1024)

// For each iteration, snapshot the first and last counter value
#define TEST_SNAPSHOT_SIZE(it) (2 * (it) * sizeof(NvU32))

static void snapshot_counter(uvm_push_t *push,
                             uvm_rm_mem_t *counter_mem,
                             uvm_rm_mem_t *snapshot_mem,
                             NvU32 index,
                             NvU32 counters_count)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU64 counter_gpu_va;
    NvU64 snapshot_gpu_va;
    bool is_proxy_channel;
    NvU32 last_counter_offset = (counters_count - 1) * sizeof(NvU32);

    if (counters_count == 0)
        return;

    is_proxy_channel = uvm_channel_is_proxy(push->channel);
    counter_gpu_va = uvm_rm_mem_get_gpu_va(counter_mem, gpu, is_proxy_channel).address;
    snapshot_gpu_va = uvm_rm_mem_get_gpu_va(snapshot_mem, gpu, is_proxy_channel).address + index * 2 * sizeof(NvU32);

    // Copy the last and first counter to a snapshot for later verification.

    // Membar will be done by uvm_push_end()
    uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
    gpu->parent->ce_hal->memcopy_v_to_v(push,
                                        snapshot_gpu_va + sizeof(NvU32),
                                        counter_gpu_va + last_counter_offset,
                                        sizeof(NvU32));

    // Membar will be done by uvm_push_end()
    uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
    gpu->parent->ce_hal->memcopy_v_to_v(push, snapshot_gpu_va, counter_gpu_va, sizeof(NvU32));
}

static void set_counter(uvm_push_t *push, uvm_rm_mem_t *counter_mem, NvU32 value, NvU32 count)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU64 counter_gpu_va;
    bool is_proxy_channel;

    is_proxy_channel = uvm_channel_is_proxy(push->channel);
    counter_gpu_va = uvm_rm_mem_get_gpu_va(counter_mem, gpu, is_proxy_channel).address;

    gpu->parent->ce_hal->memset_v_4(push, counter_gpu_va, value, count * sizeof(NvU32));
}

// Pick a random CE channel type
static uvm_channel_type_t random_ce_channel_type(uvm_test_rng_t *rng)
{
    return (uvm_channel_type_t)uvm_test_rng_range_32(rng, 0, UVM_CHANNEL_TYPE_CE_COUNT - 1);
}

// Pick a random CE channel type other than the given exception: draw from a
// range one element smaller and shift the result up by one when it lands on
// or above the excluded type.
static uvm_channel_type_t random_ce_channel_type_except(uvm_test_rng_t *rng, uvm_channel_type_t exception)
{
    uvm_channel_type_t channel_type;

    UVM_ASSERT(exception < UVM_CHANNEL_TYPE_CE_COUNT);

    channel_type = (uvm_channel_type_t)uvm_test_rng_range_32(rng, 0, UVM_CHANNEL_TYPE_CE_COUNT - 2);

    if (channel_type >= exception)
        channel_type++;

    UVM_ASSERT(channel_type < UVM_CHANNEL_TYPE_CE_COUNT);

    return channel_type;
}

static uvm_channel_type_t gpu_random_internal_ce_channel_type(uvm_gpu_t *gpu, uvm_test_rng_t *rng)
{
    if (uvm_gpu_uses_proxy_channel_pool(gpu))
        return random_ce_channel_type_except(rng, uvm_channel_proxy_channel_type());

    return random_ce_channel_type(rng);
}

// Pick a random GPU registered in the VA space
static uvm_gpu_t *random_va_space_gpu(uvm_test_rng_t *rng, uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;
    NvU32 gpu_count = uvm_processor_mask_get_gpu_count(&va_space->registered_gpus);
    NvU32 gpu_index = uvm_test_rng_range_32(rng, 0, gpu_count - 1);

    UVM_ASSERT(gpu_count > 0);

    for_each_va_space_gpu(gpu, va_space) {
        if (gpu_index-- == 0)
            return gpu;
    }

    UVM_ASSERT(0);
    return NULL;
}

// Memset the whole RM memory allocation to the given 32-bit value on the
// push's GPU
static void test_memset_rm_mem(uvm_push_t *push, uvm_rm_mem_t *rm_mem, NvU32 value)
{
    uvm_gpu_t *gpu;
    NvU64 gpu_va;

    UVM_ASSERT(rm_mem->size % 4 == 0);

    gpu = uvm_push_get_gpu(push);
    gpu_va = uvm_rm_mem_get_gpu_va(rm_mem, gpu, uvm_channel_is_proxy(push->channel)).address;

    gpu->parent->ce_hal->memset_v_4(push, gpu_va, value, rm_mem->size);
}

// This test schedules a randomly sized memset on a random channel and GPU in a
// "stream" that has operations ordered by acquiring the tracker of the
// previous operation. It also snapshots the memset done by the previous
// operation in the stream to verify it later on the CPU. Each iteration also
// optionally acquires a different stream and snapshots its memset.
//
// The test ioctl is expected to be called at the same time from multiple
// threads and contains some schedule() calls to help get as many threads as
// possible through the init phase before other threads continue. It also has a
// random schedule() call in the main loop scheduling GPU work.
static NV_STATUS stress_test_all_gpus_in_va(uvm_va_space_t *va_space,
                                            NvU32 num_streams,
                                            NvU32 iterations_per_stream,
                                            NvU32 seed,
                                            NvU32 verbose)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu;
    NvU32 i, j;
    uvm_test_stream_t *streams;
    uvm_test_rng_t rng;

    uvm_test_rng_init(&rng, seed);

    gpu = uvm_va_space_find_first_gpu(va_space);
    TEST_CHECK_RET(gpu != NULL);

    streams = uvm_kvmalloc_zero(sizeof(*streams) * num_streams);
    TEST_CHECK_RET(streams != NULL);

    // Initialize all the trackers first so that cleanup on error can always
    // wait for them.
    for (i = 0; i < num_streams; ++i) {
        uvm_test_stream_t *stream = &streams[i];
        uvm_tracker_init(&stream->tracker);
    }

    for (i = 0; i < num_streams; ++i) {
        uvm_test_stream_t *stream = &streams[i];

        status = uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &stream->semaphore);
        if (status != NV_OK)
            goto done;

        stream->queued_counter_value = 0;

        status = uvm_rm_mem_alloc_and_map_all(gpu,
                                              UVM_RM_MEM_TYPE_SYS,
                                              MAX_COUNTER_REPEAT_COUNT * sizeof(NvU32),
                                              0,
                                              &stream->counter_mem);
        TEST_CHECK_GOTO(status == NV_OK, done);

        status = uvm_rm_mem_alloc_and_map_all(gpu,
                                              UVM_RM_MEM_TYPE_SYS,
                                              TEST_SNAPSHOT_SIZE(iterations_per_stream),
                                              0,
                                              &stream->counter_snapshots_mem);
        TEST_CHECK_GOTO(status == NV_OK, done);

        stream->counter_snapshots = (NvU32*)uvm_rm_mem_get_cpu_va(stream->counter_snapshots_mem);

        status = uvm_rm_mem_alloc_and_map_all(gpu,
                                              UVM_RM_MEM_TYPE_SYS,
                                              TEST_SNAPSHOT_SIZE(iterations_per_stream),
                                              0,
                                              &stream->other_stream_counter_snapshots_mem);
        TEST_CHECK_GOTO(status == NV_OK, done);

        stream->other_stream_counter_snapshots = (NvU32*)uvm_rm_mem_get_cpu_va(stream->other_stream_counter_snapshots_mem);

        stream->other_stream_counter_expected = uvm_kvmalloc_zero(sizeof(NvU32) * iterations_per_stream);
        if (stream->other_stream_counter_expected == NULL) {
            status = NV_ERR_NO_MEMORY;
            goto done;
        }

        status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, &stream->push, "stream %u init", i);
        TEST_CHECK_GOTO(status == NV_OK, done);

        test_memset_rm_mem(&stream->push, stream->counter_mem, 0);
        test_memset_rm_mem(&stream->push, stream->counter_snapshots_mem, 0);
        test_memset_rm_mem(&stream->push, stream->other_stream_counter_snapshots_mem, 0);

        status = uvm_push_end_and_wait(&stream->push);
        TEST_CHECK_GOTO(status == NV_OK, done);

        if (fatal_signal_pending(current)) {
            status = NV_ERR_SIGNAL_PENDING;
            goto done;
        }

        // Let other threads run
        schedule();
    }

    if (verbose > 0) {
        UVM_TEST_PRINT("Init done, seed %u, GPUs:\n", seed);
        for_each_va_space_gpu(gpu, va_space) {
            UVM_TEST_PRINT(" GPU %s\n", uvm_gpu_name(gpu));
        }
    }

    for (i = 0; i < iterations_per_stream; ++i) {
        for (j = 0; j < num_streams; ++j) {
            uvm_test_stream_t *stream = &streams[j];
            uvm_channel_type_t channel_type;
            gpu = random_va_space_gpu(&rng, va_space);

            if (fatal_signal_pending(current)) {
                status = NV_ERR_SIGNAL_PENDING;
                goto done;
            }

            // Select a random channel type. In SR-IOV heavy the selection has
            // to exclude the type associated with proxy channels, because they
            // do not support the virtual memcopies/memsets pushed by
            // snapshot_counter and set_counter.
            channel_type = gpu_random_internal_ce_channel_type(gpu, &rng);

            status = uvm_push_begin_acquire(gpu->channel_manager,
                                            channel_type,
                                            &stream->tracker,
                                            &stream->push,
                                            "stream %u payload %u gid %u channel_type %u",
                                            j,
                                            stream->queued_counter_value,
                                            uvm_id_value(gpu->id),
                                            channel_type);
            TEST_CHECK_GOTO(status == NV_OK, done);

            snapshot_counter(&stream->push,
                             stream->counter_mem,
                             stream->counter_snapshots_mem,
                             i,
                             stream->queued_counter_repeat);

            // Set a random number [2, MAX_COUNTER_REPEAT_COUNT] of counters
            stream->queued_counter_repeat = uvm_test_rng_range_32(&rng, 2, MAX_COUNTER_REPEAT_COUNT);
            set_counter(&stream->push,
                        stream->counter_mem,
                        ++stream->queued_counter_value,
                        stream->queued_counter_repeat);

            if (uvm_test_rng_range_32(&rng, 0, 1) == 0) {
                NvU32 random_stream_index = uvm_test_rng_range_32(&rng, 0, num_streams - 1);
                uvm_test_stream_t *random_stream = &streams[random_stream_index];
                uvm_push_acquire_tracker(&stream->push, &random_stream->tracker);
                snapshot_counter(&stream->push,
                                 random_stream->counter_mem,
                                 stream->other_stream_counter_snapshots_mem,
                                 i,
                                 random_stream->queued_counter_repeat);
            }

            uvm_push_end(&stream->push);
            uvm_tracker_clear(&stream->tracker);
            TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&stream->tracker, &stream->push), done);
        }

        // Randomly schedule other threads
        if (uvm_test_rng_range_32(&rng, 0, 9) == 0)
            schedule();
    }

    if (verbose > 0)
        UVM_TEST_PRINT("All work scheduled\n");

    // Let other threads run
    schedule();

    for (i = 0; i < num_streams; ++i) {
        uvm_test_stream_t *stream = &streams[i];
        status = uvm_tracker_wait(&stream->tracker);
        if (status != NV_OK) {
            UVM_TEST_PRINT("Failed to wait for the tracker for stream %u: %s\n", i, nvstatusToString(status));
            goto done;
        }
        for (j = 0; j < iterations_per_stream; ++j) {
            NvU32 snapshot_last = stream->counter_snapshots[j * 2];
            NvU32 snapshot_first = stream->counter_snapshots[j * 2 + 1];
            if (snapshot_last != j || snapshot_first != j) {
                UVM_TEST_PRINT("Stream %u counter snapshot[%u] = %u,%u instead of %u,%u\n",
                               i,
                               j,
                               snapshot_last,
                               snapshot_first,
                               j,
                               j);
                status = NV_ERR_INVALID_STATE;
                goto done;
            }
        }
        for (j = 0; j < iterations_per_stream; ++j) {
            NvU32 snapshot_last = stream->other_stream_counter_snapshots[j * 2];
            NvU32 snapshot_first = stream->other_stream_counter_snapshots[j * 2 + 1];
            NvU32 expected = stream->other_stream_counter_expected[j];
            if (snapshot_last < expected || snapshot_first < expected) {
                UVM_TEST_PRINT("Stream %u other_counter snapshot[%u] = %u,%u which is < of %u,%u\n",
                               i,
                               j,
                               snapshot_last,
                               snapshot_first,
                               expected,
                               expected);
                status = NV_ERR_INVALID_STATE;
                goto done;
            }
        }
    }

    if (verbose > 0)
        UVM_TEST_PRINT("Verification done\n");

    schedule();

done:
    // Wait for all the trackers first before freeing memory, as streams
    // reference each other's buffers.
    for (i = 0; i < num_streams; ++i) {
        uvm_test_stream_t *stream = &streams[i];
        uvm_tracker_wait(&stream->tracker);
    }

    for (i = 0; i < num_streams; ++i) {
        uvm_test_stream_t *stream = &streams[i];
        uvm_gpu_semaphore_free(&stream->semaphore);
        uvm_rm_mem_free(stream->other_stream_counter_snapshots_mem);
        uvm_rm_mem_free(stream->counter_snapshots_mem);
        uvm_rm_mem_free(stream->counter_mem);
        uvm_tracker_deinit(&stream->tracker);
        uvm_kvfree(stream->other_stream_counter_expected);
    }
    uvm_kvfree(streams);

    if (verbose > 0)
        UVM_TEST_PRINT("Cleanup done\n");

    return status;
}

// The following test is inspired by uvm_push_test.c:test_concurrent_pushes.
// This test verifies that concurrent pushes using the same secure channel pool
// select different channels.
NV_STATUS test_secure_channel_selection(uvm_va_space_t *va_space)
{
    NV_STATUS status = NV_OK;
    uvm_channel_pool_t *pool;
    uvm_push_t *pushes;
    uvm_gpu_t *gpu;
    NvU32 i;
    NvU32 num_pushes;

    gpu = uvm_va_space_find_first_gpu(va_space);

    if (!uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    uvm_thread_context_lock_disable_tracking();

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_type_t channel_type;

        for (channel_type = 0; channel_type < UVM_CHANNEL_TYPE_COUNT; channel_type++) {
            if (!uvm_channel_type_requires_secure_pool(gpu, channel_type))
                continue;

            pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
            TEST_CHECK_RET(pool != NULL);

            // Skip LCIC channels as those can't accept any pushes
            if (uvm_channel_pool_is_lcic(pool))
                continue;

            if (pool->num_channels < 2)
                continue;

            num_pushes = min(pool->num_channels, (NvU32)UVM_PUSH_MAX_CONCURRENT_PUSHES);

            pushes = uvm_kvmalloc_zero(sizeof(*pushes) * num_pushes);
            TEST_CHECK_RET(pushes != NULL);

            for (i = 0; i < num_pushes; i++) {
                uvm_push_t *push = &pushes[i];
                status = uvm_push_begin(gpu->channel_manager, channel_type, push, "concurrent push %u", i);
                TEST_NV_CHECK_GOTO(status, error);

                // Consecutive concurrent pushes on the same pool must land on
                // different channels.
                if (i > 0)
                    TEST_CHECK_GOTO(pushes[i - 1].channel != push->channel, error);
            }

            for (i = 0; i < num_pushes; i++) {
                uvm_push_t *push = &pushes[i];
                status = uvm_push_end_and_wait(push);
                TEST_NV_CHECK_GOTO(status, error);
            }

            uvm_kvfree(pushes);
        }
    }

    uvm_thread_context_lock_enable_tracking();

    return status;

error:
    uvm_thread_context_lock_enable_tracking();
    uvm_kvfree(pushes);

    return status;
}

NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_manager_t *manager = gpu->channel_manager;
        uvm_channel_pool_t *pool;

        uvm_for_each_pool(pool, manager) {
            uvm_channel_t *channel;

            // Skip LCIC channels as those can't accept any pushes
            if (uvm_channel_pool_is_lcic(pool))
                continue;

            // Skip WLC channels as those can't accept ctrl gpfifos
            // after their schedule is set up
            if (uvm_channel_pool_is_wlc(pool))
                continue;

            uvm_for_each_channel_in_pool(channel, pool) {
                NvU32 i;

                if (uvm_channel_is_proxy(channel))
                    continue;

                // We submit 8x the channel's GPFIFO entries to force a few
                // complete loops in the GPFIFO circular buffer.
                for (i = 0; i < 8 * channel->num_gpfifo_entries; i++) {
                    NvU64 entry;
                    gpu->parent->host_hal->set_gpfifo_noop(&entry);
                    TEST_NV_CHECK_RET(uvm_channel_write_ctrl_gpfifo(channel, entry));
                }
            }
        }
    }

    return NV_OK;
}

NV_STATUS test_write_ctrl_gpfifo_and_pushes(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_manager_t *manager = gpu->channel_manager;
        uvm_channel_pool_t *pool;

        uvm_for_each_pool(pool, manager) {
            uvm_channel_t *channel;

            // Skip LCIC channels as those can't accept any pushes
            if (uvm_channel_pool_is_lcic(pool))
                continue;

            // Skip WLC channels as those can't accept ctrl gpfifos
            // after their schedule is set up
            if (uvm_channel_pool_is_wlc(pool))
                continue;

            uvm_for_each_channel_in_pool(channel, pool) {
                NvU32 i;
                uvm_push_t push;

                if (uvm_channel_is_proxy(channel))
                    continue;

                // We submit 8x the channel's GPFIFO entries to force a few
                // complete loops in the GPFIFO circular buffer, alternating
                // between control GPFIFO entries and regular pushes.
                for (i = 0; i < 8 * channel->num_gpfifo_entries; i++) {
                    if (i % 2 == 0) {
                        NvU64 entry;
                        gpu->parent->host_hal->set_gpfifo_noop(&entry);
                        TEST_NV_CHECK_RET(uvm_channel_write_ctrl_gpfifo(channel, entry));
                    }
                    else {
                        TEST_NV_CHECK_RET(uvm_push_begin_on_channel(channel, &push, "gpfifo ctrl and push test"));
                        uvm_push_end(&push);
                    }
                }

                TEST_NV_CHECK_RET(uvm_push_wait(&push));
            }
        }
    }

    return NV_OK;
}

NV_STATUS test_write_ctrl_gpfifo_tight(uvm_va_space_t *va_space)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu;
    uvm_channel_t *channel;
    uvm_rm_mem_t *mem;
    NvU32 *cpu_ptr;
    NvU64 gpu_va;
    NvU32 i;
    NvU64 entry;
    uvm_push_t push;

    gpu = uvm_va_space_find_first_gpu(va_space);

    // TODO: Bug 3839176: the test is waived on Confidential Computing because
    // it assumes that the GPU can access system memory without using
    // encryption.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_manager_t *manager = gpu->channel_manager;

        TEST_NV_CHECK_RET(uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, sizeof(*cpu_ptr), 0, &mem));
        cpu_ptr = uvm_rm_mem_get_cpu_va(mem);
        gpu_va = uvm_rm_mem_get_gpu_uvm_va(mem, gpu);

        *cpu_ptr = 0;

        // This semaphore acquire takes one GPFIFO entry.
        TEST_NV_CHECK_GOTO(uvm_push_begin(manager, UVM_CHANNEL_TYPE_GPU_TO_GPU, &push, "gpfifo ctrl tight test acq"),
                           error);

        channel = push.channel;
        UVM_ASSERT(!uvm_channel_is_proxy(channel));

        gpu->parent->host_hal->semaphore_acquire(&push, gpu_va, 1);
        uvm_push_end(&push);

        // Flush all completed entries from the GPFIFO ring buffer. This test
        // requires the flush because we verify below, with
        // uvm_channel_get_available_gpfifo_entries, the number of free entries
        // in the channel.
        uvm_channel_update_progress_all(channel);

        // Populate the remaining GPFIFO entries, leaving 2 slots available:
        // 2 available entries + 1 semaphore acquire (above) + 1 spare entry to
        // indicate a terminal condition for the GPFIFO ring buffer, therefore
        // we push num_gpfifo_entries - 4.
        for (i = 0; i < channel->num_gpfifo_entries - 4; i++) {
            TEST_NV_CHECK_GOTO(uvm_push_begin_on_channel(channel, &push, "gpfifo ctrl tight test populate"), error);
            uvm_push_end(&push);
        }

        TEST_CHECK_GOTO(uvm_channel_get_available_gpfifo_entries(channel) == 2, error);

        // We should have room for the control GPFIFO and the subsequent
        // semaphore release.
        gpu->parent->host_hal->set_gpfifo_noop(&entry);
        TEST_NV_CHECK_GOTO(uvm_channel_write_ctrl_gpfifo(channel, entry), error);

        // Release the semaphore.
        UVM_WRITE_ONCE(*cpu_ptr, 1);

        TEST_NV_CHECK_GOTO(uvm_push_wait(&push), error);

        uvm_rm_mem_free(mem);
    }

    return NV_OK;

error:
    uvm_rm_mem_free(mem);

    return status;
}

// This test is inspired by test_rc (above).
// The test re-creates the GPU's channel manager, forcing its pushbuffer to be
// mapped on a non-zero 1TB segment. This exercises work submission from
// pushbuffers whose VAs are greater than 1TB.
static NV_STATUS test_channel_pushbuffer_extension_base(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;
    NV_STATUS status = NV_OK;

    uvm_assert_mutex_locked(&g_uvm_global.global_lock);

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_manager_t *manager;
        uvm_channel_pool_t *pool;

        if (!uvm_gpu_has_pushbuffer_segments(gpu))
            continue;

        // The GPU channel manager pushbuffer is destroyed and then re-created
        // after testing a non-zero pushbuffer extension base, so this test
        // requires exclusive access to the GPU.
        TEST_CHECK_RET(uvm_gpu_retained_count(gpu) == 1);

        gpu->uvm_test_force_upper_pushbuffer_segment = 1;
        uvm_channel_manager_destroy(gpu->channel_manager);
        TEST_NV_CHECK_GOTO(uvm_channel_manager_create(gpu, &gpu->channel_manager), error);
        gpu->uvm_test_force_upper_pushbuffer_segment = 0;

        manager = gpu->channel_manager;
        TEST_CHECK_GOTO(uvm_pushbuffer_get_gpu_va_base(manager->pushbuffer) >= (1ull << 40), error);

        // Submit a few pushes with the recently allocated
        // channel_manager->pushbuffer.
        uvm_for_each_pool(pool, manager) {
            uvm_channel_t *channel;

            // Skip LCIC channels as those can't accept any pushes
            if (uvm_channel_pool_is_lcic(pool))
                continue;

            uvm_for_each_channel_in_pool(channel, pool) {
                NvU32 i;
                uvm_push_t push;

                for (i = 0; i < channel->num_gpfifo_entries; i++) {
                    TEST_NV_CHECK_GOTO(uvm_push_begin_on_channel(channel, &push, "pushbuffer extension push test"),
                                       error);
                    uvm_push_end(&push);
                }

                TEST_NV_CHECK_GOTO(uvm_push_wait(&push), error);
            }
        }
    }

    return NV_OK;

error:
    gpu->uvm_test_force_upper_pushbuffer_segment = 0;

    return status;
}

NV_STATUS uvm_test_channel_sanity(UVM_TEST_CHANNEL_SANITY_PARAMS *params, struct file *filp)
{
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    uvm_mutex_lock(&g_uvm_global.global_lock);
    uvm_va_space_down_read_rm(va_space);

    status = test_ordering(va_space);
    if (status != NV_OK)
        goto done;

    status = test_write_ctrl_gpfifo_noop(va_space);
    if (status != NV_OK)
        goto done;

    status = test_write_ctrl_gpfifo_and_pushes(va_space);
    if (status != NV_OK)
        goto done;

    status = test_write_ctrl_gpfifo_tight(va_space);
    if (status != NV_OK)
        goto done;

    status = test_secure_channel_selection(va_space);
    if (status != NV_OK)
        goto done;

    // The following tests have side effects: they reset the GPU's
    // channel_manager.
    status = test_channel_pushbuffer_extension_base(va_space);
    if (status != NV_OK)
        goto done;

    g_uvm_global.disable_fatal_error_assert = true;
    uvm_release_asserts_set_global_error_for_tests = true;
    status = test_unexpected_completed_values(va_space);
    uvm_release_asserts_set_global_error_for_tests = false;
    g_uvm_global.disable_fatal_error_assert = false;
    if (status != NV_OK)
        goto done;

    if (g_uvm_global.num_simulated_devices == 0) {
        status = test_rc(va_space);
        if (status != NV_OK)
            goto done;
    }

done:
    uvm_va_space_up_read_rm(va_space);
    uvm_mutex_unlock(&g_uvm_global.global_lock);

    return status;
}

static NV_STATUS uvm_test_channel_stress_stream(uvm_va_space_t *va_space,
                                                const UVM_TEST_CHANNEL_STRESS_PARAMS *params)
{
    NV_STATUS status = NV_OK;

    if (params->iterations == 0 || params->num_streams == 0)
        return NV_ERR_INVALID_PARAMETER;

    // TODO: Bug 1764963: Rework the test to not rely on the global lock as
    // that serializes all the threads calling this at the same time.
    uvm_mutex_lock(&g_uvm_global.global_lock);
    uvm_va_space_down_read_rm(va_space);

    // TODO: Bug 3839176: the test is waived on Confidential Computing because
    // it assumes that the GPU can access system memory without using
    // encryption.
    if (uvm_conf_computing_mode_enabled(uvm_va_space_find_first_gpu(va_space)))
        goto done;

    status = stress_test_all_gpus_in_va(va_space,
                                        params->num_streams,
                                        params->iterations,
                                        params->seed,
                                        params->verbose);

done:
    uvm_va_space_up_read_rm(va_space);
    uvm_mutex_unlock(&g_uvm_global.global_lock);

    return status;
}

static NV_STATUS uvm_test_channel_stress_update_channels(uvm_va_space_t *va_space,
                                                         const UVM_TEST_CHANNEL_STRESS_PARAMS *params)
{
    NV_STATUS status = NV_OK;
    uvm_test_rng_t rng;
    NvU32 i;

    uvm_test_rng_init(&rng, params->seed);

    uvm_va_space_down_read(va_space);

    for (i = 0; i < params->iterations; ++i) {
        uvm_gpu_t *gpu = random_va_space_gpu(&rng, va_space);
        uvm_channel_manager_update_progress(gpu->channel_manager);

        if (fatal_signal_pending(current)) {
            status = NV_ERR_SIGNAL_PENDING;
            goto done;
        }
    }

done:
    uvm_va_space_up_read(va_space);

    return status;
}

static NV_STATUS uvm_test_channel_noop_push(uvm_va_space_t *va_space,
                                            const UVM_TEST_CHANNEL_STRESS_PARAMS *params)
{
    NV_STATUS status = NV_OK;
    uvm_push_t push;
    uvm_test_rng_t rng;
    uvm_gpu_t *gpu;
    NvU32 i;

    uvm_test_rng_init(&rng, params->seed);

    uvm_va_space_down_read(va_space);

    for (i = 0; i < params->iterations; ++i) {
        uvm_channel_type_t channel_type = random_ce_channel_type(&rng);
        gpu = random_va_space_gpu(&rng, va_space);

        status = uvm_push_begin(gpu->channel_manager, channel_type, &push, "noop push");
        if (status != NV_OK)
            goto done;

        // Push an actual noop method so that the push doesn't get optimized
        // away if we ever detect empty pushes.
        gpu->parent->host_hal->noop(&push, UVM_METHOD_SIZE);

        uvm_push_end(&push);

        if (fatal_signal_pending(current)) {
            status = NV_ERR_SIGNAL_PENDING;
            goto done;
        }
    }

    if (params->verbose > 0)
        UVM_TEST_PRINT("Noop pushes: completed %u pushes seed: %u\n", i, params->seed);

    for_each_va_space_gpu_in_mask(gpu, va_space, &va_space->registered_gpu_va_spaces) {
        NV_STATUS wait_status = uvm_channel_manager_wait(gpu->channel_manager);
        if (status == NV_OK)
            status = wait_status;
    }

done:
    uvm_va_space_up_read(va_space);

    return status;
}

NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    switch (params->mode) {
        case UVM_TEST_CHANNEL_STRESS_MODE_STREAM:
            return uvm_test_channel_stress_stream(va_space, params);
        case UVM_TEST_CHANNEL_STRESS_MODE_UPDATE_CHANNELS:
            return uvm_test_channel_stress_update_channels(va_space, params);
        case UVM_TEST_CHANNEL_STRESS_MODE_NOOP_PUSH:
            return uvm_test_channel_noop_push(va_space, params);
        default:
            return NV_ERR_INVALID_PARAMETER;
    }
}