1 /*
2 * Copyright 2011-2013 Blender Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifdef WITH_CUDA
18
19 # include <climits>
20 # include <limits.h>
21 # include <stdio.h>
22 # include <stdlib.h>
23 # include <string.h>
24
25 # include "device/cuda/device_cuda.h"
26 # include "device/device_intern.h"
27 # include "device/device_split_kernel.h"
28
29 # include "render/buffers.h"
30
31 # include "kernel/filter/filter_defines.h"
32
33 # include "util/util_debug.h"
34 # include "util/util_foreach.h"
35 # include "util/util_logging.h"
36 # include "util/util_map.h"
37 # include "util/util_md5.h"
38 # include "util/util_opengl.h"
39 # include "util/util_path.h"
40 # include "util/util_string.h"
41 # include "util/util_system.h"
42 # include "util/util_time.h"
43 # include "util/util_types.h"
44 # include "util/util_windows.h"
45
46 # include "kernel/split/kernel_split_data_types.h"
47
48 CCL_NAMESPACE_BEGIN
49
50 # ifndef WITH_CUDA_DYNLOAD
51
52 /* Transparently implement some functions, so the majority of the file does not need
53 * to worry about the difference between dynamically loaded and linked CUDA at all.
54 */
55
56 namespace {
57
58 const char *cuewErrorString(CUresult result)
59 {
60 /* We can only give the error code here without major code duplication, which
61 * should be enough since dynamic loading is only disabled by folks
62 * who know what they're doing anyway.
63 *
64 * NOTE: Avoid calling this from several threads.
65 */
66 static string error;
67 error = string_printf("%d", result);
68 return error.c_str();
69 }
70
71 const char *cuewCompilerPath()
72 {
73 return CYCLES_CUDA_NVCC_EXECUTABLE;
74 }
75
76 int cuewCompilerVersion()
77 {
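/* CUDA_VERSION from cuda.h encodes the toolkit version as e.g. 10010 for 10.1;
 * fold it into the (major * 10 + minor) form that callers below expect. */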
78 return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
79 }
80
81 } /* namespace */
82 # endif /* WITH_CUDA_DYNLOAD */
83
84 class CUDADevice;
85
86 class CUDASplitKernel : public DeviceSplitKernel {
87 CUDADevice *device;
88
89 public:
90 explicit CUDASplitKernel(CUDADevice *device);
91
92 virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
93
94 virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
95 RenderTile &rtile,
96 int num_global_elements,
97 device_memory &kernel_globals,
98 device_memory &kernel_data_,
99 device_memory &split_data,
100 device_memory &ray_state,
101 device_memory &queue_index,
102 device_memory &use_queues_flag,
103 device_memory &work_pool_wgs);
104
105 virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
106 const DeviceRequestedFeatures &);
107 virtual int2 split_kernel_local_size();
108 virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
109 };
110
111 /* Utility to push/pop CUDA context. */
112 class CUDAContextScope {
113 public:
114 CUDAContextScope(CUDADevice *device);
115 ~CUDAContextScope();
116
117 private:
118 CUDADevice *device;
119 };
120
121 bool CUDADevice::have_precompiled_kernels()
122 {
123 string cubins_path = path_get("lib");
124 return path_exists(cubins_path);
125 }
126
127 bool CUDADevice::show_samples() const
128 {
129 /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
130 return true;
131 }
132
133 BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
134 {
135 return BVH_LAYOUT_BVH2;
136 }
137
138 void CUDADevice::set_error(const string &error)
139 {
140 Device::set_error(error);
141
142 if (first_error) {
143 fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
144 fprintf(stderr,
145 "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
146 first_error = false;
147 }
148 }
149
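/* NOTE: the trailing (void)0 lets a call site write cuda_assert(...); with a semicolon. */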
150 # define cuda_assert(stmt) \
151 { \
152 CUresult result = stmt; \
153 if (result != CUDA_SUCCESS) { \
154 const char *name = cuewErrorString(result); \
155 set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
156 } \
157 } \
158 (void)0
159
160 CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
161 : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL)
162 {
163 first_error = true;
164 background = background_;
165
166 cuDevId = info.num;
167 cuDevice = 0;
168 cuContext = 0;
169
170 cuModule = 0;
171 cuFilterModule = 0;
172
173 split_kernel = NULL;
174
175 need_texture_info = false;
176
177 device_texture_headroom = 0;
178 device_working_headroom = 0;
179 move_texture_to_host = false;
180 map_host_limit = 0;
181 map_host_used = 0;
182 can_map_host = 0;
183 pitch_alignment = 0;
184
185 functions.loaded = false;
186
187 /* Initialize CUDA. */
188 CUresult result = cuInit(0);
189 if (result != CUDA_SUCCESS) {
190 set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
191 return;
192 }
193
194 /* Setup device and context. */
195 result = cuDeviceGet(&cuDevice, cuDevId);
196 if (result != CUDA_SUCCESS) {
197 set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
198 cuewErrorString(result)));
199 return;
200 }
201
202 /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
203 * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
204 * so we can predict which memory to map to host. */
205 cuda_assert(
206 cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
207
208 cuda_assert(cuDeviceGetAttribute(
209 &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
210
211 unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
212 if (can_map_host) {
213 ctx_flags |= CU_CTX_MAP_HOST;
214 init_host_memory();
215 }
216
217 /* Create context. */
218 if (background) {
219 result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
220 }
221 else {
222 result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
223
224 if (result != CUDA_SUCCESS) {
225 result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
226 background = true;
227 }
228 }
229
230 if (result != CUDA_SUCCESS) {
231 set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
232 return;
233 }
234
235 int major, minor;
236 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
237 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
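/* Encode the compute capability as major * 100 + minor * 10, e.g. 610 for an sm_61 device. */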
238 cuDevArchitecture = major * 100 + minor * 10;
239
240 /* Pop context set by cuCtxCreate. */
241 cuCtxPopCurrent(NULL);
242 }
243
244 CUDADevice::~CUDADevice()
245 {
246 task_pool.cancel();
247
248 delete split_kernel;
249
250 texture_info.free();
251
252 cuda_assert(cuCtxDestroy(cuContext));
253 }
254
255 bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
256 {
257 int major, minor;
258 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
259 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
260
261 /* We only support sm_30 and above */
262 if (major < 3) {
263 set_error(string_printf(
264 "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
265 return false;
266 }
267
268 return true;
269 }
270
271 bool CUDADevice::check_peer_access(Device *peer_device)
272 {
273 if (peer_device == this) {
274 return false;
275 }
276 if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
277 return false;
278 }
279
280 CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
281
282 int can_access = 0;
283 cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
284 if (can_access == 0) {
285 return false;
286 }
287
288 // Ensure array access over the link is possible as well (for 3D textures)
289 cuda_assert(cuDeviceGetP2PAttribute(&can_access,
290 CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
291 cuDevice,
292 peer_device_cuda->cuDevice));
293 if (can_access == 0) {
294 return false;
295 }
296
297 // Enable peer access in both directions
298 {
299 const CUDAContextScope scope(this);
300 CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
301 if (result != CUDA_SUCCESS) {
302 set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
303 cuewErrorString(result)));
304 return false;
305 }
306 }
307 {
308 const CUDAContextScope scope(peer_device_cuda);
309 CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
310 if (result != CUDA_SUCCESS) {
311 set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
312 cuewErrorString(result)));
313 return false;
314 }
315 }
316
317 return true;
318 }
319
320 bool CUDADevice::use_adaptive_compilation()
321 {
322 return DebugFlags().cuda.adaptive_compile;
323 }
324
325 bool CUDADevice::use_split_kernel()
326 {
327 return DebugFlags().cuda.split_kernel;
328 }
329
330 /* Common NVCC flags which stay the same regardless of shading model and
331 * kernel sources MD5, and only depend on the compiler or compilation settings.
332 */
333 string CUDADevice::compile_kernel_get_common_cflags(
334 const DeviceRequestedFeatures &requested_features, bool filter, bool split)
335 {
336 const int machine = system_cpu_bits();
337 const string source_path = path_get("source");
338 const string include_path = source_path;
339 string cflags = string_printf(
340 "-m%d "
341 "--ptxas-options=\"-v\" "
342 "--use_fast_math "
343 "-DNVCC "
344 "-I\"%s\"",
345 machine,
346 include_path.c_str());
347 if (!filter && use_adaptive_compilation()) {
348 cflags += " " + requested_features.get_build_options();
349 }
350 const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
351 if (extra_cflags) {
352 cflags += string(" ") + string(extra_cflags);
353 }
354 # ifdef WITH_CYCLES_DEBUG
355 cflags += " -D__KERNEL_DEBUG__";
356 # endif
357
358 if (split) {
359 cflags += " -D__SPLIT__";
360 }
361
362 # ifdef WITH_NANOVDB
363 cflags += " -DWITH_NANOVDB";
364 # endif
365
366 return cflags;
367 }
368
369 string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features,
370 const char *name,
371 const char *base,
372 bool force_ptx)
373 {
374 /* Compute kernel name. */
375 int major, minor;
376 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
377 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
378
379 /* Attempt to use kernel provided with Blender. */
380 if (!use_adaptive_compilation()) {
381 if (!force_ptx) {
382 const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
383 VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
384 if (path_exists(cubin)) {
385 VLOG(1) << "Using precompiled kernel.";
386 return cubin;
387 }
388 }
389
390 /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
391 int ptx_major = major, ptx_minor = minor;
392 while (ptx_major >= 3) {
393 const string ptx = path_get(
394 string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
395 VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
396 if (path_exists(ptx)) {
397 VLOG(1) << "Using precompiled kernel.";
398 return ptx;
399 }
400
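/* Walk down through compute versions (e.g. 75, 74, ..., 70, 69, ...) until a
 * shipped PTX is found or we drop below the minimum supported sm_30. */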
401 if (ptx_minor > 0) {
402 ptx_minor--;
403 }
404 else {
405 ptx_major--;
406 ptx_minor = 9;
407 }
408 }
409 }
410
411 /* Try to use locally compiled kernel. */
412 string source_path = path_get("source");
413 const string source_md5 = path_files_md5_hash(source_path);
414
415 /* We include the cflags in the MD5 hash, so that changing the CUDA toolkit or other
416 * compiler command line arguments makes sure the cubin gets re-built.
417 */
418 string common_cflags = compile_kernel_get_common_cflags(
419 requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL);
420 const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
421
422 const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
423 const char *const kernel_arch = force_ptx ? "compute" : "sm";
424 const string cubin_file = string_printf(
425 "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
426 const string cubin = path_cache_get(path_join("kernels", cubin_file));
427 VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
428 if (path_exists(cubin)) {
429 VLOG(1) << "Using locally compiled kernel.";
430 return cubin;
431 }
432
433 # ifdef _WIN32
434 if (!use_adaptive_compilation() && have_precompiled_kernels()) {
435 if (major < 3) {
436 set_error(
437 string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
438 "Your GPU is not supported.",
439 major,
440 minor));
441 }
442 else {
443 set_error(
444 string_printf("CUDA binary kernel for this graphics card compute "
445 "capability (%d.%d) not found.",
446 major,
447 minor));
448 }
449 return string();
450 }
451 # endif
452
453 /* Compile. */
454 const char *const nvcc = cuewCompilerPath();
455 if (nvcc == NULL) {
456 set_error(
457 "CUDA nvcc compiler not found. "
458 "Install CUDA toolkit in default location.");
459 return string();
460 }
461
462 const int nvcc_cuda_version = cuewCompilerVersion();
463 VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
464 if (nvcc_cuda_version < 80) {
465 printf(
466 "Unsupported CUDA version %d.%d detected, "
467 "you need CUDA 8.0 or newer.\n",
468 nvcc_cuda_version / 10,
469 nvcc_cuda_version % 10);
470 return string();
471 }
472 else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102)) {
473 printf(
474 "CUDA version %d.%d detected, build may succeed but only "
475 "CUDA 10.1 and 10.2 are officially supported.\n",
476 nvcc_cuda_version / 10,
477 nvcc_cuda_version % 10);
478 }
479
480 double starttime = time_dt();
481
482 path_create_directories(cubin);
483
484 source_path = path_join(path_join(source_path, "kernel"),
485 path_join("kernels", path_join(base, string_printf("%s.cu", name))));
486
487 string command = string_printf(
488 "\"%s\" "
489 "-arch=%s_%d%d "
490 "--%s \"%s\" "
491 "-o \"%s\" "
492 "%s",
493 nvcc,
494 kernel_arch,
495 major,
496 minor,
497 kernel_ext,
498 source_path.c_str(),
499 cubin.c_str(),
500 common_cflags.c_str());
501
502 printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
503
504 # ifdef _WIN32
505 command = "call " + command;
506 # endif
507 if (system(command.c_str()) != 0) {
508 set_error(
509 "Failed to execute compilation command, "
510 "see console for details.");
511 return string();
512 }
513
514 /* Verify if compilation succeeded */
515 if (!path_exists(cubin)) {
516 set_error(
517 "CUDA kernel compilation failed, "
518 "see console for details.");
519 return string();
520 }
521
522 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
523
524 return cubin;
525 }
526
527 bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
528 {
529 /* TODO(sergey): Support kernels re-load for CUDA devices.
530 *
531 * Currently re-loading the kernel will invalidate memory pointers,
532 * causing problems in cuCtxSynchronize.
533 */
534 if (cuFilterModule && cuModule) {
535 VLOG(1) << "Skipping kernel reload, not currently supported.";
536 return true;
537 }
538
539 /* check if cuda init succeeded */
540 if (cuContext == 0)
541 return false;
542
543 /* check if GPU is supported */
544 if (!support_device(requested_features))
545 return false;
546
547 /* get kernel */
548 const char *kernel_name = use_split_kernel() ? "kernel_split" : "kernel";
549 string cubin = compile_kernel(requested_features, kernel_name);
550 if (cubin.empty())
551 return false;
552
553 const char *filter_name = "filter";
554 string filter_cubin = compile_kernel(requested_features, filter_name);
555 if (filter_cubin.empty())
556 return false;
557
558 /* open module */
559 CUDAContextScope scope(this);
560
561 string cubin_data;
562 CUresult result;
563
564 if (path_read_text(cubin, cubin_data))
565 result = cuModuleLoadData(&cuModule, cubin_data.c_str());
566 else
567 result = CUDA_ERROR_FILE_NOT_FOUND;
568
569 if (result != CUDA_SUCCESS)
570 set_error(string_printf(
571 "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
572
573 if (path_read_text(filter_cubin, cubin_data))
574 result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
575 else
576 result = CUDA_ERROR_FILE_NOT_FOUND;
577
578 if (result != CUDA_SUCCESS)
579 set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)",
580 filter_cubin.c_str(),
581 cuewErrorString(result)));
582
583 if (result == CUDA_SUCCESS) {
584 reserve_local_memory(requested_features);
585 }
586
587 load_functions();
588
589 return (result == CUDA_SUCCESS);
590 }
591
592 void CUDADevice::load_functions()
593 {
594 /* TODO: load all functions here. */
595 if (functions.loaded) {
596 return;
597 }
598 functions.loaded = true;
599
600 cuda_assert(cuModuleGetFunction(
601 &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
602 cuda_assert(cuModuleGetFunction(
603 &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
604 cuda_assert(cuModuleGetFunction(
605 &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
606 cuda_assert(cuModuleGetFunction(
607 &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
608
609 cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
610 cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
611 cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
612 cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
613
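/* Query a reasonable block size for the adaptive sampling kernels once; only the
 * threads-per-block value is kept, the minimum block count is ignored. */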
614 int unused_min_blocks;
615 cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
616 &functions.adaptive_num_threads_per_block,
617 functions.adaptive_scale_samples,
618 NULL,
619 0,
620 0));
621 }
622
623 void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
624 {
625 if (use_split_kernel()) {
626 /* The split kernel mostly uses global memory and adaptive compilation,
627 * so it is currently difficult to predict how much is needed. */
628 return;
629 }
630
631 /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
632 * needed for kernel launches, so that we can reliably figure out when
633 * to allocate scene data in mapped host memory. */
634 CUDAContextScope scope(this);
635
636 size_t total = 0, free_before = 0, free_after = 0;
637 cuMemGetInfo(&free_before, &total);
638
639 /* Get kernel function. */
640 CUfunction cuRender;
641
642 if (requested_features.use_baking) {
643 cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
644 }
645 else if (requested_features.use_integrator_branched) {
646 cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
647 }
648 else {
649 cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
650 }
651
652 cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
653
654 int min_blocks, num_threads_per_block;
655 cuda_assert(
656 cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
657
658 /* Launch the kernel; using just 1 block appears sufficient to reserve
659 * memory for all multiprocessors. It would still be good to do this in
660 * parallel for the multi-GPU case to make it faster. */
661 CUdeviceptr d_work_tiles = 0;
662 uint total_work_size = 0;
663
664 void *args[] = {&d_work_tiles, &total_work_size};
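/* The launch does no actual work (zero work size, null work tiles); it only exists to
 * make the driver allocate the local memory for this kernel. */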
665
666 cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
667
668 cuda_assert(cuCtxSynchronize());
669
670 cuMemGetInfo(&free_after, &total);
671 VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
672 << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
673
674 # if 0
675 /* For testing mapped host memory, fill up device memory. */
676 const size_t keep_mb = 1024;
677
678 while (free_after > keep_mb * 1024 * 1024LL) {
679 CUdeviceptr tmp;
680 cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
681 cuMemGetInfo(&free_after, &total);
682 }
683 # endif
684 }
685
686 void CUDADevice::init_host_memory()
687 {
688 /* Limit amount of host mapped memory, because allocating too much can
689 * cause system instability. Leave at least half or 4 GB of system
690 * memory free, whichever is smaller. */
691 size_t default_limit = 4 * 1024 * 1024 * 1024LL;
692 size_t system_ram = system_physical_ram();
693
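/* For example, with 32 GB of system RAM the limit becomes 28 GB (leave 4 GB free),
 * while with 6 GB of RAM it becomes 3 GB (leave half free). */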
694 if (system_ram > 0) {
695 if (system_ram / 2 > default_limit) {
696 map_host_limit = system_ram - default_limit;
697 }
698 else {
699 map_host_limit = system_ram / 2;
700 }
701 }
702 else {
703 VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
704 map_host_limit = 0;
705 }
706
707 /* Amount of device memory to keep free after texture memory
708 * and working memory allocations respectively. We set the working
709 * memory limit headroom lower so that some space is left after all
710 * texture memory allocations. */
711 device_working_headroom = 32 * 1024 * 1024LL; // 32MB
712 device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
713
714 VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
715 << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
716 }
717
718 void CUDADevice::load_texture_info()
719 {
720 if (need_texture_info) {
721 texture_info.copy_to_device();
722 need_texture_info = false;
723 }
724 }
725
726 void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
727 {
728 /* Break out of recursive call, which can happen when moving memory on a multi device. */
729 static bool any_device_moving_textures_to_host = false;
730 if (any_device_moving_textures_to_host) {
731 return;
732 }
733
734 /* Signal to reallocate textures in host memory only. */
735 move_texture_to_host = true;
736
737 while (size > 0) {
738 /* Find suitable memory allocation to move. */
739 device_memory *max_mem = NULL;
740 size_t max_size = 0;
741 bool max_is_image = false;
742
743 foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
744 device_memory &mem = *pair.first;
745 CUDAMem *cmem = &pair.second;
746
747 /* Can only move textures allocated on this device (and not those from peer devices).
748 * And need to ignore memory that is already on the host. */
749 if (!mem.is_resident(this) || cmem->use_mapped_host) {
750 continue;
751 }
752
753 bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
754 (&mem != &texture_info);
755 bool is_image = is_texture && (mem.data_height > 1);
756
757 /* Can't move this type of memory. */
758 if (!is_texture || cmem->array) {
759 continue;
760 }
761
762 /* For other textures, only move image textures. */
763 if (for_texture && !is_image) {
764 continue;
765 }
766
767 /* Try to move largest allocation, prefer moving images. */
768 if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
769 max_is_image = is_image;
770 max_size = mem.device_size;
771 max_mem = &mem;
772 }
773 }
774
775 /* Move to host memory. This part is mutex protected since
776 * multiple CUDA devices could be moving the memory. The
777 * first one will do it, and the rest will adopt the pointer. */
778 if (max_mem) {
779 VLOG(1) << "Move memory from device to host: " << max_mem->name;
780
781 static thread_mutex move_mutex;
782 thread_scoped_lock lock(move_mutex);
783
784 any_device_moving_textures_to_host = true;
785
786 /* Potentially need to call back into multi device, so pointer mapping
787 * and peer devices are updated. This is also necessary since the device
788 * pointer may just be a key here, so cannot be accessed and freed directly.
789 * Unfortunately it does mean that memory is reallocated on all other
790 * devices as well, which is potentially dangerous when still in use (since
791 * a thread rendering on another device would only be caught in this mutex
792 * if it happens to do an allocation at the same time as well). */
793 max_mem->device_copy_to();
794 size = (max_size >= size) ? 0 : size - max_size;
795
796 any_device_moving_textures_to_host = false;
797 }
798 else {
799 break;
800 }
801 }
802
803 /* Unset flag before texture info is reloaded, since it should stay in device memory. */
804 move_texture_to_host = false;
805
806 /* Update texture info array with new pointers. */
807 load_texture_info();
808 }
809
810 CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
811 {
812 CUDAContextScope scope(this);
813
814 CUdeviceptr device_pointer = 0;
815 size_t size = mem.memory_size() + pitch_padding;
816
817 CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
818 const char *status = "";
819
820 /* First try allocating in device memory, respecting headroom. We make
821 * an exception for texture info. It is small and frequently accessed,
822 * so treat it as working memory.
823 *
824 * If there is not enough room for working memory, we will try to move
825 * textures to host memory, assuming the performance impact would have
826 * been worse for working memory. */
827 bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
828 bool is_image = is_texture && (mem.data_height > 1);
829
830 size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
831
832 size_t total = 0, free = 0;
833 cuMemGetInfo(&free, &total);
834
835 /* Move textures to host memory if needed. */
836 if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
837 move_textures_to_host(size + headroom - free, is_texture);
838 cuMemGetInfo(&free, &total);
839 }
840
841 /* Allocate in device memory. */
842 if (!move_texture_to_host && (size + headroom) < free) {
843 mem_alloc_result = cuMemAlloc(&device_pointer, size);
844 if (mem_alloc_result == CUDA_SUCCESS) {
845 status = " in device memory";
846 }
847 }
848
849 /* Fall back to mapped host memory if needed and possible. */
850
851 void *shared_pointer = 0;
852
853 if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
854 if (mem.shared_pointer) {
855 /* Another device already allocated host memory. */
856 mem_alloc_result = CUDA_SUCCESS;
857 shared_pointer = mem.shared_pointer;
858 }
859 else if (map_host_used + size < map_host_limit) {
860 /* Allocate host memory ourselves. */
861 mem_alloc_result = cuMemHostAlloc(
862 &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
863
864 assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
865 (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
866 }
867
868 if (mem_alloc_result == CUDA_SUCCESS) {
869 cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
870 map_host_used += size;
871 status = " in host memory";
872 }
873 }
874
875 if (mem_alloc_result != CUDA_SUCCESS) {
876 status = " failed, out of device and host memory";
877 set_error("System is out of GPU and shared host memory");
878 }
879
880 if (mem.name) {
881 VLOG(1) << "Buffer allocate: " << mem.name << ", "
882 << string_human_readable_number(mem.memory_size()) << " bytes. ("
883 << string_human_readable_size(mem.memory_size()) << ")" << status;
884 }
885
886 mem.device_pointer = (device_ptr)device_pointer;
887 mem.device_size = size;
888 stats.mem_alloc(size);
889
890 if (!mem.device_pointer) {
891 return NULL;
892 }
893
894 /* Insert into map of allocations. */
895 CUDAMem *cmem = &cuda_mem_map[&mem];
896 if (shared_pointer != 0) {
897 /* Replace host pointer with our host allocation. Only works if
898 * CUDA memory layout is the same and has no pitch padding. Also
899 * does not work if we move textures to host during a render,
900 * since other devices might be using the memory. */
901
902 if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
903 mem.host_pointer != shared_pointer) {
904 memcpy(shared_pointer, mem.host_pointer, size);
905
906 /* A call to device_memory::host_free() should be preceded by
907 * a call to device_memory::device_free() for host memory
908 * allocated by a device to be handled properly. Two exceptions
909 * are here and a call in OptiXDevice::generic_alloc(), where
910 * the current host memory can be assumed to be allocated by
911 * device_memory::host_alloc(), not by a device. */
912
913 mem.host_free();
914 mem.host_pointer = shared_pointer;
915 }
916 mem.shared_pointer = shared_pointer;
917 mem.shared_counter++;
918 cmem->use_mapped_host = true;
919 }
920 else {
921 cmem->use_mapped_host = false;
922 }
923
924 return cmem;
925 }
926
927 void CUDADevice::generic_copy_to(device_memory &mem)
928 {
929 if (!mem.host_pointer || !mem.device_pointer) {
930 return;
931 }
932
933 /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
934 * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
935 * mem.host_pointer. */
936 if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
937 const CUDAContextScope scope(this);
938 cuda_assert(
939 cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
940 }
941 }
942
943 void CUDADevice::generic_free(device_memory &mem)
944 {
945 if (mem.device_pointer) {
946 CUDAContextScope scope(this);
947 const CUDAMem &cmem = cuda_mem_map[&mem];
948
949 /* If cmem.use_mapped_host is true, reference counting is used
950 * to safely free the mapped host memory. */
951
952 if (cmem.use_mapped_host) {
953 assert(mem.shared_pointer);
954 if (mem.shared_pointer) {
955 assert(mem.shared_counter > 0);
956 if (--mem.shared_counter == 0) {
957 if (mem.host_pointer == mem.shared_pointer) {
958 mem.host_pointer = 0;
959 }
960 cuMemFreeHost(mem.shared_pointer);
961 mem.shared_pointer = 0;
962 }
963 }
964 map_host_used -= mem.device_size;
965 }
966 else {
967 /* Free device memory. */
968 cuda_assert(cuMemFree(mem.device_pointer));
969 }
970
971 stats.mem_free(mem.device_size);
972 mem.device_pointer = 0;
973 mem.device_size = 0;
974
975 cuda_mem_map.erase(cuda_mem_map.find(&mem));
976 }
977 }
978
979 void CUDADevice::mem_alloc(device_memory &mem)
980 {
981 if (mem.type == MEM_PIXELS && !background) {
982 pixels_alloc(mem);
983 }
984 else if (mem.type == MEM_TEXTURE) {
985 assert(!"mem_alloc not supported for textures.");
986 }
987 else if (mem.type == MEM_GLOBAL) {
988 assert(!"mem_alloc not supported for global memory.");
989 }
990 else {
991 generic_alloc(mem);
992 }
993 }
994
995 void CUDADevice::mem_copy_to(device_memory &mem)
996 {
997 if (mem.type == MEM_PIXELS) {
998 assert(!"mem_copy_to not supported for pixels.");
999 }
1000 else if (mem.type == MEM_GLOBAL) {
1001 global_free(mem);
1002 global_alloc(mem);
1003 }
1004 else if (mem.type == MEM_TEXTURE) {
1005 tex_free((device_texture &)mem);
1006 tex_alloc((device_texture &)mem);
1007 }
1008 else {
1009 if (!mem.device_pointer) {
1010 generic_alloc(mem);
1011 }
1012
1013 generic_copy_to(mem);
1014 }
1015 }
1016
1017 void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
1018 {
1019 if (mem.type == MEM_PIXELS && !background) {
1020 pixels_copy_from(mem, y, w, h);
1021 }
1022 else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
1023 assert(!"mem_copy_from not supported for textures.");
1024 }
1025 else if (mem.host_pointer) {
1026 const size_t size = elem * w * h;
1027 const size_t offset = elem * y * w;
1028
1029 if (mem.device_pointer) {
1030 const CUDAContextScope scope(this);
1031 cuda_assert(cuMemcpyDtoH(
1032 (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
1033 }
1034 else {
1035 memset((char *)mem.host_pointer + offset, 0, size);
1036 }
1037 }
1038 }
1039
1040 void CUDADevice::mem_zero(device_memory &mem)
1041 {
1042 if (!mem.device_pointer) {
1043 mem_alloc(mem);
1044 }
1045 if (!mem.device_pointer) {
1046 return;
1047 }
1048
1049 /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
1050 * regardless of mem.host_pointer and mem.shared_pointer. */
1051 if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
1052 const CUDAContextScope scope(this);
1053 cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
1054 }
1055 else if (mem.host_pointer) {
1056 memset(mem.host_pointer, 0, mem.memory_size());
1057 }
1058 }
1059
1060 void CUDADevice::mem_free(device_memory &mem)
1061 {
1062 if (mem.type == MEM_PIXELS && !background) {
1063 pixels_free(mem);
1064 }
1065 else if (mem.type == MEM_GLOBAL) {
1066 global_free(mem);
1067 }
1068 else if (mem.type == MEM_TEXTURE) {
1069 tex_free((device_texture &)mem);
1070 }
1071 else {
1072 generic_free(mem);
1073 }
1074 }
1075
1076 device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
1077 {
1078 return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
1079 }
1080
1081 void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
1082 {
1083 CUDAContextScope scope(this);
1084 CUdeviceptr mem;
1085 size_t bytes;
1086
1087 cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
1088 // assert(bytes == size);
1089 cuda_assert(cuMemcpyHtoD(mem, host, size));
1090 }
1091
1092 void CUDADevice::global_alloc(device_memory &mem)
1093 {
1094 if (mem.is_resident(this)) {
1095 generic_alloc(mem);
1096 generic_copy_to(mem);
1097 }
1098
1099 const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
1100 }
1101
1102 void CUDADevice::global_free(device_memory &mem)
1103 {
1104 if (mem.is_resident(this) && mem.device_pointer) {
1105 generic_free(mem);
1106 }
1107 }
1108
1109 void CUDADevice::tex_alloc(device_texture &mem)
1110 {
1111 CUDAContextScope scope(this);
1112
1113 /* General variables for both architectures */
1114 string bind_name = mem.name;
1115 size_t dsize = datatype_size(mem.data_type);
1116 size_t size = mem.memory_size();
1117
1118 CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
1119 switch (mem.info.extension) {
1120 case EXTENSION_REPEAT:
1121 address_mode = CU_TR_ADDRESS_MODE_WRAP;
1122 break;
1123 case EXTENSION_EXTEND:
1124 address_mode = CU_TR_ADDRESS_MODE_CLAMP;
1125 break;
1126 case EXTENSION_CLIP:
1127 address_mode = CU_TR_ADDRESS_MODE_BORDER;
1128 break;
1129 default:
1130 assert(0);
1131 break;
1132 }
1133
1134 CUfilter_mode filter_mode;
1135 if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
1136 filter_mode = CU_TR_FILTER_MODE_POINT;
1137 }
1138 else {
1139 filter_mode = CU_TR_FILTER_MODE_LINEAR;
1140 }
1141
1142 /* Image Texture Storage */
1143 CUarray_format_enum format;
1144 switch (mem.data_type) {
1145 case TYPE_UCHAR:
1146 format = CU_AD_FORMAT_UNSIGNED_INT8;
1147 break;
1148 case TYPE_UINT16:
1149 format = CU_AD_FORMAT_UNSIGNED_INT16;
1150 break;
1151 case TYPE_UINT:
1152 format = CU_AD_FORMAT_UNSIGNED_INT32;
1153 break;
1154 case TYPE_INT:
1155 format = CU_AD_FORMAT_SIGNED_INT32;
1156 break;
1157 case TYPE_FLOAT:
1158 format = CU_AD_FORMAT_FLOAT;
1159 break;
1160 case TYPE_HALF:
1161 format = CU_AD_FORMAT_HALF;
1162 break;
1163 default:
1164 assert(0);
1165 return;
1166 }
1167
1168 CUDAMem *cmem = NULL;
1169 CUarray array_3d = NULL;
1170 size_t src_pitch = mem.data_width * dsize * mem.data_elements;
1171 size_t dst_pitch = src_pitch;
1172
1173 if (!mem.is_resident(this)) {
1174 cmem = &cuda_mem_map[&mem];
1175 cmem->texobject = 0;
1176
1177 if (mem.data_depth > 1) {
1178 array_3d = (CUarray)mem.device_pointer;
1179 cmem->array = array_3d;
1180 }
1181 else if (mem.data_height > 0) {
1182 dst_pitch = align_up(src_pitch, pitch_alignment);
1183 }
1184 }
1185 else if (mem.data_depth > 1) {
1186 /* 3D texture using an array, since there is no API for 3D linear memory textures. */
1187 CUDA_ARRAY3D_DESCRIPTOR desc;
1188
1189 desc.Width = mem.data_width;
1190 desc.Height = mem.data_height;
1191 desc.Depth = mem.data_depth;
1192 desc.Format = format;
1193 desc.NumChannels = mem.data_elements;
1194 desc.Flags = 0;
1195
1196 VLOG(1) << "Array 3D allocate: " << mem.name << ", "
1197 << string_human_readable_number(mem.memory_size()) << " bytes. ("
1198 << string_human_readable_size(mem.memory_size()) << ")";
1199
1200 cuda_assert(cuArray3DCreate(&array_3d, &desc));
1201
1202 if (!array_3d) {
1203 return;
1204 }
1205
1206 CUDA_MEMCPY3D param;
1207 memset(&param, 0, sizeof(param));
1208 param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
1209 param.dstArray = array_3d;
1210 param.srcMemoryType = CU_MEMORYTYPE_HOST;
1211 param.srcHost = mem.host_pointer;
1212 param.srcPitch = src_pitch;
1213 param.WidthInBytes = param.srcPitch;
1214 param.Height = mem.data_height;
1215 param.Depth = mem.data_depth;
1216
1217 cuda_assert(cuMemcpy3D(&param));
1218
1219 mem.device_pointer = (device_ptr)array_3d;
1220 mem.device_size = size;
1221 stats.mem_alloc(size);
1222
1223 cmem = &cuda_mem_map[&mem];
1224 cmem->texobject = 0;
1225 cmem->array = array_3d;
1226 }
1227 else if (mem.data_height > 0) {
1228 /* 2D texture, using pitch aligned linear memory. */
1229 dst_pitch = align_up(src_pitch, pitch_alignment);
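/* For example, a 510 byte row with a 32 byte pitch alignment is padded to 512 bytes. */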
1230 size_t dst_size = dst_pitch * mem.data_height;
1231
1232 cmem = generic_alloc(mem, dst_size - mem.memory_size());
1233 if (!cmem) {
1234 return;
1235 }
1236
1237 CUDA_MEMCPY2D param;
1238 memset(&param, 0, sizeof(param));
1239 param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1240 param.dstDevice = mem.device_pointer;
1241 param.dstPitch = dst_pitch;
1242 param.srcMemoryType = CU_MEMORYTYPE_HOST;
1243 param.srcHost = mem.host_pointer;
1244 param.srcPitch = src_pitch;
1245 param.WidthInBytes = param.srcPitch;
1246 param.Height = mem.data_height;
1247
1248 cuda_assert(cuMemcpy2DUnaligned(&param));
1249 }
1250 else {
1251 /* 1D texture, using linear memory. */
1252 cmem = generic_alloc(mem);
1253 if (!cmem) {
1254 return;
1255 }
1256
1257 cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
1258 }
1259
1260 /* Resize once */
1261 const uint slot = mem.slot;
1262 if (slot >= texture_info.size()) {
1263 /* Allocate some slots in advance, to reduce the number
1264 * of re-allocations. */
1265 texture_info.resize(slot + 128);
1266 }
1267
1268 /* Set Mapping and tag that we need to (re-)upload to device */
1269 texture_info[slot] = mem.info;
1270 need_texture_info = true;
1271
1272 if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
1273 mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
1274 /* Kepler+, bindless textures. */
1275 CUDA_RESOURCE_DESC resDesc;
1276 memset(&resDesc, 0, sizeof(resDesc));
1277
1278 if (array_3d) {
1279 resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
1280 resDesc.res.array.hArray = array_3d;
1281 resDesc.flags = 0;
1282 }
1283 else if (mem.data_height > 0) {
1284 resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
1285 resDesc.res.pitch2D.devPtr = mem.device_pointer;
1286 resDesc.res.pitch2D.format = format;
1287 resDesc.res.pitch2D.numChannels = mem.data_elements;
1288 resDesc.res.pitch2D.height = mem.data_height;
1289 resDesc.res.pitch2D.width = mem.data_width;
1290 resDesc.res.pitch2D.pitchInBytes = dst_pitch;
1291 }
1292 else {
1293 resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
1294 resDesc.res.linear.devPtr = mem.device_pointer;
1295 resDesc.res.linear.format = format;
1296 resDesc.res.linear.numChannels = mem.data_elements;
1297 resDesc.res.linear.sizeInBytes = mem.device_size;
1298 }
1299
1300 CUDA_TEXTURE_DESC texDesc;
1301 memset(&texDesc, 0, sizeof(texDesc));
1302 texDesc.addressMode[0] = address_mode;
1303 texDesc.addressMode[1] = address_mode;
1304 texDesc.addressMode[2] = address_mode;
1305 texDesc.filterMode = filter_mode;
1306 texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
1307
1308 cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
1309
1310 texture_info[slot].data = (uint64_t)cmem->texobject;
1311 }
1312 else {
1313 texture_info[slot].data = (uint64_t)mem.device_pointer;
1314 }
1315 }
1316
1317 void CUDADevice::tex_free(device_texture &mem)
1318 {
1319 if (mem.device_pointer) {
1320 CUDAContextScope scope(this);
1321 const CUDAMem &cmem = cuda_mem_map[&mem];
1322
1323 if (cmem.texobject) {
1324 /* Free bindless texture. */
1325 cuTexObjectDestroy(cmem.texobject);
1326 }
1327
1328 if (!mem.is_resident(this)) {
1329 /* Do not free memory here, since it was allocated on a different device. */
1330 cuda_mem_map.erase(cuda_mem_map.find(&mem));
1331 }
1332 else if (cmem.array) {
1333 /* Free array. */
1334 cuArrayDestroy(cmem.array);
1335 stats.mem_free(mem.device_size);
1336 mem.device_pointer = 0;
1337 mem.device_size = 0;
1338
1339 cuda_mem_map.erase(cuda_mem_map.find(&mem));
1340 }
1341 else {
1342 generic_free(mem);
1343 }
1344 }
1345 }
1346
1347 # define CUDA_GET_BLOCKSIZE(func, w, h) \
1348 int threads_per_block; \
1349 cuda_assert( \
1350 cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
1351 int threads = (int)sqrt((float)threads_per_block); \
1352 int xblocks = ((w) + threads - 1) / threads; \
1353 int yblocks = ((h) + threads - 1) / threads;
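/* CUDA_GET_BLOCKSIZE picks a square block, e.g. 32x32 threads when the limit is 1024. */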
1354
1355 # define CUDA_LAUNCH_KERNEL(func, args) \
1356 cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
1357
1358 /* Similar to the above, but for 1-dimensional blocks. */
1359 # define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
1360 int threads_per_block; \
1361 cuda_assert( \
1362 cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
1363 int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
1364 int yblocks = h;
1365
1366 # define CUDA_LAUNCH_KERNEL_1D(func, args) \
1367 cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
1368
1369 bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,
1370 device_ptr guide_ptr,
1371 device_ptr variance_ptr,
1372 device_ptr out_ptr,
1373 DenoisingTask *task)
1374 {
1375 if (have_error())
1376 return false;
1377
1378 CUDAContextScope scope(this);
1379
1380 int stride = task->buffer.stride;
1381 int w = task->buffer.width;
1382 int h = task->buffer.h;
1383 int r = task->nlm_state.r;
1384 int f = task->nlm_state.f;
1385 float a = task->nlm_state.a;
1386 float k_2 = task->nlm_state.k_2;
1387
1388 int pass_stride = task->buffer.pass_stride;
1389 int num_shifts = (2 * r + 1) * (2 * r + 1);
1390 int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
1391 int frame_offset = 0;
1392
1393 if (have_error())
1394 return false;
1395
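/* Carve the preallocated temporary buffer into three consecutive regions: the difference
 * and blurred difference arrays (pass_stride * num_shifts floats each), followed by the
 * weight accumulation buffer. */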
1396 CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
1397 CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
1398 CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
1399 CUdeviceptr scale_ptr = 0;
1400
1401 cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
1402 cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
1403
1404 {
1405 CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
1406 cuda_assert(cuModuleGetFunction(
1407 &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
1408 cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
1409 cuda_assert(cuModuleGetFunction(
1410 &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
1411 cuda_assert(cuModuleGetFunction(
1412 &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
1413
1414 cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
1415 cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
1416 cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
1417 cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
1418
1419 CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
1420
1421 void *calc_difference_args[] = {&guide_ptr,
1422 &variance_ptr,
1423 &scale_ptr,
1424 &difference,
1425 &w,
1426 &h,
1427 &stride,
1428 &pass_stride,
1429 &r,
1430 &channel_offset,
1431 &frame_offset,
1432 &a,
1433 &k_2};
1434 void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
1435 void *calc_weight_args[] = {
1436 &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
1437 void *update_output_args[] = {&blurDifference,
1438 &image_ptr,
1439 &out_ptr,
1440 &weightAccum,
1441 &w,
1442 &h,
1443 &stride,
1444 &pass_stride,
1445 &channel_offset,
1446 &r,
1447 &f};
1448
1449 CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
1450 CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1451 CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
1452 CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1453 CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
1454 }
1455
1456 {
1457 CUfunction cuNLMNormalize;
1458 cuda_assert(
1459 cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
1460 cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
1461 void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
1462 CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
1463 CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
1464 cuda_assert(cuCtxSynchronize());
1465 }
1466
1467 return !have_error();
1468 }
1469
1470 bool CUDADevice::denoising_construct_transform(DenoisingTask *task)
1471 {
1472 if (have_error())
1473 return false;
1474
1475 CUDAContextScope scope(this);
1476
1477 CUfunction cuFilterConstructTransform;
1478 cuda_assert(cuModuleGetFunction(
1479 &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
1480 cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
1481 CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
1482
1483 void *args[] = {&task->buffer.mem.device_pointer,
1484 &task->tile_info_mem.device_pointer,
1485 &task->storage.transform.device_pointer,
1486 &task->storage.rank.device_pointer,
1487 &task->filter_area,
1488 &task->rect,
1489 &task->radius,
1490 &task->pca_threshold,
1491 &task->buffer.pass_stride,
1492 &task->buffer.frame_stride,
1493 &task->buffer.use_time};
1494 CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
1495 cuda_assert(cuCtxSynchronize());
1496
1497 return !have_error();
1498 }
1499
1500 bool CUDADevice::denoising_accumulate(device_ptr color_ptr,
1501 device_ptr color_variance_ptr,
1502 device_ptr scale_ptr,
1503 int frame,
1504 DenoisingTask *task)
1505 {
1506 if (have_error())
1507 return false;
1508
1509 CUDAContextScope scope(this);
1510
1511 int r = task->radius;
1512 int f = 4;
1513 float a = 1.0f;
1514 float k_2 = task->nlm_k_2;
1515
1516 int w = task->reconstruction_state.source_w;
1517 int h = task->reconstruction_state.source_h;
1518 int stride = task->buffer.stride;
1519 int frame_offset = frame * task->buffer.frame_stride;
1520 int t = task->tile_info->frames[frame];
1521
1522 int pass_stride = task->buffer.pass_stride;
1523 int num_shifts = (2 * r + 1) * (2 * r + 1);
1524
1525 if (have_error())
1526 return false;
1527
1528 CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
1529 CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
1530
1531 CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
1532 cuda_assert(cuModuleGetFunction(
1533 &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
1534 cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
1535 cuda_assert(
1536 cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
1537 cuda_assert(cuModuleGetFunction(
1538 &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
1539
1540 cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
1541 cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
1542 cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
1543 cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
1544
1545 CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
1546 task->reconstruction_state.source_w * task->reconstruction_state.source_h,
1547 num_shifts);
1548
1549 void *calc_difference_args[] = {&color_ptr,
1550 &color_variance_ptr,
1551 &scale_ptr,
1552 &difference,
1553 &w,
1554 &h,
1555 &stride,
1556 &pass_stride,
1557 &r,
1558 &pass_stride,
1559 &frame_offset,
1560 &a,
1561 &k_2};
1562 void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
1563 void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
1564 void *construct_gramian_args[] = {&t,
1565 &blurDifference,
1566 &task->buffer.mem.device_pointer,
1567 &task->storage.transform.device_pointer,
1568 &task->storage.rank.device_pointer,
1569 &task->storage.XtWX.device_pointer,
1570 &task->storage.XtWY.device_pointer,
1571 &task->reconstruction_state.filter_window,
1572 &w,
1573 &h,
1574 &stride,
1575 &pass_stride,
1576 &r,
1577 &f,
1578 &frame_offset,
1579 &task->buffer.use_time};
1580
1581 CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
1582 CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1583 CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
1584 CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1585 CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
1586 cuda_assert(cuCtxSynchronize());
1587
1588 return !have_error();
1589 }
1590
1591 bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
1592 {
1593 CUfunction cuFinalize;
1594 cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
1595 cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
1596 void *finalize_args[] = {&output_ptr,
1597 &task->storage.rank.device_pointer,
1598 &task->storage.XtWX.device_pointer,
1599 &task->storage.XtWY.device_pointer,
1600 &task->filter_area,
1601 &task->reconstruction_state.buffer_params.x,
1602 &task->render_buffer.samples};
1603 CUDA_GET_BLOCKSIZE(
1604 cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
1605 CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
1606 cuda_assert(cuCtxSynchronize());
1607
1608 return !have_error();
1609 }
1610
1611 bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,
1612 device_ptr b_ptr,
1613 device_ptr mean_ptr,
1614 device_ptr variance_ptr,
1615 int r,
1616 int4 rect,
1617 DenoisingTask *task)
1618 {
1619 if (have_error())
1620 return false;
1621
1622 CUDAContextScope scope(this);
1623
1624 CUfunction cuFilterCombineHalves;
1625 cuda_assert(cuModuleGetFunction(
1626 &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
1627 cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
1628 CUDA_GET_BLOCKSIZE(
1629 cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1630
1631 void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
1632 CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
1633 cuda_assert(cuCtxSynchronize());
1634
1635 return !have_error();
1636 }
1637
1638 bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,
1639 device_ptr b_ptr,
1640 device_ptr sample_variance_ptr,
1641 device_ptr sv_variance_ptr,
1642 device_ptr buffer_variance_ptr,
1643 DenoisingTask *task)
1644 {
1645 if (have_error())
1646 return false;
1647
1648 CUDAContextScope scope(this);
1649
1650 CUfunction cuFilterDivideShadow;
1651 cuda_assert(cuModuleGetFunction(
1652 &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
1653 cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
1654 CUDA_GET_BLOCKSIZE(
1655 cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1656
1657 void *args[] = {&task->render_buffer.samples,
1658 &task->tile_info_mem.device_pointer,
1659 &a_ptr,
1660 &b_ptr,
1661 &sample_variance_ptr,
1662 &sv_variance_ptr,
1663 &buffer_variance_ptr,
1664 &task->rect,
1665 &task->render_buffer.pass_stride,
1666 &task->render_buffer.offset};
1667 CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
1668 cuda_assert(cuCtxSynchronize());
1669
1670 return !have_error();
1671 }
1672
1673 bool CUDADevice::denoising_get_feature(int mean_offset,
1674 int variance_offset,
1675 device_ptr mean_ptr,
1676 device_ptr variance_ptr,
1677 float scale,
1678 DenoisingTask *task)
1679 {
1680 if (have_error())
1681 return false;
1682
1683 CUDAContextScope scope(this);
1684
1685 CUfunction cuFilterGetFeature;
1686 cuda_assert(
1687 cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
1688 cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
1689 CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1690
1691 void *args[] = {&task->render_buffer.samples,
1692 &task->tile_info_mem.device_pointer,
1693 &mean_offset,
1694 &variance_offset,
1695 &mean_ptr,
1696 &variance_ptr,
1697 &scale,
1698 &task->rect,
1699 &task->render_buffer.pass_stride,
1700 &task->render_buffer.offset};
1701 CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
1702 cuda_assert(cuCtxSynchronize());
1703
1704 return !have_error();
1705 }
1706
1707 bool CUDADevice::denoising_write_feature(int out_offset,
1708 device_ptr from_ptr,
1709 device_ptr buffer_ptr,
1710 DenoisingTask *task)
1711 {
1712 if (have_error())
1713 return false;
1714
1715 CUDAContextScope scope(this);
1716
1717 CUfunction cuFilterWriteFeature;
1718 cuda_assert(cuModuleGetFunction(
1719 &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
1720 cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
1721 CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
1722
1723 void *args[] = {&task->render_buffer.samples,
1724 &task->reconstruction_state.buffer_params,
1725 &task->filter_area,
1726 &from_ptr,
1727 &buffer_ptr,
1728 &out_offset,
1729 &task->rect};
1730 CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
1731 cuda_assert(cuCtxSynchronize());
1732
1733 return !have_error();
1734 }
1735
1736 bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,
1737 device_ptr variance_ptr,
1738 device_ptr depth_ptr,
1739 device_ptr output_ptr,
1740 DenoisingTask *task)
1741 {
1742 if (have_error())
1743 return false;
1744
1745 CUDAContextScope scope(this);
1746
1747 CUfunction cuFilterDetectOutliers;
1748 cuda_assert(cuModuleGetFunction(
1749 &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
1750 cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
1751 CUDA_GET_BLOCKSIZE(
1752 cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1753
1754 void *args[] = {
1755 &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride};
1756
1757 CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
1758 cuda_assert(cuCtxSynchronize());
1759
1760 return !have_error();
1761 }
1762
1763 void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
1764 {
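  /* Wire the generic denoiser callbacks to this device's CUDA filter kernels;
   * the _1.._6 placeholders forward the per-call arguments supplied by the
   * DenoisingTask when it invokes each function. */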
1765 denoising.functions.construct_transform = function_bind(
1766 &CUDADevice::denoising_construct_transform, this, &denoising);
1767 denoising.functions.accumulate = function_bind(
1768 &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
1769 denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
1770 denoising.functions.divide_shadow = function_bind(
1771 &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1772 denoising.functions.non_local_means = function_bind(
1773 &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1774 denoising.functions.combine_halves = function_bind(
1775 &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1776 denoising.functions.get_feature = function_bind(
1777 &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
1778 denoising.functions.write_feature = function_bind(
1779 &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
1780 denoising.functions.detect_outliers = function_bind(
1781 &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1782
1783 denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1784 denoising.render_buffer.samples = rtile.sample;
1785 denoising.buffer.gpu_temporary_mem = true;
1786
1787 denoising.run_denoising(rtile);
1788 }
1789
1790 void CUDADevice::adaptive_sampling_filter(uint filter_sample,
1791 WorkTile *wtile,
1792 CUdeviceptr d_wtile,
1793 CUstream stream)
1794 {
1795 const int num_threads_per_block = functions.adaptive_num_threads_per_block;
1796
1797 /* These are a series of tiny kernels because there is no grid synchronization
1798    * from within a kernel, so multiple kernel launches are required. */
1799 uint total_work_size = wtile->h * wtile->w;
1800 void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
1801 uint num_blocks = divide_up(total_work_size, num_threads_per_block);
1802 cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
1803 num_blocks,
1804 1,
1805 1,
1806 num_threads_per_block,
1807 1,
1808 1,
1809 0,
1810 stream,
1811 args2,
1812 0));
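  /* The row and column filter passes reuse the same argument block; total_work_size
   * is updated in place, giving one thread per row for adaptive_filter_x and one
   * thread per column for adaptive_filter_y. */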
1813 total_work_size = wtile->h;
1814 num_blocks = divide_up(total_work_size, num_threads_per_block);
1815 cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
1816 num_blocks,
1817 1,
1818 1,
1819 num_threads_per_block,
1820 1,
1821 1,
1822 0,
1823 stream,
1824 args2,
1825 0));
1826 total_work_size = wtile->w;
1827 num_blocks = divide_up(total_work_size, num_threads_per_block);
1828 cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
1829 num_blocks,
1830 1,
1831 1,
1832 num_threads_per_block,
1833 1,
1834 1,
1835 0,
1836 stream,
1837 args2,
1838 0));
1839 }
1840
1841 void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
1842 WorkTile *wtile,
1843 CUdeviceptr d_wtile,
1844 CUstream stream)
1845 {
1846 const int num_threads_per_block = functions.adaptive_num_threads_per_block;
1847 uint total_work_size = wtile->h * wtile->w;
1848
1849 void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
1850 uint num_blocks = divide_up(total_work_size, num_threads_per_block);
1851 cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
1852 num_blocks,
1853 1,
1854 1,
1855 num_threads_per_block,
1856 1,
1857 1,
1858 0,
1859 stream,
1860 args,
1861 0));
1862 }
1863
1864 void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
1865 {
1866 scoped_timer timer(&rtile.buffers->render_time);
1867
1868 if (have_error())
1869 return;
1870
1871 CUDAContextScope scope(this);
1872 CUfunction cuRender;
1873
1874 /* Get kernel function. */
1875 if (rtile.task == RenderTile::BAKE) {
1876 cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
1877 }
1878 else if (task.integrator_branched) {
1879 cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
1880 }
1881 else {
1882 cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
1883 }
1884
1885 if (have_error()) {
1886 return;
1887 }
1888
1889 cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
1890
1891 /* Allocate work tile. */
1892 work_tiles.alloc(1);
1893
1894 WorkTile *wtile = work_tiles.data();
1895 wtile->x = rtile.x;
1896 wtile->y = rtile.y;
1897 wtile->w = rtile.w;
1898 wtile->h = rtile.h;
1899 wtile->offset = rtile.offset;
1900 wtile->stride = rtile.stride;
1901 wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
1902
1903 /* Prepare work size. More step samples render faster, but for now we
1904 * remain conservative for GPUs connected to a display to avoid driver
1905 * timeouts and display freezing. */
1906 int min_blocks, num_threads_per_block;
1907 cuda_assert(
1908 cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
1909 if (!info.display_device) {
1910 min_blocks *= 8;
1911 }
1912
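  /* Pick enough samples per launch to fill the device: the total work size
   * (tile pixels * step_samples) is at least min_blocks * num_threads_per_block. */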
1913 uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
1914 if (task.adaptive_sampling.use) {
1915 step_samples = task.adaptive_sampling.align_static_samples(step_samples);
1916 }
1917
1918 /* Render all samples. */
1919 int start_sample = rtile.start_sample;
1920 int end_sample = rtile.start_sample + rtile.num_samples;
1921
1922 for (int sample = start_sample; sample < end_sample; sample += step_samples) {
1923 /* Setup and copy work tile to device. */
1924 wtile->start_sample = sample;
1925 wtile->num_samples = min(step_samples, end_sample - sample);
1926 work_tiles.copy_to_device();
1927
1928 CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
1929 uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
1930 uint num_blocks = divide_up(total_work_size, num_threads_per_block);
1931
1932 /* Launch kernel. */
1933 void *args[] = {&d_work_tiles, &total_work_size};
1934
1935 cuda_assert(
1936 cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
1937
1938 /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
1939 uint filter_sample = sample + wtile->num_samples - 1;
1940 if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
1941 adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
1942 }
1943
1944 cuda_assert(cuCtxSynchronize());
1945
1946 /* Update progress. */
1947 rtile.sample = sample + wtile->num_samples;
1948 task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
1949
1950 if (task.get_cancel()) {
1951 if (task.need_finish_queue == false)
1952 break;
1953 }
1954 }
1955
1956 /* Finalize adaptive sampling. */
1957 if (task.adaptive_sampling.use) {
1958 CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
1959 adaptive_sampling_post(rtile, wtile, d_work_tiles);
1960 cuda_assert(cuCtxSynchronize());
1961 task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
1962 }
1963 }
1964
1965 void CUDADevice::film_convert(DeviceTask &task,
1966 device_ptr buffer,
1967 device_ptr rgba_byte,
1968 device_ptr rgba_half)
1969 {
1970 if (have_error())
1971 return;
1972
1973 CUDAContextScope scope(this);
1974
1975 CUfunction cuFilmConvert;
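  /* In interactive (non-background) mode map_pixels() returns the CUDA-mapped
   * OpenGL PBO pointer; in background mode it simply returns the device pointer. */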
1976 CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
1977 CUdeviceptr d_buffer = (CUdeviceptr)buffer;
1978
1979 /* get kernel function */
1980 if (rgba_half) {
1981 cuda_assert(
1982 cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
1983 }
1984 else {
1985 cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
1986 }
1987
1988 float sample_scale = 1.0f / (task.sample + 1);
1989
1990 /* pass in parameters */
1991 void *args[] = {&d_rgba,
1992 &d_buffer,
1993 &sample_scale,
1994 &task.x,
1995 &task.y,
1996 &task.w,
1997 &task.h,
1998 &task.offset,
1999 &task.stride};
2000
2001 /* launch kernel */
2002 int threads_per_block;
2003 cuda_assert(cuFuncGetAttribute(
2004 &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
2005
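  /* Use a roughly square 2D block that stays within the kernel's maximum thread count. */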
2006 int xthreads = (int)sqrt(threads_per_block);
2007 int ythreads = (int)sqrt(threads_per_block);
2008 int xblocks = (task.w + xthreads - 1) / xthreads;
2009 int yblocks = (task.h + ythreads - 1) / ythreads;
2010
2011 cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
2012
2013 cuda_assert(cuLaunchKernel(cuFilmConvert,
2014 xblocks,
2015 yblocks,
2016 1, /* blocks */
2017 xthreads,
2018 ythreads,
2019 1, /* threads */
2020 0,
2021 0,
2022 args,
2023 0));
2024
2025 unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
2026
2027 cuda_assert(cuCtxSynchronize());
2028 }
2029
2030 void CUDADevice::shader(DeviceTask &task)
2031 {
2032 if (have_error())
2033 return;
2034
2035 CUDAContextScope scope(this);
2036
2037 CUfunction cuShader;
2038 CUdeviceptr d_input = (CUdeviceptr)task.shader_input;
2039 CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
2040
2041 /* get kernel function */
2042 if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
2043 cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
2044 }
2045 else {
2046 cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
2047 }
2048
2049   /* do tasks in smaller chunks, so we can cancel them */
2050 const int shader_chunk_size = 65536;
2051 const int start = task.shader_x;
2052 const int end = task.shader_x + task.shader_w;
2053 int offset = task.offset;
2054
2055 bool canceled = false;
2056 for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
2057 for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
2058 int shader_w = min(shader_chunk_size, end - shader_x);
2059
2060 /* pass in parameters */
2061 void *args[8];
2062 int arg = 0;
2063 args[arg++] = &d_input;
2064 args[arg++] = &d_output;
2065 args[arg++] = &task.shader_eval_type;
2066 if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
2067 args[arg++] = &task.shader_filter;
2068 }
2069 args[arg++] = &shader_x;
2070 args[arg++] = &shader_w;
2071 args[arg++] = &offset;
2072 args[arg++] = &sample;
2073
2074 /* launch kernel */
2075 int threads_per_block;
2076 cuda_assert(cuFuncGetAttribute(
2077 &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
2078
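      /* One thread per shader evaluation point in this chunk. */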
2079 int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
2080
2081 cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
2082 cuda_assert(cuLaunchKernel(cuShader,
2083 xblocks,
2084 1,
2085 1, /* blocks */
2086 threads_per_block,
2087 1,
2088 1, /* threads */
2089 0,
2090 0,
2091 args,
2092 0));
2093
2094 cuda_assert(cuCtxSynchronize());
2095
2096 if (task.get_cancel()) {
2097 canceled = true;
2098 break;
2099 }
2100 }
2101
2102 task.update_progress(NULL);
2103 }
2104 }
2105
2106 CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
2107 {
2108 if (!background) {
2109 PixelMem pmem = pixel_mem_map[mem];
2110 CUdeviceptr buffer;
2111
2112 size_t bytes;
2113 cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
2114 cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
2115
2116 return buffer;
2117 }
2118
2119 return (CUdeviceptr)mem;
2120 }
2121
2122 void CUDADevice::unmap_pixels(device_ptr mem)
2123 {
2124 if (!background) {
2125 PixelMem pmem = pixel_mem_map[mem];
2126
2127 cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
2128 }
2129 }
2130
2131 void CUDADevice::pixels_alloc(device_memory &mem)
2132 {
2133 PixelMem pmem;
2134
2135 pmem.w = mem.data_width;
2136 pmem.h = mem.data_height;
2137
2138 CUDAContextScope scope(this);
2139
2140 glGenBuffers(1, &pmem.cuPBO);
2141 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
2142 if (mem.data_type == TYPE_HALF)
2143 glBufferData(
2144 GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
2145 else
2146 glBufferData(
2147 GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
2148
2149 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
2150
2151 glActiveTexture(GL_TEXTURE0);
2152 glGenTextures(1, &pmem.cuTexId);
2153 glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
2154 if (mem.data_type == TYPE_HALF)
2155 glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
2156 else
2157 glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
2158 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
2159 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
2160 glBindTexture(GL_TEXTURE_2D, 0);
2161
2162 CUresult result = cuGraphicsGLRegisterBuffer(
2163 &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
2164
2165 if (result == CUDA_SUCCESS) {
2166 mem.device_pointer = pmem.cuTexId;
2167 pixel_mem_map[mem.device_pointer] = pmem;
2168
2169 mem.device_size = mem.memory_size();
2170 stats.mem_alloc(mem.device_size);
2171
2172 return;
2173 }
2174 else {
2175       /* failed to register buffer, fall back to no interop */
2176 glDeleteBuffers(1, &pmem.cuPBO);
2177 glDeleteTextures(1, &pmem.cuTexId);
2178
2179 background = true;
2180 }
2181 }
2182
2183 void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
2184 {
2185 PixelMem pmem = pixel_mem_map[mem.device_pointer];
2186
2187 CUDAContextScope scope(this);
2188
2189 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
2190 uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
2191 size_t offset = sizeof(uchar) * 4 * y * w;
2192 memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
2193 glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
2194 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
2195 }
2196
2197 void CUDADevice::pixels_free(device_memory &mem)
2198 {
2199 if (mem.device_pointer) {
2200 PixelMem pmem = pixel_mem_map[mem.device_pointer];
2201
2202 CUDAContextScope scope(this);
2203
2204 cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
2205 glDeleteBuffers(1, &pmem.cuPBO);
2206 glDeleteTextures(1, &pmem.cuTexId);
2207
2208 pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
2209 mem.device_pointer = 0;
2210
2211 stats.mem_free(mem.device_size);
2212 mem.device_size = 0;
2213 }
2214 }
2215
2216 void CUDADevice::draw_pixels(device_memory &mem,
2217 int y,
2218 int w,
2219 int h,
2220 int width,
2221 int height,
2222 int dx,
2223 int dy,
2224 int dw,
2225 int dh,
2226 bool transparent,
2227 const DeviceDrawParams &draw_params)
2228 {
2229 assert(mem.type == MEM_PIXELS);
2230
2231 if (!background) {
2232 const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
2233 PixelMem pmem = pixel_mem_map[mem.device_pointer];
2234 float *vpointer;
2235
2236 CUDAContextScope scope(this);
2237
2238     /* for multi devices, this assumes the inefficient approach of allocating
2239 * all pixels on the device even though we only render to a subset */
2240 size_t offset = 4 * y * w;
2241
2242 if (mem.data_type == TYPE_HALF)
2243 offset *= sizeof(GLhalf);
2244 else
2245 offset *= sizeof(uint8_t);
2246
2247 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
2248 glActiveTexture(GL_TEXTURE0);
2249 glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
2250 if (mem.data_type == TYPE_HALF) {
2251 glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
2252 }
2253 else {
2254 glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
2255 }
2256 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
2257
2258 if (transparent) {
2259 glEnable(GL_BLEND);
2260 glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
2261 }
2262
2263 GLint shader_program;
2264 if (use_fallback_shader) {
2265 if (!bind_fallback_display_space_shader(dw, dh)) {
2266 return;
2267 }
2268 shader_program = fallback_shader_program;
2269 }
2270 else {
2271 draw_params.bind_display_space_shader_cb();
2272 glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
2273 }
2274
2275 if (!vertex_buffer) {
2276 glGenBuffers(1, &vertex_buffer);
2277 }
2278
2279 glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
2280 /* invalidate old contents -
2281 * avoids stalling if buffer is still waiting in queue to be rendered */
2282 glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
2283
2284 vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
2285
2286 if (vpointer) {
2287 /* texture coordinate - vertex pair */
2288 vpointer[0] = 0.0f;
2289 vpointer[1] = 0.0f;
2290 vpointer[2] = dx;
2291 vpointer[3] = dy;
2292
2293 vpointer[4] = (float)w / (float)pmem.w;
2294 vpointer[5] = 0.0f;
2295 vpointer[6] = (float)width + dx;
2296 vpointer[7] = dy;
2297
2298 vpointer[8] = (float)w / (float)pmem.w;
2299 vpointer[9] = (float)h / (float)pmem.h;
2300 vpointer[10] = (float)width + dx;
2301 vpointer[11] = (float)height + dy;
2302
2303 vpointer[12] = 0.0f;
2304 vpointer[13] = (float)h / (float)pmem.h;
2305 vpointer[14] = dx;
2306 vpointer[15] = (float)height + dy;
2307
2308 glUnmapBuffer(GL_ARRAY_BUFFER);
2309 }
2310
2311 GLuint vertex_array_object;
2312 GLuint position_attribute, texcoord_attribute;
2313
2314 glGenVertexArrays(1, &vertex_array_object);
2315 glBindVertexArray(vertex_array_object);
2316
2317 texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
2318 position_attribute = glGetAttribLocation(shader_program, "pos");
2319
2320 glEnableVertexAttribArray(texcoord_attribute);
2321 glEnableVertexAttribArray(position_attribute);
2322
2323 glVertexAttribPointer(
2324 texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
2325 glVertexAttribPointer(position_attribute,
2326 2,
2327 GL_FLOAT,
2328 GL_FALSE,
2329 4 * sizeof(float),
2330 (const GLvoid *)(sizeof(float) * 2));
2331
2332 glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
2333
2334 if (use_fallback_shader) {
2335 glUseProgram(0);
2336 }
2337 else {
2338 draw_params.unbind_display_space_shader_cb();
2339 }
2340
2341 if (transparent) {
2342 glDisable(GL_BLEND);
2343 }
2344
2345 glBindTexture(GL_TEXTURE_2D, 0);
2346
2347 return;
2348 }
2349
2350 Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
2351 }
2352
2353 void CUDADevice::thread_run(DeviceTask &task)
2354 {
2355 CUDAContextScope scope(this);
2356
2357 if (task.type == DeviceTask::RENDER) {
2358 DeviceRequestedFeatures requested_features;
2359 if (use_split_kernel()) {
2360 if (split_kernel == NULL) {
2361 split_kernel = new CUDASplitKernel(this);
2362 split_kernel->load_kernels(requested_features);
2363 }
2364 }
2365
2366 device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
2367
2368 /* keep rendering tiles until done */
2369 RenderTile tile;
2370 DenoisingTask denoising(this, task);
2371
2372 while (task.acquire_tile(this, tile, task.tile_types)) {
2373 if (tile.task == RenderTile::PATH_TRACE) {
2374 if (use_split_kernel()) {
2375 device_only_memory<uchar> void_buffer(this, "void_buffer");
2376 split_kernel->path_trace(task, tile, void_buffer, void_buffer);
2377 }
2378 else {
2379 render(task, tile, work_tiles);
2380 }
2381 }
2382 else if (tile.task == RenderTile::BAKE) {
2383 render(task, tile, work_tiles);
2384 }
2385 else if (tile.task == RenderTile::DENOISE) {
2386 tile.sample = tile.start_sample + tile.num_samples;
2387
2388 denoise(tile, denoising);
2389
2390 task.update_progress(&tile, tile.w * tile.h);
2391 }
2392
2393 task.release_tile(tile);
2394
2395 if (task.get_cancel()) {
2396 if (task.need_finish_queue == false)
2397 break;
2398 }
2399 }
2400
2401 work_tiles.free();
2402 }
2403 else if (task.type == DeviceTask::SHADER) {
2404 shader(task);
2405
2406 cuda_assert(cuCtxSynchronize());
2407 }
2408 else if (task.type == DeviceTask::DENOISE_BUFFER) {
2409 RenderTile tile;
2410 tile.x = task.x;
2411 tile.y = task.y;
2412 tile.w = task.w;
2413 tile.h = task.h;
2414 tile.buffer = task.buffer;
2415 tile.sample = task.sample + task.num_samples;
2416 tile.num_samples = task.num_samples;
2417 tile.start_sample = task.sample;
2418 tile.offset = task.offset;
2419 tile.stride = task.stride;
2420 tile.buffers = task.buffers;
2421
2422 DenoisingTask denoising(this, task);
2423 denoise(tile, denoising);
2424 task.update_progress(&tile, tile.w * tile.h);
2425 }
2426 }
2427
2428 void CUDADevice::task_add(DeviceTask &task)
2429 {
2430 CUDAContextScope scope(this);
2431
2432 /* Load texture info. */
2433 load_texture_info();
2434
2435 /* Synchronize all memory copies before executing task. */
2436 cuda_assert(cuCtxSynchronize());
2437
2438 if (task.type == DeviceTask::FILM_CONVERT) {
2439 /* must be done in main thread due to opengl access */
2440 film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
2441 }
2442 else {
2443 task_pool.push([=] {
2444 DeviceTask task_copy = task;
2445 thread_run(task_copy);
2446 });
2447 }
2448 }
2449
2450 void CUDADevice::task_wait()
2451 {
2452 task_pool.wait();
2453 }
2454
2455 void CUDADevice::task_cancel()
2456 {
2457 task_pool.cancel();
2458 }
2459
2460 /* Redefine the cuda_assert macro so it can be used outside of the CUDADevice class,
2461  * now that the definition of that class is complete.
2462 */
2463 # undef cuda_assert
2464 # define cuda_assert(stmt) \
2465 { \
2466 CUresult result = stmt; \
2467 if (result != CUDA_SUCCESS) { \
2468 const char *name = cuewErrorString(result); \
2469 device->set_error( \
2470 string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
2471 } \
2472 } \
2473 (void)0
2474
2475 /* CUDA context scope. */
2476
2477 CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
2478 {
2479 cuda_assert(cuCtxPushCurrent(device->cuContext));
2480 }
2481
2482 CUDAContextScope::~CUDAContextScope()
2483 {
2484 cuda_assert(cuCtxPopCurrent(NULL));
2485 }
2486
2487 /* split kernel */
2488
2489 class CUDASplitKernelFunction : public SplitKernelFunction {
2490 CUDADevice *device;
2491 CUfunction func;
2492
2493 public:
2494   CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
2495 {
2496 }
2497
2498 /* enqueue the kernel, returns false if there is an error */
2499   bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
2500 {
2501 return enqueue(dim, NULL);
2502 }
2503
2504 /* enqueue the kernel, returns false if there is an error */
2505   bool enqueue(const KernelDimensions &dim, void *args[])
2506 {
2507 if (device->have_error())
2508 return false;
2509
2510 CUDAContextScope scope(device);
2511
2512 /* we ignore dim.local_size for now, as this is faster */
2513 int threads_per_block;
2514 cuda_assert(
2515 cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
2516
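    /* Flatten the 2D global size into a 1D launch of maximum-sized blocks. */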
2517 int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
2518 threads_per_block;
2519
2520 cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
2521
2522 cuda_assert(cuLaunchKernel(func,
2523 xblocks,
2524 1,
2525 1, /* blocks */
2526 threads_per_block,
2527 1,
2528 1, /* threads */
2529 0,
2530 0,
2531 args,
2532 0));
2533
2534 return !device->have_error();
2535 }
2536 };
2537
2538 CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
2539 {
2540 }
2541
2542 uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
2543 device_memory & /*data*/,
2544 size_t num_threads)
2545 {
2546 CUDAContextScope scope(device);
2547
2548 device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
2549 size_buffer.alloc(1);
2550 size_buffer.zero_to_device();
2551
2552 uint threads = num_threads;
2553 CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer;
2554
2555 struct args_t {
2556 uint *num_threads;
2557 CUdeviceptr *size;
2558 };
2559
2560 args_t args = {&threads, &d_size};
2561
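  /* A single-thread helper kernel computes the split state size for the requested
   * thread count and writes the result into size_buffer. */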
2562 CUfunction state_buffer_size;
2563 cuda_assert(
2564 cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
2565
2566 cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
2567
2568 size_buffer.copy_from_device(0, 1, 1);
2569 size_t size = size_buffer[0];
2570 size_buffer.free();
2571
2572 return size;
2573 }
2574
2575 bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
2576 RenderTile &rtile,
2577 int num_global_elements,
2578 device_memory & /*kernel_globals*/,
2579 device_memory & /*kernel_data*/,
2580 device_memory &split_data,
2581 device_memory &ray_state,
2582 device_memory &queue_index,
2583 device_memory &use_queues_flag,
2584 device_memory &work_pool_wgs)
2585 {
2586 CUDAContextScope scope(device);
2587
2588 CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer;
2589 CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer;
2590 CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer;
2591 CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer;
2592 CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer;
2593
2594 CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer;
2595
2596 int end_sample = rtile.start_sample + rtile.num_samples;
2597 int queue_size = dim.global_size[0] * dim.global_size[1];
2598
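  /* This struct of pointers mirrors the void *[] argument array that cuLaunchKernel
   * expects; the member order must match the parameter order of
   * kernel_cuda_path_trace_data_init. */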
2599 struct args_t {
2600 CUdeviceptr *split_data_buffer;
2601 int *num_elements;
2602 CUdeviceptr *ray_state;
2603 int *start_sample;
2604 int *end_sample;
2605 int *sx;
2606 int *sy;
2607 int *sw;
2608 int *sh;
2609 int *offset;
2610 int *stride;
2611 CUdeviceptr *queue_index;
2612 int *queuesize;
2613 CUdeviceptr *use_queues_flag;
2614 CUdeviceptr *work_pool_wgs;
2615 int *num_samples;
2616 CUdeviceptr *buffer;
2617 };
2618
2619 args_t args = {&d_split_data,
2620 &num_global_elements,
2621 &d_ray_state,
2622 &rtile.start_sample,
2623 &end_sample,
2624 &rtile.x,
2625 &rtile.y,
2626 &rtile.w,
2627 &rtile.h,
2628 &rtile.offset,
2629 &rtile.stride,
2630 &d_queue_index,
2631 &queue_size,
2632 &d_use_queues_flag,
2633 &d_work_pool_wgs,
2634 &rtile.num_samples,
2635 &d_buffer};
2636
2637 CUfunction data_init;
2638 cuda_assert(
2639 cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
2640 if (device->have_error()) {
2641 return false;
2642 }
2643
2644 CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
2645
2646 return !device->have_error();
2647 }
2648
2649 SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
2650 const DeviceRequestedFeatures &)
2651 {
2652 const CUDAContextScope scope(device);
2653
2654 CUfunction func;
2655 const CUresult result = cuModuleGetFunction(
2656 &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data());
2657 if (result != CUDA_SUCCESS) {
2658 device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)",
2659 kernel_name.data(),
2660 cuewErrorString(result)));
2661 return NULL;
2662 }
2663
2664 return new CUDASplitKernelFunction(device, func);
2665 }
2666
2667 int2 CUDASplitKernel::split_kernel_local_size()
2668 {
2669 return make_int2(32, 1);
2670 }
2671
2672 int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
2673 device_memory &data,
2674 DeviceTask & /*task*/)
2675 {
2676 CUDAContextScope scope(device);
2677 size_t free;
2678 size_t total;
2679
2680 cuda_assert(cuMemGetInfo(&free, &total));
2681
2682 VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
2683 << " bytes. (" << string_human_readable_size(free) << ").";
2684
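  /* Budget at most half of the free device memory for the split state buffer, then
   * choose a roughly square global size rounded down to multiples of 32 and 16. */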
2685 size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
2686 size_t side = round_down((int)sqrt(num_elements), 32);
2687 int2 global_size = make_int2(side, round_down(num_elements / side, 16));
2688 VLOG(1) << "Global size: " << global_size << ".";
2689 return global_size;
2690 }
2691
2692 CCL_NAMESPACE_END
2693
2694 #endif
2695