1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef WITH_CUDA
18 
19 #  include <climits>
20 #  include <limits.h>
21 #  include <stdio.h>
22 #  include <stdlib.h>
23 #  include <string.h>
24 
25 #  include "device/cuda/device_cuda.h"
26 #  include "device/device_intern.h"
27 #  include "device/device_split_kernel.h"
28 
29 #  include "render/buffers.h"
30 
31 #  include "kernel/filter/filter_defines.h"
32 
33 #  include "util/util_debug.h"
34 #  include "util/util_foreach.h"
35 #  include "util/util_logging.h"
36 #  include "util/util_map.h"
37 #  include "util/util_md5.h"
38 #  include "util/util_opengl.h"
39 #  include "util/util_path.h"
40 #  include "util/util_string.h"
41 #  include "util/util_system.h"
42 #  include "util/util_time.h"
43 #  include "util/util_types.h"
44 #  include "util/util_windows.h"
45 
46 #  include "kernel/split/kernel_split_data_types.h"
47 
48 CCL_NAMESPACE_BEGIN
49 
50 #  ifndef WITH_CUDA_DYNLOAD
51 
52 /* Transparently implement some functions, so the majority of the file does not need
53  * to worry about the difference between dynamically loaded and linked CUDA at all.
54  */
55 
56 namespace {
57 
58 const char *cuewErrorString(CUresult result)
59 {
60   /* We can only give the error code here without major code duplication; that
61    * should be enough, since dynamic loading is only disabled by folks
62    * who know what they're doing anyway.
63    *
64    * NOTE: Avoid calling from several threads.
65    */
66   static string error;
67   error = string_printf("%d", result);
68   return error.c_str();
69 }
70 
71 const char *cuewCompilerPath()
72 {
73   return CYCLES_CUDA_NVCC_EXECUTABLE;
74 }
75 
76 int cuewCompilerVersion()
77 {
78   return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
79 }
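/* For example, with a CUDA 10.2 toolkit cuda.h defines CUDA_VERSION as 10020,
 * so the expression above evaluates to 10020 / 100 + (10020 % 100) / 10 =
 * 100 + 2 = 102, i.e. the same "major * 10 + minor" encoding that the version
 * checks in compile_kernel() expect. */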
80 
81 } /* namespace */
82 #  endif /* WITH_CUDA_DYNLOAD */
83 
84 class CUDADevice;
85 
86 class CUDASplitKernel : public DeviceSplitKernel {
87   CUDADevice *device;
88 
89  public:
90   explicit CUDASplitKernel(CUDADevice *device);
91 
92   virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
93 
94   virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
95                                               RenderTile &rtile,
96                                               int num_global_elements,
97                                               device_memory &kernel_globals,
98                                               device_memory &kernel_data_,
99                                               device_memory &split_data,
100                                               device_memory &ray_state,
101                                               device_memory &queue_index,
102                                               device_memory &use_queues_flag,
103                                               device_memory &work_pool_wgs);
104 
105   virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
106                                                          const DeviceRequestedFeatures &);
107   virtual int2 split_kernel_local_size();
108   virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
109 };
110 
111 /* Utility to push/pop CUDA context. */
112 class CUDAContextScope {
113  public:
114   CUDAContextScope(CUDADevice *device);
115   ~CUDAContextScope();
116 
117  private:
118   CUDADevice *device;
119 };
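/* Typical usage throughout this file: push the device's CUDA context for the
 * duration of a scope before touching the driver API, and let the destructor
 * pop it again. A minimal sketch (some_member() is a hypothetical member
 * function, not part of this file):
 *
 *   void CUDADevice::some_member()
 *   {
 *     const CUDAContextScope scope(this);  // pushes cuContext
 *     cuda_assert(cuCtxSynchronize());     // safe to call the driver API here
 *   }                                      // destructor pops the context
 */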
120 
121 bool CUDADevice::have_precompiled_kernels()
122 {
123   string cubins_path = path_get("lib");
124   return path_exists(cubins_path);
125 }
126 
127 bool CUDADevice::show_samples() const
128 {
129   /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
130   return true;
131 }
132 
133 BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
134 {
135   return BVH_LAYOUT_BVH2;
136 }
137 
138 void CUDADevice::set_error(const string &error)
139 {
140   Device::set_error(error);
141 
142   if (first_error) {
143     fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
144     fprintf(stderr,
145             "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
146     first_error = false;
147   }
148 }
149 
150 #  define cuda_assert(stmt) \
151     { \
152       CUresult result = stmt; \
153       if (result != CUDA_SUCCESS) { \
154         const char *name = cuewErrorString(result); \
155         set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
156       } \
157     } \
158     (void)0
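/* Note that cuda_assert() only records the failure through set_error() and then
 * continues; it does not abort, so subsequent CUDA calls in the same function
 * still run after an error. */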
159 
160 CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
161     : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL)
162 {
163   first_error = true;
164   background = background_;
165 
166   cuDevId = info.num;
167   cuDevice = 0;
168   cuContext = 0;
169 
170   cuModule = 0;
171   cuFilterModule = 0;
172 
173   split_kernel = NULL;
174 
175   need_texture_info = false;
176 
177   device_texture_headroom = 0;
178   device_working_headroom = 0;
179   move_texture_to_host = false;
180   map_host_limit = 0;
181   map_host_used = 0;
182   can_map_host = 0;
183   pitch_alignment = 0;
184 
185   functions.loaded = false;
186 
187   /* Initialize CUDA. */
188   CUresult result = cuInit(0);
189   if (result != CUDA_SUCCESS) {
190     set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
191     return;
192   }
193 
194   /* Setup device and context. */
195   result = cuDeviceGet(&cuDevice, cuDevId);
196   if (result != CUDA_SUCCESS) {
197     set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
198                             cuewErrorString(result)));
199     return;
200   }
201 
202   /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
203    * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
204    * so we can predict which memory to map to host. */
205   cuda_assert(
206       cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
207 
208   cuda_assert(cuDeviceGetAttribute(
209       &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
210 
211   unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
212   if (can_map_host) {
213     ctx_flags |= CU_CTX_MAP_HOST;
214     init_host_memory();
215   }
216 
217   /* Create context. */
218   if (background) {
219     result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
220   }
221   else {
222     result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
223 
224     if (result != CUDA_SUCCESS) {
225       result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
226       background = true;
227     }
228   }
229 
230   if (result != CUDA_SUCCESS) {
231     set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
232     return;
233   }
234 
235   int major, minor;
236   cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
237   cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
238   cuDevArchitecture = major * 100 + minor * 10;
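  /* For example, a compute capability 6.1 device stores 610 here. */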
239 
240   /* Pop context set by cuCtxCreate. */
241   cuCtxPopCurrent(NULL);
242 }
243 
244 CUDADevice::~CUDADevice()
245 {
246   task_pool.cancel();
247 
248   delete split_kernel;
249 
250   texture_info.free();
251 
252   cuda_assert(cuCtxDestroy(cuContext));
253 }
254 
255 bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
256 {
257   int major, minor;
258   cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
259   cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
260 
261   /* We only support sm_30 and above */
262   if (major < 3) {
263     set_error(string_printf(
264         "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
265     return false;
266   }
267 
268   return true;
269 }
270 
271 bool CUDADevice::check_peer_access(Device *peer_device)
272 {
273   if (peer_device == this) {
274     return false;
275   }
276   if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
277     return false;
278   }
279 
280   CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
281 
282   int can_access = 0;
283   cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
284   if (can_access == 0) {
285     return false;
286   }
287 
288   // Ensure array access over the link is possible as well (for 3D textures)
289   cuda_assert(cuDeviceGetP2PAttribute(&can_access,
290                                       CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
291                                       cuDevice,
292                                       peer_device_cuda->cuDevice));
293   if (can_access == 0) {
294     return false;
295   }
296 
297   // Enable peer access in both directions
298   {
299     const CUDAContextScope scope(this);
300     CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
301     if (result != CUDA_SUCCESS) {
302       set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
303                               cuewErrorString(result)));
304       return false;
305     }
306   }
307   {
308     const CUDAContextScope scope(peer_device_cuda);
309     CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
310     if (result != CUDA_SUCCESS) {
311       set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
312                               cuewErrorString(result)));
313       return false;
314     }
315   }
316 
317   return true;
318 }
319 
320 bool CUDADevice::use_adaptive_compilation()
321 {
322   return DebugFlags().cuda.adaptive_compile;
323 }
324 
325 bool CUDADevice::use_split_kernel()
326 {
327   return DebugFlags().cuda.split_kernel;
328 }
329 
330 /* Common NVCC flags which stay the same regardless of shading model and
331  * kernel sources MD5, and only depend on the compiler or compilation settings.
332  */
333 string CUDADevice::compile_kernel_get_common_cflags(
334     const DeviceRequestedFeatures &requested_features, bool filter, bool split)
335 {
336   const int machine = system_cpu_bits();
337   const string source_path = path_get("source");
338   const string include_path = source_path;
339   string cflags = string_printf(
340       "-m%d "
341       "--ptxas-options=\"-v\" "
342       "--use_fast_math "
343       "-DNVCC "
344       "-I\"%s\"",
345       machine,
346       include_path.c_str());
347   if (!filter && use_adaptive_compilation()) {
348     cflags += " " + requested_features.get_build_options();
349   }
350   const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
351   if (extra_cflags) {
352     cflags += string(" ") + string(extra_cflags);
353   }
354 #  ifdef WITH_CYCLES_DEBUG
355   cflags += " -D__KERNEL_DEBUG__";
356 #  endif
357 
358   if (split) {
359     cflags += " -D__SPLIT__";
360   }
361 
362 #  ifdef WITH_NANOVDB
363   cflags += " -DWITH_NANOVDB";
364 #  endif
365 
366   return cflags;
367 }
368 
369 string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features,
370                                   const char *name,
371                                   const char *base,
372                                   bool force_ptx)
373 {
374   /* Compute kernel name. */
375   int major, minor;
376   cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
377   cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
378 
379   /* Attempt to use kernel provided with Blender. */
380   if (!use_adaptive_compilation()) {
381     if (!force_ptx) {
382       const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
383       VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
384       if (path_exists(cubin)) {
385         VLOG(1) << "Using precompiled kernel.";
386         return cubin;
387       }
388     }
389 
390     /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
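    /* For example, on a compute capability 7.5 card this looks for
     * lib/kernel_compute_75.ptx, then 74, 73, ... down to 30, and stops at the
     * first one that exists on disk. */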
391     int ptx_major = major, ptx_minor = minor;
392     while (ptx_major >= 3) {
393       const string ptx = path_get(
394           string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
395       VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
396       if (path_exists(ptx)) {
397         VLOG(1) << "Using precompiled kernel.";
398         return ptx;
399       }
400 
401       if (ptx_minor > 0) {
402         ptx_minor--;
403       }
404       else {
405         ptx_major--;
406         ptx_minor = 9;
407       }
408     }
409   }
410 
411   /* Try to use locally compiled kernel. */
412   string source_path = path_get("source");
413   const string source_md5 = path_files_md5_hash(source_path);
414 
415   /* We include cflags into the MD5 hash so that changing the CUDA toolkit or other
416    * compiler command line arguments makes sure the cubin gets re-built.
417    */
418   string common_cflags = compile_kernel_get_common_cflags(
419       requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL);
420   const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
421 
422   const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
423   const char *const kernel_arch = force_ptx ? "compute" : "sm";
424   const string cubin_file = string_printf(
425       "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
426   const string cubin = path_cache_get(path_join("kernels", cubin_file));
427   VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
428   if (path_exists(cubin)) {
429     VLOG(1) << "Using locally compiled kernel.";
430     return cubin;
431   }
432 
433 #  ifdef _WIN32
434   if (!use_adaptive_compilation() && have_precompiled_kernels()) {
435     if (major < 3) {
436       set_error(
437           string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
438                         "Your GPU is not supported.",
439                         major,
440                         minor));
441     }
442     else {
443       set_error(
444           string_printf("CUDA binary kernel for this graphics card compute "
445                         "capability (%d.%d) not found.",
446                         major,
447                         minor));
448     }
449     return string();
450   }
451 #  endif
452 
453   /* Compile. */
454   const char *const nvcc = cuewCompilerPath();
455   if (nvcc == NULL) {
456     set_error(
457         "CUDA nvcc compiler not found. "
458         "Install CUDA toolkit in default location.");
459     return string();
460   }
461 
462   const int nvcc_cuda_version = cuewCompilerVersion();
463   VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
464   if (nvcc_cuda_version < 80) {
465     printf(
466         "Unsupported CUDA version %d.%d detected, "
467         "you need CUDA 8.0 or newer.\n",
468         nvcc_cuda_version / 10,
469         nvcc_cuda_version % 10);
470     return string();
471   }
472   else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102)) {
473     printf(
474         "CUDA version %d.%d detected, build may succeed but only "
475         "CUDA 10.1 and 10.2 are officially supported.\n",
476         nvcc_cuda_version / 10,
477         nvcc_cuda_version % 10);
478   }
479 
480   double starttime = time_dt();
481 
482   path_create_directories(cubin);
483 
484   source_path = path_join(path_join(source_path, "kernel"),
485                           path_join("kernels", path_join(base, string_printf("%s.cu", name))));
486 
487   string command = string_printf(
488       "\"%s\" "
489       "-arch=%s_%d%d "
490       "--%s \"%s\" "
491       "-o \"%s\" "
492       "%s",
493       nvcc,
494       kernel_arch,
495       major,
496       minor,
497       kernel_ext,
498       source_path.c_str(),
499       cubin.c_str(),
500       common_cflags.c_str());
501 
502   printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
503 
504 #  ifdef _WIN32
505   command = "call " + command;
506 #  endif
507   if (system(command.c_str()) != 0) {
508     set_error(
509         "Failed to execute compilation command, "
510         "see console for details.");
511     return string();
512   }
513 
514   /* Verify if compilation succeeded */
515   if (!path_exists(cubin)) {
516     set_error(
517         "CUDA kernel compilation failed, "
518         "see console for details.");
519     return string();
520   }
521 
522   printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
523 
524   return cubin;
525 }
526 
527 bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
528 {
529   /* TODO(sergey): Support kernel re-loading for CUDA devices.
530    *
531    * Currently re-loading the kernel will invalidate memory pointers,
532    * causing problems in cuCtxSynchronize.
533    */
534   if (cuFilterModule && cuModule) {
535     VLOG(1) << "Skipping kernel reload, not currently supported.";
536     return true;
537   }
538 
539   /* check if cuda init succeeded */
540   if (cuContext == 0)
541     return false;
542 
543   /* check if GPU is supported */
544   if (!support_device(requested_features))
545     return false;
546 
547   /* get kernel */
548   const char *kernel_name = use_split_kernel() ? "kernel_split" : "kernel";
549   string cubin = compile_kernel(requested_features, kernel_name);
550   if (cubin.empty())
551     return false;
552 
553   const char *filter_name = "filter";
554   string filter_cubin = compile_kernel(requested_features, filter_name);
555   if (filter_cubin.empty())
556     return false;
557 
558   /* open module */
559   CUDAContextScope scope(this);
560 
561   string cubin_data;
562   CUresult result;
563 
564   if (path_read_text(cubin, cubin_data))
565     result = cuModuleLoadData(&cuModule, cubin_data.c_str());
566   else
567     result = CUDA_ERROR_FILE_NOT_FOUND;
568 
569   if (result != CUDA_SUCCESS)
570     set_error(string_printf(
571         "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
572 
573   if (path_read_text(filter_cubin, cubin_data))
574     result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
575   else
576     result = CUDA_ERROR_FILE_NOT_FOUND;
577 
578   if (result != CUDA_SUCCESS)
579     set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)",
580                             filter_cubin.c_str(),
581                             cuewErrorString(result)));
582 
583   if (result == CUDA_SUCCESS) {
584     reserve_local_memory(requested_features);
585   }
586 
587   load_functions();
588 
589   return (result == CUDA_SUCCESS);
590 }
591 
592 void CUDADevice::load_functions()
593 {
594   /* TODO: load all functions here. */
595   if (functions.loaded) {
596     return;
597   }
598   functions.loaded = true;
599 
600   cuda_assert(cuModuleGetFunction(
601       &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
602   cuda_assert(cuModuleGetFunction(
603       &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
604   cuda_assert(cuModuleGetFunction(
605       &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
606   cuda_assert(cuModuleGetFunction(
607       &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
608 
609   cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
610   cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
611   cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
612   cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
613 
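  /* Query a reasonable block size for the adaptive sampling kernels via the
   * occupancy API; only the block size is kept, the minimum grid size result
   * is unused. */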
614   int unused_min_blocks;
615   cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
616                                                &functions.adaptive_num_threads_per_block,
617                                                functions.adaptive_scale_samples,
618                                                NULL,
619                                                0,
620                                                0));
621 }
622 
623 void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
624 {
625   if (use_split_kernel()) {
626     /* The split kernel mostly uses global memory and adaptive compilation,
627      * so it is currently difficult to predict how much is needed. */
628     return;
629   }
630 
631   /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
632    * needed for kernel launches, so that we can reliably figure out when
633    * to allocate scene data in mapped host memory. */
634   CUDAContextScope scope(this);
635 
636   size_t total = 0, free_before = 0, free_after = 0;
637   cuMemGetInfo(&free_before, &total);
638 
639   /* Get kernel function. */
640   CUfunction cuRender;
641 
642   if (requested_features.use_baking) {
643     cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
644   }
645   else if (requested_features.use_integrator_branched) {
646     cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
647   }
648   else {
649     cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
650   }
651 
652   cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
653 
654   int min_blocks, num_threads_per_block;
655   cuda_assert(
656       cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
657 
658   /* Launch the kernel; using just 1 block appears sufficient to reserve
659    * memory for all multiprocessors. It would still be good to do this in
660    * parallel for the multi-GPU case to make it faster. */
661   CUdeviceptr d_work_tiles = 0;
662   uint total_work_size = 0;
663 
664   void *args[] = {&d_work_tiles, &total_work_size};
665 
666   cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
667 
668   cuda_assert(cuCtxSynchronize());
669 
670   cuMemGetInfo(&free_after, &total);
671   VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
672           << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
673 
674 #  if 0
675   /* For testing mapped host memory, fill up device memory. */
676   const size_t keep_mb = 1024;
677 
678   while (free_after > keep_mb * 1024 * 1024LL) {
679     CUdeviceptr tmp;
680     cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
681     cuMemGetInfo(&free_after, &total);
682   }
683 #  endif
684 }
685 
686 void CUDADevice::init_host_memory()
687 {
688   /* Limit amount of host mapped memory, because allocating too much can
689    * cause system instability. Leave at least half or 4 GB of system
690    * memory free, whichever is smaller. */
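  /* For example, with 32 GB of system RAM the limit becomes 32 - 4 = 28 GB,
   * while with 6 GB of RAM only half (3 GB) may be mapped. */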
691   size_t default_limit = 4 * 1024 * 1024 * 1024LL;
692   size_t system_ram = system_physical_ram();
693 
694   if (system_ram > 0) {
695     if (system_ram / 2 > default_limit) {
696       map_host_limit = system_ram - default_limit;
697     }
698     else {
699       map_host_limit = system_ram / 2;
700     }
701   }
702   else {
703     VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
704     map_host_limit = 0;
705   }
706 
707   /* Amount of device memory to keep free after texture memory
708    * and working memory allocations, respectively. We set the working
709    * memory headroom lower so that some space is left after all
710    * texture memory allocations. */
711   device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
712   device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
713 
714   VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
715           << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
716 }
717 
718 void CUDADevice::load_texture_info()
719 {
720   if (need_texture_info) {
721     texture_info.copy_to_device();
722     need_texture_info = false;
723   }
724 }
725 
726 void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
727 {
728   /* Break out of recursive call, which can happen when moving memory on a multi device. */
729   static bool any_device_moving_textures_to_host = false;
730   if (any_device_moving_textures_to_host) {
731     return;
732   }
733 
734   /* Signal to reallocate textures in host memory only. */
735   move_texture_to_host = true;
736 
737   while (size > 0) {
738     /* Find suitable memory allocation to move. */
739     device_memory *max_mem = NULL;
740     size_t max_size = 0;
741     bool max_is_image = false;
742 
743     foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
744       device_memory &mem = *pair.first;
745       CUDAMem *cmem = &pair.second;
746 
747       /* We can only move textures allocated on this device (and not those from peer
748        * devices), and we need to ignore memory that is already on the host. */
749       if (!mem.is_resident(this) || cmem->use_mapped_host) {
750         continue;
751       }
752 
753       bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
754                         (&mem != &texture_info);
755       bool is_image = is_texture && (mem.data_height > 1);
756 
757       /* Can't move this type of memory. */
758       if (!is_texture || cmem->array) {
759         continue;
760       }
761 
762       /* For other textures, only move image textures. */
763       if (for_texture && !is_image) {
764         continue;
765       }
766 
767       /* Try to move largest allocation, prefer moving images. */
768       if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
769         max_is_image = is_image;
770         max_size = mem.device_size;
771         max_mem = &mem;
772       }
773     }
774 
775     /* Move to host memory. This part is mutex protected since
776      * multiple CUDA devices could be moving the memory. The
777      * first one will do it, and the rest will adopt the pointer. */
778     if (max_mem) {
779       VLOG(1) << "Move memory from device to host: " << max_mem->name;
780 
781       static thread_mutex move_mutex;
782       thread_scoped_lock lock(move_mutex);
783 
784       any_device_moving_textures_to_host = true;
785 
786       /* Potentially need to call back into the multi device, so pointer mapping
787        * and peer devices are updated. This is also necessary since the device
788        * pointer may just be a key here, so it cannot be accessed and freed directly.
789        * Unfortunately it does mean that memory is reallocated on all other
790        * devices as well, which is potentially dangerous when still in use (since
791        * a thread rendering on another device would only be caught in this mutex
792        * if it happens to do an allocation at the same time as well). */
793       max_mem->device_copy_to();
794       size = (max_size >= size) ? 0 : size - max_size;
795 
796       any_device_moving_textures_to_host = false;
797     }
798     else {
799       break;
800     }
801   }
802 
803   /* Unset flag before texture info is reloaded, since it should stay in device memory. */
804   move_texture_to_host = false;
805 
806   /* Update texture info array with new pointers. */
807   load_texture_info();
808 }
809 
810 CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
811 {
812   CUDAContextScope scope(this);
813 
814   CUdeviceptr device_pointer = 0;
815   size_t size = mem.memory_size() + pitch_padding;
816 
817   CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
818   const char *status = "";
819 
820   /* First try allocating in device memory, respecting headroom. We make
821    * an exception for texture info. It is small and frequently accessed,
822    * so treat it as working memory.
823    *
824    * If there is not enough room for working memory, we will try to move
825    * textures to host memory, assuming the performance impact would have
826    * been worse for working memory. */
827   bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
828   bool is_image = is_texture && (mem.data_height > 1);
829 
830   size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
831 
832   size_t total = 0, free = 0;
833   cuMemGetInfo(&free, &total);
834 
835   /* Move textures to host memory if needed. */
836   if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
837     move_textures_to_host(size + headroom - free, is_texture);
838     cuMemGetInfo(&free, &total);
839   }
840 
841   /* Allocate in device memory. */
842   if (!move_texture_to_host && (size + headroom) < free) {
843     mem_alloc_result = cuMemAlloc(&device_pointer, size);
844     if (mem_alloc_result == CUDA_SUCCESS) {
845       status = " in device memory";
846     }
847   }
848 
849   /* Fall back to mapped host memory if needed and possible. */
850 
851   void *shared_pointer = 0;
852 
853   if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
854     if (mem.shared_pointer) {
855       /* Another device already allocated host memory. */
856       mem_alloc_result = CUDA_SUCCESS;
857       shared_pointer = mem.shared_pointer;
858     }
859     else if (map_host_used + size < map_host_limit) {
860       /* Allocate host memory ourselves. */
861       mem_alloc_result = cuMemHostAlloc(
862           &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
863 
864       assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
865              (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
866     }
867 
868     if (mem_alloc_result == CUDA_SUCCESS) {
869       cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
870       map_host_used += size;
871       status = " in host memory";
872     }
873   }
874 
875   if (mem_alloc_result != CUDA_SUCCESS) {
876     status = " failed, out of device and host memory";
877     set_error("System is out of GPU and shared host memory");
878   }
879 
880   if (mem.name) {
881     VLOG(1) << "Buffer allocate: " << mem.name << ", "
882             << string_human_readable_number(mem.memory_size()) << " bytes. ("
883             << string_human_readable_size(mem.memory_size()) << ")" << status;
884   }
885 
886   mem.device_pointer = (device_ptr)device_pointer;
887   mem.device_size = size;
888   stats.mem_alloc(size);
889 
890   if (!mem.device_pointer) {
891     return NULL;
892   }
893 
894   /* Insert into map of allocations. */
895   CUDAMem *cmem = &cuda_mem_map[&mem];
896   if (shared_pointer != 0) {
897     /* Replace host pointer with our host allocation. Only works if
898      * CUDA memory layout is the same and has no pitch padding. Also
899      * does not work if we move textures to host during a render,
900      * since other devices might be using the memory. */
901 
902     if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
903         mem.host_pointer != shared_pointer) {
904       memcpy(shared_pointer, mem.host_pointer, size);
905 
906       /* A call to device_memory::host_free() should be preceded by
907        * a call to device_memory::device_free() for host memory
908        * allocated by a device to be handled properly. Two exceptions
909        * are here and a call in OptiXDevice::generic_alloc(), where
910        * the current host memory can be assumed to be allocated by
911        * device_memory::host_alloc(), not by a device. */
912 
913       mem.host_free();
914       mem.host_pointer = shared_pointer;
915     }
916     mem.shared_pointer = shared_pointer;
917     mem.shared_counter++;
918     cmem->use_mapped_host = true;
919   }
920   else {
921     cmem->use_mapped_host = false;
922   }
923 
924   return cmem;
925 }
926 
927 void CUDADevice::generic_copy_to(device_memory &mem)
928 {
929   if (!mem.host_pointer || !mem.device_pointer) {
930     return;
931   }
932 
933   /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
934    * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
935    * mem.host_pointer. */
936   if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
937     const CUDAContextScope scope(this);
938     cuda_assert(
939         cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
940   }
941 }
942 
943 void CUDADevice::generic_free(device_memory &mem)
944 {
945   if (mem.device_pointer) {
946     CUDAContextScope scope(this);
947     const CUDAMem &cmem = cuda_mem_map[&mem];
948 
949     /* If cmem.use_mapped_host is true, reference counting is used
950      * to safely free the mapped host memory. */
951 
952     if (cmem.use_mapped_host) {
953       assert(mem.shared_pointer);
954       if (mem.shared_pointer) {
955         assert(mem.shared_counter > 0);
956         if (--mem.shared_counter == 0) {
957           if (mem.host_pointer == mem.shared_pointer) {
958             mem.host_pointer = 0;
959           }
960           cuMemFreeHost(mem.shared_pointer);
961           mem.shared_pointer = 0;
962         }
963       }
964       map_host_used -= mem.device_size;
965     }
966     else {
967       /* Free device memory. */
968       cuda_assert(cuMemFree(mem.device_pointer));
969     }
970 
971     stats.mem_free(mem.device_size);
972     mem.device_pointer = 0;
973     mem.device_size = 0;
974 
975     cuda_mem_map.erase(cuda_mem_map.find(&mem));
976   }
977 }
978 
979 void CUDADevice::mem_alloc(device_memory &mem)
980 {
981   if (mem.type == MEM_PIXELS && !background) {
982     pixels_alloc(mem);
983   }
984   else if (mem.type == MEM_TEXTURE) {
985     assert(!"mem_alloc not supported for textures.");
986   }
987   else if (mem.type == MEM_GLOBAL) {
988     assert(!"mem_alloc not supported for global memory.");
989   }
990   else {
991     generic_alloc(mem);
992   }
993 }
994 
995 void CUDADevice::mem_copy_to(device_memory &mem)
996 {
997   if (mem.type == MEM_PIXELS) {
998     assert(!"mem_copy_to not supported for pixels.");
999   }
1000   else if (mem.type == MEM_GLOBAL) {
1001     global_free(mem);
1002     global_alloc(mem);
1003   }
1004   else if (mem.type == MEM_TEXTURE) {
1005     tex_free((device_texture &)mem);
1006     tex_alloc((device_texture &)mem);
1007   }
1008   else {
1009     if (!mem.device_pointer) {
1010       generic_alloc(mem);
1011     }
1012 
1013     generic_copy_to(mem);
1014   }
1015 }
1016 
1017 void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
1018 {
1019   if (mem.type == MEM_PIXELS && !background) {
1020     pixels_copy_from(mem, y, w, h);
1021   }
1022   else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
1023     assert(!"mem_copy_from not supported for textures.");
1024   }
1025   else if (mem.host_pointer) {
1026     const size_t size = elem * w * h;
1027     const size_t offset = elem * y * w;
1028 
1029     if (mem.device_pointer) {
1030       const CUDAContextScope scope(this);
1031       cuda_assert(cuMemcpyDtoH(
1032           (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
1033     }
1034     else {
1035       memset((char *)mem.host_pointer + offset, 0, size);
1036     }
1037   }
1038 }
1039 
1040 void CUDADevice::mem_zero(device_memory &mem)
1041 {
1042   if (!mem.device_pointer) {
1043     mem_alloc(mem);
1044   }
1045   if (!mem.device_pointer) {
1046     return;
1047   }
1048 
1049   /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
1050    * regardless of mem.host_pointer and mem.shared_pointer. */
1051   if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
1052     const CUDAContextScope scope(this);
1053     cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
1054   }
1055   else if (mem.host_pointer) {
1056     memset(mem.host_pointer, 0, mem.memory_size());
1057   }
1058 }
1059 
1060 void CUDADevice::mem_free(device_memory &mem)
1061 {
1062   if (mem.type == MEM_PIXELS && !background) {
1063     pixels_free(mem);
1064   }
1065   else if (mem.type == MEM_GLOBAL) {
1066     global_free(mem);
1067   }
1068   else if (mem.type == MEM_TEXTURE) {
1069     tex_free((device_texture &)mem);
1070   }
1071   else {
1072     generic_free(mem);
1073   }
1074 }
1075 
1076 device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
1077 {
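  /* No allocation happens here; the sub-pointer is simply an offset of
   * `offset` elements into the existing device buffer. */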
1078   return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
1079 }
1080 
1081 void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
1082 {
1083   CUDAContextScope scope(this);
1084   CUdeviceptr mem;
1085   size_t bytes;
1086 
1087   cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
1088   // assert(bytes == size);
1089   cuda_assert(cuMemcpyHtoD(mem, host, size));
1090 }
1091 
1092 void CUDADevice::global_alloc(device_memory &mem)
1093 {
1094   if (mem.is_resident(this)) {
1095     generic_alloc(mem);
1096     generic_copy_to(mem);
1097   }
1098 
1099   const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
1100 }
1101 
1102 void CUDADevice::global_free(device_memory &mem)
1103 {
1104   if (mem.is_resident(this) && mem.device_pointer) {
1105     generic_free(mem);
1106   }
1107 }
1108 
1109 void CUDADevice::tex_alloc(device_texture &mem)
1110 {
1111   CUDAContextScope scope(this);
1112 
1113   /* General variables for both architectures */
1114   string bind_name = mem.name;
1115   size_t dsize = datatype_size(mem.data_type);
1116   size_t size = mem.memory_size();
1117 
1118   CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
1119   switch (mem.info.extension) {
1120     case EXTENSION_REPEAT:
1121       address_mode = CU_TR_ADDRESS_MODE_WRAP;
1122       break;
1123     case EXTENSION_EXTEND:
1124       address_mode = CU_TR_ADDRESS_MODE_CLAMP;
1125       break;
1126     case EXTENSION_CLIP:
1127       address_mode = CU_TR_ADDRESS_MODE_BORDER;
1128       break;
1129     default:
1130       assert(0);
1131       break;
1132   }
1133 
1134   CUfilter_mode filter_mode;
1135   if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
1136     filter_mode = CU_TR_FILTER_MODE_POINT;
1137   }
1138   else {
1139     filter_mode = CU_TR_FILTER_MODE_LINEAR;
1140   }
1141 
1142   /* Image Texture Storage */
1143   CUarray_format_enum format;
1144   switch (mem.data_type) {
1145     case TYPE_UCHAR:
1146       format = CU_AD_FORMAT_UNSIGNED_INT8;
1147       break;
1148     case TYPE_UINT16:
1149       format = CU_AD_FORMAT_UNSIGNED_INT16;
1150       break;
1151     case TYPE_UINT:
1152       format = CU_AD_FORMAT_UNSIGNED_INT32;
1153       break;
1154     case TYPE_INT:
1155       format = CU_AD_FORMAT_SIGNED_INT32;
1156       break;
1157     case TYPE_FLOAT:
1158       format = CU_AD_FORMAT_FLOAT;
1159       break;
1160     case TYPE_HALF:
1161       format = CU_AD_FORMAT_HALF;
1162       break;
1163     default:
1164       assert(0);
1165       return;
1166   }
1167 
1168   CUDAMem *cmem = NULL;
1169   CUarray array_3d = NULL;
1170   size_t src_pitch = mem.data_width * dsize * mem.data_elements;
1171   size_t dst_pitch = src_pitch;
1172 
1173   if (!mem.is_resident(this)) {
1174     cmem = &cuda_mem_map[&mem];
1175     cmem->texobject = 0;
1176 
1177     if (mem.data_depth > 1) {
1178       array_3d = (CUarray)mem.device_pointer;
1179       cmem->array = array_3d;
1180     }
1181     else if (mem.data_height > 0) {
1182       dst_pitch = align_up(src_pitch, pitch_alignment);
1183     }
1184   }
1185   else if (mem.data_depth > 1) {
1186     /* 3D texture using an array; there is no API for linear memory. */
1187     CUDA_ARRAY3D_DESCRIPTOR desc;
1188 
1189     desc.Width = mem.data_width;
1190     desc.Height = mem.data_height;
1191     desc.Depth = mem.data_depth;
1192     desc.Format = format;
1193     desc.NumChannels = mem.data_elements;
1194     desc.Flags = 0;
1195 
1196     VLOG(1) << "Array 3D allocate: " << mem.name << ", "
1197             << string_human_readable_number(mem.memory_size()) << " bytes. ("
1198             << string_human_readable_size(mem.memory_size()) << ")";
1199 
1200     cuda_assert(cuArray3DCreate(&array_3d, &desc));
1201 
1202     if (!array_3d) {
1203       return;
1204     }
1205 
1206     CUDA_MEMCPY3D param;
1207     memset(&param, 0, sizeof(param));
1208     param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
1209     param.dstArray = array_3d;
1210     param.srcMemoryType = CU_MEMORYTYPE_HOST;
1211     param.srcHost = mem.host_pointer;
1212     param.srcPitch = src_pitch;
1213     param.WidthInBytes = param.srcPitch;
1214     param.Height = mem.data_height;
1215     param.Depth = mem.data_depth;
1216 
1217     cuda_assert(cuMemcpy3D(&param));
1218 
1219     mem.device_pointer = (device_ptr)array_3d;
1220     mem.device_size = size;
1221     stats.mem_alloc(size);
1222 
1223     cmem = &cuda_mem_map[&mem];
1224     cmem->texobject = 0;
1225     cmem->array = array_3d;
1226   }
1227   else if (mem.data_height > 0) {
1228     /* 2D texture, using pitch aligned linear memory. */
1229     dst_pitch = align_up(src_pitch, pitch_alignment);
1230     size_t dst_size = dst_pitch * mem.data_height;
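    /* For example, assuming the common 32 byte texture pitch alignment, a
     * 1001 pixel wide single-channel float texture (src_pitch = 4004 bytes)
     * gets padded to a 4032 byte dst_pitch, and the total padding
     * (dst_size - memory_size()) is passed to generic_alloc() below. */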
1231 
1232     cmem = generic_alloc(mem, dst_size - mem.memory_size());
1233     if (!cmem) {
1234       return;
1235     }
1236 
1237     CUDA_MEMCPY2D param;
1238     memset(&param, 0, sizeof(param));
1239     param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1240     param.dstDevice = mem.device_pointer;
1241     param.dstPitch = dst_pitch;
1242     param.srcMemoryType = CU_MEMORYTYPE_HOST;
1243     param.srcHost = mem.host_pointer;
1244     param.srcPitch = src_pitch;
1245     param.WidthInBytes = param.srcPitch;
1246     param.Height = mem.data_height;
1247 
1248     cuda_assert(cuMemcpy2DUnaligned(&param));
1249   }
1250   else {
1251     /* 1D texture, using linear memory. */
1252     cmem = generic_alloc(mem);
1253     if (!cmem) {
1254       return;
1255     }
1256 
1257     cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
1258   }
1259 
1260   /* Resize once */
1261   const uint slot = mem.slot;
1262   if (slot >= texture_info.size()) {
1263     /* Allocate some slots in advance, to reduce the number
1264      * of re-allocations. */
1265     texture_info.resize(slot + 128);
1266   }
1267 
1268   /* Set mapping and tag that we need to (re-)upload to the device. */
1269   texture_info[slot] = mem.info;
1270   need_texture_info = true;
1271 
1272   if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
1273       mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
1274     /* Kepler+, bindless textures. */
1275     CUDA_RESOURCE_DESC resDesc;
1276     memset(&resDesc, 0, sizeof(resDesc));
1277 
1278     if (array_3d) {
1279       resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
1280       resDesc.res.array.hArray = array_3d;
1281       resDesc.flags = 0;
1282     }
1283     else if (mem.data_height > 0) {
1284       resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
1285       resDesc.res.pitch2D.devPtr = mem.device_pointer;
1286       resDesc.res.pitch2D.format = format;
1287       resDesc.res.pitch2D.numChannels = mem.data_elements;
1288       resDesc.res.pitch2D.height = mem.data_height;
1289       resDesc.res.pitch2D.width = mem.data_width;
1290       resDesc.res.pitch2D.pitchInBytes = dst_pitch;
1291     }
1292     else {
1293       resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
1294       resDesc.res.linear.devPtr = mem.device_pointer;
1295       resDesc.res.linear.format = format;
1296       resDesc.res.linear.numChannels = mem.data_elements;
1297       resDesc.res.linear.sizeInBytes = mem.device_size;
1298     }
1299 
1300     CUDA_TEXTURE_DESC texDesc;
1301     memset(&texDesc, 0, sizeof(texDesc));
1302     texDesc.addressMode[0] = address_mode;
1303     texDesc.addressMode[1] = address_mode;
1304     texDesc.addressMode[2] = address_mode;
1305     texDesc.filterMode = filter_mode;
1306     texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
1307 
1308     cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
1309 
1310     texture_info[slot].data = (uint64_t)cmem->texobject;
1311   }
1312   else {
1313     texture_info[slot].data = (uint64_t)mem.device_pointer;
1314   }
1315 }
1316 
1317 void CUDADevice::tex_free(device_texture &mem)
1318 {
1319   if (mem.device_pointer) {
1320     CUDAContextScope scope(this);
1321     const CUDAMem &cmem = cuda_mem_map[&mem];
1322 
1323     if (cmem.texobject) {
1324       /* Free bindless texture. */
1325       cuTexObjectDestroy(cmem.texobject);
1326     }
1327 
1328     if (!mem.is_resident(this)) {
1329       /* Do not free memory here, since it was allocated on a different device. */
1330       cuda_mem_map.erase(cuda_mem_map.find(&mem));
1331     }
1332     else if (cmem.array) {
1333       /* Free array. */
1334       cuArrayDestroy(cmem.array);
1335       stats.mem_free(mem.device_size);
1336       mem.device_pointer = 0;
1337       mem.device_size = 0;
1338 
1339       cuda_mem_map.erase(cuda_mem_map.find(&mem));
1340     }
1341     else {
1342       generic_free(mem);
1343     }
1344   }
1345 }
1346 
1347 #  define CUDA_GET_BLOCKSIZE(func, w, h) \
1348     int threads_per_block; \
1349     cuda_assert( \
1350         cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
1351     int threads = (int)sqrt((float)threads_per_block); \
1352     int xblocks = ((w) + threads - 1) / threads; \
1353     int yblocks = ((h) + threads - 1) / threads;
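/* Example: for a kernel whose maximum block size is 1024 threads, this yields a
 * 32x32 thread block; a 1920x1080 region then launches 60x34 blocks. */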
1354 
1355 #  define CUDA_LAUNCH_KERNEL(func, args) \
1356     cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
1357 
1358 /* Similar to the above, but for 1-dimensional thread blocks. */
1359 #  define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
1360     int threads_per_block; \
1361     cuda_assert( \
1362         cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
1363     int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
1364     int yblocks = h;
1365 
1366 #  define CUDA_LAUNCH_KERNEL_1D(func, args) \
1367     cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
1368 
1369 bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,
1370                                            device_ptr guide_ptr,
1371                                            device_ptr variance_ptr,
1372                                            device_ptr out_ptr,
1373                                            DenoisingTask *task)
1374 {
1375   if (have_error())
1376     return false;
1377 
1378   CUDAContextScope scope(this);
1379 
1380   int stride = task->buffer.stride;
1381   int w = task->buffer.width;
1382   int h = task->buffer.h;
1383   int r = task->nlm_state.r;
1384   int f = task->nlm_state.f;
1385   float a = task->nlm_state.a;
1386   float k_2 = task->nlm_state.k_2;
1387 
1388   int pass_stride = task->buffer.pass_stride;
1389   int num_shifts = (2 * r + 1) * (2 * r + 1);
1390   int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
1391   int frame_offset = 0;
1392 
1393   if (have_error())
1394     return false;
1395 
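  /* The preallocated temporary buffer is treated as three consecutive regions:
   * the difference and blurred-difference arrays (pass_stride * num_shifts
   * floats each), followed by the per-pixel weight accumulator. */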
1396   CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
1397   CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
1398   CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
1399   CUdeviceptr scale_ptr = 0;
1400 
1401   cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
1402   cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
1403 
1404   {
1405     CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
1406     cuda_assert(cuModuleGetFunction(
1407         &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
1408     cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
1409     cuda_assert(cuModuleGetFunction(
1410         &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
1411     cuda_assert(cuModuleGetFunction(
1412         &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
1413 
1414     cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
1415     cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
1416     cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
1417     cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
1418 
1419     CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
1420 
1421     void *calc_difference_args[] = {&guide_ptr,
1422                                     &variance_ptr,
1423                                     &scale_ptr,
1424                                     &difference,
1425                                     &w,
1426                                     &h,
1427                                     &stride,
1428                                     &pass_stride,
1429                                     &r,
1430                                     &channel_offset,
1431                                     &frame_offset,
1432                                     &a,
1433                                     &k_2};
1434     void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
1435     void *calc_weight_args[] = {
1436         &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
1437     void *update_output_args[] = {&blurDifference,
1438                                   &image_ptr,
1439                                   &out_ptr,
1440                                   &weightAccum,
1441                                   &w,
1442                                   &h,
1443                                   &stride,
1444                                   &pass_stride,
1445                                   &channel_offset,
1446                                   &r,
1447                                   &f};
1448 
1449     CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
1450     CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1451     CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
1452     CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1453     CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
1454   }
1455 
1456   {
1457     CUfunction cuNLMNormalize;
1458     cuda_assert(
1459         cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
1460     cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
1461     void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
1462     CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
1463     CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
1464     cuda_assert(cuCtxSynchronize());
1465   }
1466 
1467   return !have_error();
1468 }
1469 
1470 bool CUDADevice::denoising_construct_transform(DenoisingTask *task)
1471 {
1472   if (have_error())
1473     return false;
1474 
1475   CUDAContextScope scope(this);
1476 
1477   CUfunction cuFilterConstructTransform;
1478   cuda_assert(cuModuleGetFunction(
1479       &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
1480   cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
1481   CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
1482 
1483   void *args[] = {&task->buffer.mem.device_pointer,
1484                   &task->tile_info_mem.device_pointer,
1485                   &task->storage.transform.device_pointer,
1486                   &task->storage.rank.device_pointer,
1487                   &task->filter_area,
1488                   &task->rect,
1489                   &task->radius,
1490                   &task->pca_threshold,
1491                   &task->buffer.pass_stride,
1492                   &task->buffer.frame_stride,
1493                   &task->buffer.use_time};
1494   CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
1495   cuda_assert(cuCtxSynchronize());
1496 
1497   return !have_error();
1498 }
1499 
1500 bool CUDADevice::denoising_accumulate(device_ptr color_ptr,
1501                                       device_ptr color_variance_ptr,
1502                                       device_ptr scale_ptr,
1503                                       int frame,
1504                                       DenoisingTask *task)
1505 {
1506   if (have_error())
1507     return false;
1508 
1509   CUDAContextScope scope(this);
1510 
1511   int r = task->radius;
1512   int f = 4;
1513   float a = 1.0f;
1514   float k_2 = task->nlm_k_2;
1515 
1516   int w = task->reconstruction_state.source_w;
1517   int h = task->reconstruction_state.source_h;
1518   int stride = task->buffer.stride;
1519   int frame_offset = frame * task->buffer.frame_stride;
1520   int t = task->tile_info->frames[frame];
1521 
1522   int pass_stride = task->buffer.pass_stride;
1523   int num_shifts = (2 * r + 1) * (2 * r + 1);
1524 
1525   if (have_error())
1526     return false;
1527 
1528   CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
1529   CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
1530 
1531   CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
1532   cuda_assert(cuModuleGetFunction(
1533       &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
1534   cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
1535   cuda_assert(
1536       cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
1537   cuda_assert(cuModuleGetFunction(
1538       &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
1539 
1540   cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
1541   cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
1542   cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
1543   cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
1544 
1545   CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
1546                         task->reconstruction_state.source_w * task->reconstruction_state.source_h,
1547                         num_shifts);
1548 
1549   void *calc_difference_args[] = {&color_ptr,
1550                                   &color_variance_ptr,
1551                                   &scale_ptr,
1552                                   &difference,
1553                                   &w,
1554                                   &h,
1555                                   &stride,
1556                                   &pass_stride,
1557                                   &r,
1558                                   &pass_stride,
1559                                   &frame_offset,
1560                                   &a,
1561                                   &k_2};
1562   void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
1563   void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
1564   void *construct_gramian_args[] = {&t,
1565                                     &blurDifference,
1566                                     &task->buffer.mem.device_pointer,
1567                                     &task->storage.transform.device_pointer,
1568                                     &task->storage.rank.device_pointer,
1569                                     &task->storage.XtWX.device_pointer,
1570                                     &task->storage.XtWY.device_pointer,
1571                                     &task->reconstruction_state.filter_window,
1572                                     &w,
1573                                     &h,
1574                                     &stride,
1575                                     &pass_stride,
1576                                     &r,
1577                                     &f,
1578                                     &frame_offset,
1579                                     &task->buffer.use_time};
1580 
1581   CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
1582   CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1583   CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
1584   CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1585   CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
1586   cuda_assert(cuCtxSynchronize());
1587 
1588   return !have_error();
1589 }
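
/* Illustrative sketch (not part of the build): how the temporary NLM buffers used
 * above are carved out of a single allocation. With radius r there are
 * (2 * r + 1) * (2 * r + 1) pixel shifts, and each of the two planes holds
 * pass_stride floats per shift:
 *
 *   int num_shifts = (2 * r + 1) * (2 * r + 1);
 *   size_t plane_bytes = sizeof(float) * pass_stride * num_shifts;
 *   CUdeviceptr difference = temporary_mem;                     // first plane
 *   CUdeviceptr blur_difference = temporary_mem + plane_bytes;  // second plane
 *
 * For example, r = 8 gives 17 * 17 = 289 shifts per pass element. */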
1590 
1591 bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
1592 {
1593   CUfunction cuFinalize;
1594   cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
1595   cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
1596   void *finalize_args[] = {&output_ptr,
1597                            &task->storage.rank.device_pointer,
1598                            &task->storage.XtWX.device_pointer,
1599                            &task->storage.XtWY.device_pointer,
1600                            &task->filter_area,
1601                            &task->reconstruction_state.buffer_params.x,
1602                            &task->render_buffer.samples};
1603   CUDA_GET_BLOCKSIZE(
1604       cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
1605   CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
1606   cuda_assert(cuCtxSynchronize());
1607 
1608   return !have_error();
1609 }
1610 
1611 bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,
1612                                           device_ptr b_ptr,
1613                                           device_ptr mean_ptr,
1614                                           device_ptr variance_ptr,
1615                                           int r,
1616                                           int4 rect,
1617                                           DenoisingTask *task)
1618 {
1619   if (have_error())
1620     return false;
1621 
1622   CUDAContextScope scope(this);
1623 
1624   CUfunction cuFilterCombineHalves;
1625   cuda_assert(cuModuleGetFunction(
1626       &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
1627   cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
1628   CUDA_GET_BLOCKSIZE(
1629       cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1630 
1631   void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
1632   CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
1633   cuda_assert(cuCtxSynchronize());
1634 
1635   return !have_error();
1636 }
1637 
1638 bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,
1639                                          device_ptr b_ptr,
1640                                          device_ptr sample_variance_ptr,
1641                                          device_ptr sv_variance_ptr,
1642                                          device_ptr buffer_variance_ptr,
1643                                          DenoisingTask *task)
1644 {
1645   if (have_error())
1646     return false;
1647 
1648   CUDAContextScope scope(this);
1649 
1650   CUfunction cuFilterDivideShadow;
1651   cuda_assert(cuModuleGetFunction(
1652       &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
1653   cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
1654   CUDA_GET_BLOCKSIZE(
1655       cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1656 
1657   void *args[] = {&task->render_buffer.samples,
1658                   &task->tile_info_mem.device_pointer,
1659                   &a_ptr,
1660                   &b_ptr,
1661                   &sample_variance_ptr,
1662                   &sv_variance_ptr,
1663                   &buffer_variance_ptr,
1664                   &task->rect,
1665                   &task->render_buffer.pass_stride,
1666                   &task->render_buffer.offset};
1667   CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
1668   cuda_assert(cuCtxSynchronize());
1669 
1670   return !have_error();
1671 }
1672 
1673 bool CUDADevice::denoising_get_feature(int mean_offset,
1674                                        int variance_offset,
1675                                        device_ptr mean_ptr,
1676                                        device_ptr variance_ptr,
1677                                        float scale,
1678                                        DenoisingTask *task)
1679 {
1680   if (have_error())
1681     return false;
1682 
1683   CUDAContextScope scope(this);
1684 
1685   CUfunction cuFilterGetFeature;
1686   cuda_assert(
1687       cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
1688   cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
1689   CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1690 
1691   void *args[] = {&task->render_buffer.samples,
1692                   &task->tile_info_mem.device_pointer,
1693                   &mean_offset,
1694                   &variance_offset,
1695                   &mean_ptr,
1696                   &variance_ptr,
1697                   &scale,
1698                   &task->rect,
1699                   &task->render_buffer.pass_stride,
1700                   &task->render_buffer.offset};
1701   CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
1702   cuda_assert(cuCtxSynchronize());
1703 
1704   return !have_error();
1705 }
1706 
1707 bool CUDADevice::denoising_write_feature(int out_offset,
1708                                          device_ptr from_ptr,
1709                                          device_ptr buffer_ptr,
1710                                          DenoisingTask *task)
1711 {
1712   if (have_error())
1713     return false;
1714 
1715   CUDAContextScope scope(this);
1716 
1717   CUfunction cuFilterWriteFeature;
1718   cuda_assert(cuModuleGetFunction(
1719       &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
1720   cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
1721   CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
1722 
1723   void *args[] = {&task->render_buffer.samples,
1724                   &task->reconstruction_state.buffer_params,
1725                   &task->filter_area,
1726                   &from_ptr,
1727                   &buffer_ptr,
1728                   &out_offset,
1729                   &task->rect};
1730   CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
1731   cuda_assert(cuCtxSynchronize());
1732 
1733   return !have_error();
1734 }
1735 
1736 bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,
1737                                            device_ptr variance_ptr,
1738                                            device_ptr depth_ptr,
1739                                            device_ptr output_ptr,
1740                                            DenoisingTask *task)
1741 {
1742   if (have_error())
1743     return false;
1744 
1745   CUDAContextScope scope(this);
1746 
1747   CUfunction cuFilterDetectOutliers;
1748   cuda_assert(cuModuleGetFunction(
1749       &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
1750   cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
1751   CUDA_GET_BLOCKSIZE(
1752       cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1753 
1754   void *args[] = {
1755       &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride};
1756 
1757   CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
1758   cuda_assert(cuCtxSynchronize());
1759 
1760   return !have_error();
1761 }
1762 
1763 void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
1764 {
1765   denoising.functions.construct_transform = function_bind(
1766       &CUDADevice::denoising_construct_transform, this, &denoising);
1767   denoising.functions.accumulate = function_bind(
1768       &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
1769   denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
1770   denoising.functions.divide_shadow = function_bind(
1771       &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1772   denoising.functions.non_local_means = function_bind(
1773       &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1774   denoising.functions.combine_halves = function_bind(
1775       &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1776   denoising.functions.get_feature = function_bind(
1777       &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
1778   denoising.functions.write_feature = function_bind(
1779       &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
1780   denoising.functions.detect_outliers = function_bind(
1781       &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1782 
1783   denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1784   denoising.render_buffer.samples = rtile.sample;
1785   denoising.buffer.gpu_temporary_mem = true;
1786 
1787   denoising.run_denoising(rtile);
1788 }
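
/* Note: the function_bind() calls above adapt CUDADevice member functions to the
 * callback signatures that DenoisingTask expects, pre-binding `this` and
 * &denoising. For instance the solve callback ends up equivalent to:
 *
 *   denoising.functions.solve(output_ptr);
 *   // -> this->denoising_solve(output_ptr, &denoising)
 */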
1789 
1790 void CUDADevice::adaptive_sampling_filter(uint filter_sample,
1791                                           WorkTile *wtile,
1792                                           CUdeviceptr d_wtile,
1793                                           CUstream stream)
1794 {
1795   const int num_threads_per_block = functions.adaptive_num_threads_per_block;
1796 
1797   /* These are a series of tiny kernels because there is no grid synchronization
1798    * from within a kernel, so we issue multiple launches instead (see the sketch after this function). */
1799   uint total_work_size = wtile->h * wtile->w;
1800   void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
1801   uint num_blocks = divide_up(total_work_size, num_threads_per_block);
1802   cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
1803                              num_blocks,
1804                              1,
1805                              1,
1806                              num_threads_per_block,
1807                              1,
1808                              1,
1809                              0,
1810                              stream,
1811                              args2,
1812                              0));
1813   total_work_size = wtile->h;
1814   num_blocks = divide_up(total_work_size, num_threads_per_block);
1815   cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
1816                              num_blocks,
1817                              1,
1818                              1,
1819                              num_threads_per_block,
1820                              1,
1821                              1,
1822                              0,
1823                              stream,
1824                              args2,
1825                              0));
1826   total_work_size = wtile->w;
1827   num_blocks = divide_up(total_work_size, num_threads_per_block);
1828   cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
1829                              num_blocks,
1830                              1,
1831                              1,
1832                              num_threads_per_block,
1833                              1,
1834                              1,
1835                              0,
1836                              stream,
1837                              args2,
1838                              0));
1839 }
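
/* A minimal sketch of the 1D launch pattern used above (hypothetical `func` and
 * `args`, assuming one thread per work element):
 *
 *   uint total_work_size = ...;
 *   uint num_blocks = divide_up(total_work_size, num_threads_per_block);
 *   cuda_assert(cuLaunchKernel(func,
 *                              num_blocks, 1, 1,             // grid dimensions
 *                              num_threads_per_block, 1, 1,  // block dimensions
 *                              0,                            // shared memory bytes
 *                              stream,                       // CUstream, 0 = default
 *                              args,                         // kernel parameters
 *                              0));                          // no extra options
 *
 * Launches issued into the same stream execute in order, so each kernel acts as
 * an implicit barrier for the next one. */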
1840 
1841 void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
1842                                         WorkTile *wtile,
1843                                         CUdeviceptr d_wtile,
1844                                         CUstream stream)
1845 {
1846   const int num_threads_per_block = functions.adaptive_num_threads_per_block;
1847   uint total_work_size = wtile->h * wtile->w;
1848 
1849   void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
1850   uint num_blocks = divide_up(total_work_size, num_threads_per_block);
1851   cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
1852                              num_blocks,
1853                              1,
1854                              1,
1855                              num_threads_per_block,
1856                              1,
1857                              1,
1858                              0,
1859                              stream,
1860                              args,
1861                              0));
1862 }
1863 
1864 void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
1865 {
1866   scoped_timer timer(&rtile.buffers->render_time);
1867 
1868   if (have_error())
1869     return;
1870 
1871   CUDAContextScope scope(this);
1872   CUfunction cuRender;
1873 
1874   /* Get kernel function. */
1875   if (rtile.task == RenderTile::BAKE) {
1876     cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
1877   }
1878   else if (task.integrator_branched) {
1879     cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
1880   }
1881   else {
1882     cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
1883   }
1884 
1885   if (have_error()) {
1886     return;
1887   }
1888 
1889   cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
1890 
1891   /* Allocate work tile. */
1892   work_tiles.alloc(1);
1893 
1894   WorkTile *wtile = work_tiles.data();
1895   wtile->x = rtile.x;
1896   wtile->y = rtile.y;
1897   wtile->w = rtile.w;
1898   wtile->h = rtile.h;
1899   wtile->offset = rtile.offset;
1900   wtile->stride = rtile.stride;
1901   wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
1902 
1903   /* Prepare work size. More step samples render faster, but for now we
1904    * remain conservative for GPUs connected to a display to avoid driver
1905    * timeouts and display freezing. */
1906   int min_blocks, num_threads_per_block;
1907   cuda_assert(
1908       cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
1909   if (!info.display_device) {
1910     min_blocks *= 8;
1911   }
1912 
1913   uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
1914   if (task.adaptive_sampling.use) {
1915     step_samples = task.adaptive_sampling.align_static_samples(step_samples);
1916   }
1917 
1918   /* Render all samples. */
1919   int start_sample = rtile.start_sample;
1920   int end_sample = rtile.start_sample + rtile.num_samples;
1921 
1922   for (int sample = start_sample; sample < end_sample; sample += step_samples) {
1923     /* Setup and copy work tile to device. */
1924     wtile->start_sample = sample;
1925     wtile->num_samples = min(step_samples, end_sample - sample);
1926     work_tiles.copy_to_device();
1927 
1928     CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
1929     uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
1930     uint num_blocks = divide_up(total_work_size, num_threads_per_block);
1931 
1932     /* Launch kernel. */
1933     void *args[] = {&d_work_tiles, &total_work_size};
1934 
1935     cuda_assert(
1936         cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
1937 
1938     /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
1939     uint filter_sample = sample + wtile->num_samples - 1;
1940     if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
1941       adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
1942     }
1943 
1944     cuda_assert(cuCtxSynchronize());
1945 
1946     /* Update progress. */
1947     rtile.sample = sample + wtile->num_samples;
1948     task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
1949 
1950     if (task.get_cancel()) {
1951       if (task.need_finish_queue == false)
1952         break;
1953     }
1954   }
1955 
1956   /* Finalize adaptive sampling. */
1957   if (task.adaptive_sampling.use) {
1958     CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
1959     adaptive_sampling_post(rtile, wtile, d_work_tiles);
1960     cuda_assert(cuCtxSynchronize());
1961     task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
1962   }
1963 }
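
/* Worked example (assumed numbers) for the step size computed above: if
 * cuOccupancyMaxPotentialBlockSize() reports min_blocks = 40 and
 * num_threads_per_block = 256, a headless device scales min_blocks to 320, so
 * for a 256x256 tile
 *
 *   step_samples = divide_up(320 * 256, 256 * 256) = divide_up(81920, 65536) = 2
 *
 * samples are rendered per launch, while a display device keeps min_blocks = 40
 * and renders one sample per launch to stay responsive. */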
1964 
1965 void CUDADevice::film_convert(DeviceTask &task,
1966                               device_ptr buffer,
1967                               device_ptr rgba_byte,
1968                               device_ptr rgba_half)
1969 {
1970   if (have_error())
1971     return;
1972 
1973   CUDAContextScope scope(this);
1974 
1975   CUfunction cuFilmConvert;
1976   CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
1977   CUdeviceptr d_buffer = (CUdeviceptr)buffer;
1978 
1979   /* get kernel function */
1980   if (rgba_half) {
1981     cuda_assert(
1982         cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
1983   }
1984   else {
1985     cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
1986   }
1987 
1988   float sample_scale = 1.0f / (task.sample + 1);
1989 
1990   /* pass in parameters */
1991   void *args[] = {&d_rgba,
1992                   &d_buffer,
1993                   &sample_scale,
1994                   &task.x,
1995                   &task.y,
1996                   &task.w,
1997                   &task.h,
1998                   &task.offset,
1999                   &task.stride};
2000 
2001   /* launch kernel */
2002   int threads_per_block;
2003   cuda_assert(cuFuncGetAttribute(
2004       &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
2005 
2006   int xthreads = (int)sqrt(threads_per_block);
2007   int ythreads = (int)sqrt(threads_per_block);
2008   int xblocks = (task.w + xthreads - 1) / xthreads;
2009   int yblocks = (task.h + ythreads - 1) / ythreads;
2010 
2011   cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
2012 
2013   cuda_assert(cuLaunchKernel(cuFilmConvert,
2014                              xblocks,
2015                              yblocks,
2016                              1, /* blocks */
2017                              xthreads,
2018                              ythreads,
2019                              1, /* threads */
2020                              0,
2021                              0,
2022                              args,
2023                              0));
2024 
2025   unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
2026 
2027   cuda_assert(cuCtxSynchronize());
2028 }
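
/* Sketch of the launch geometry above, assuming the kernel reports a maximum of
 * 1024 threads per block:
 *
 *   xthreads = ythreads = (int)sqrt(1024) = 32;
 *   xblocks = (task.w + 31) / 32;
 *   yblocks = (task.h + 31) / 32;
 *
 * so a 1920x1080 frame is converted by a 60x34 grid of 32x32 blocks. */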
2029 
2030 void CUDADevice::shader(DeviceTask &task)
2031 {
2032   if (have_error())
2033     return;
2034 
2035   CUDAContextScope scope(this);
2036 
2037   CUfunction cuShader;
2038   CUdeviceptr d_input = (CUdeviceptr)task.shader_input;
2039   CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
2040 
2041   /* get kernel function */
2042   if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
2043     cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
2044   }
2045   else {
2046     cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
2047   }
2048 
2049   /* Do the task in smaller chunks, so we can cancel it. */
2050   const int shader_chunk_size = 65536;
2051   const int start = task.shader_x;
2052   const int end = task.shader_x + task.shader_w;
2053   int offset = task.offset;
2054 
2055   bool canceled = false;
2056   for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
2057     for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
2058       int shader_w = min(shader_chunk_size, end - shader_x);
2059 
2060       /* pass in parameters */
2061       void *args[8];
2062       int arg = 0;
2063       args[arg++] = &d_input;
2064       args[arg++] = &d_output;
2065       args[arg++] = &task.shader_eval_type;
2066       if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
2067         args[arg++] = &task.shader_filter;
2068       }
2069       args[arg++] = &shader_x;
2070       args[arg++] = &shader_w;
2071       args[arg++] = &offset;
2072       args[arg++] = &sample;
2073 
2074       /* launch kernel */
2075       int threads_per_block;
2076       cuda_assert(cuFuncGetAttribute(
2077           &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
2078 
2079       int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
2080 
2081       cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
2082       cuda_assert(cuLaunchKernel(cuShader,
2083                                  xblocks,
2084                                  1,
2085                                  1, /* blocks */
2086                                  threads_per_block,
2087                                  1,
2088                                  1, /* threads */
2089                                  0,
2090                                  0,
2091                                  args,
2092                                  0));
2093 
2094       cuda_assert(cuCtxSynchronize());
2095 
2096       if (task.get_cancel()) {
2097         canceled = true;
2098         break;
2099       }
2100     }
2101 
2102     task.update_progress(NULL);
2103   }
2104 }
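
/* Example of the chunking above (hypothetical sizes): a task with
 * shader_w = 200000 points is evaluated as chunks of 65536, 65536, 65536 and
 * 3392 points, with task.get_cancel() checked after each chunk so a user abort
 * never waits for more than one chunk of work. */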
2105 
2106 CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
2107 {
2108   if (!background) {
2109     PixelMem pmem = pixel_mem_map[mem];
2110     CUdeviceptr buffer;
2111 
2112     size_t bytes;
2113     cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
2114     cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
2115 
2116     return buffer;
2117   }
2118 
2119   return (CUdeviceptr)mem;
2120 }
2121 
2122 void CUDADevice::unmap_pixels(device_ptr mem)
2123 {
2124   if (!background) {
2125     PixelMem pmem = pixel_mem_map[mem];
2126 
2127     cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
2128   }
2129 }
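
/* Usage sketch for the map/unmap pair above (hypothetical rgba_mem): the returned
 * pointer is only valid between the two calls, so kernels writing to the PBO are
 * bracketed like
 *
 *   CUdeviceptr d_rgba = map_pixels(rgba_mem);
 *   // ... launch kernels that write RGBA results to d_rgba ...
 *   unmap_pixels(rgba_mem);
 *
 * as film_convert() does; in background mode map_pixels() simply returns the
 * plain device pointer and unmap_pixels() is a no-op. */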
2130 
2131 void CUDADevice::pixels_alloc(device_memory &mem)
2132 {
2133   PixelMem pmem;
2134 
2135   pmem.w = mem.data_width;
2136   pmem.h = mem.data_height;
2137 
2138   CUDAContextScope scope(this);
2139 
2140   glGenBuffers(1, &pmem.cuPBO);
2141   glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
2142   if (mem.data_type == TYPE_HALF)
2143     glBufferData(
2144         GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
2145   else
2146     glBufferData(
2147         GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
2148 
2149   glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
2150 
2151   glActiveTexture(GL_TEXTURE0);
2152   glGenTextures(1, &pmem.cuTexId);
2153   glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
2154   if (mem.data_type == TYPE_HALF)
2155     glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
2156   else
2157     glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
2158   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
2159   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
2160   glBindTexture(GL_TEXTURE_2D, 0);
2161 
2162   CUresult result = cuGraphicsGLRegisterBuffer(
2163       &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
2164 
2165   if (result == CUDA_SUCCESS) {
2166     mem.device_pointer = pmem.cuTexId;
2167     pixel_mem_map[mem.device_pointer] = pmem;
2168 
2169     mem.device_size = mem.memory_size();
2170     stats.mem_alloc(mem.device_size);
2171 
2172     return;
2173   }
2174   else {
2175     /* failed to register buffer, fallback to no interop */
2176     glDeleteBuffers(1, &pmem.cuPBO);
2177     glDeleteTextures(1, &pmem.cuTexId);
2178 
2179     background = true;
2180   }
2181 }
2182 
2183 void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
2184 {
2185   PixelMem pmem = pixel_mem_map[mem.device_pointer];
2186 
2187   CUDAContextScope scope(this);
2188 
2189   glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
2190   uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
2191   size_t offset = sizeof(uchar) * 4 * y * w;
2192   memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
2193   glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
2194   glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
2195 }
2196 
2197 void CUDADevice::pixels_free(device_memory &mem)
2198 {
2199   if (mem.device_pointer) {
2200     PixelMem pmem = pixel_mem_map[mem.device_pointer];
2201 
2202     CUDAContextScope scope(this);
2203 
2204     cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
2205     glDeleteBuffers(1, &pmem.cuPBO);
2206     glDeleteTextures(1, &pmem.cuTexId);
2207 
2208     pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
2209     mem.device_pointer = 0;
2210 
2211     stats.mem_free(mem.device_size);
2212     mem.device_size = 0;
2213   }
2214 }
2215 
2216 void CUDADevice::draw_pixels(device_memory &mem,
2217                              int y,
2218                              int w,
2219                              int h,
2220                              int width,
2221                              int height,
2222                              int dx,
2223                              int dy,
2224                              int dw,
2225                              int dh,
2226                              bool transparent,
2227                              const DeviceDrawParams &draw_params)
2228 {
2229   assert(mem.type == MEM_PIXELS);
2230 
2231   if (!background) {
2232     const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
2233     PixelMem pmem = pixel_mem_map[mem.device_pointer];
2234     float *vpointer;
2235 
2236     CUDAContextScope scope(this);
2237 
2238     /* For multi-device rendering this uses the inefficient approach of allocating
2239      * all pixels on the device even though we only render to a subset. */
2240     size_t offset = 4 * y * w;
2241 
2242     if (mem.data_type == TYPE_HALF)
2243       offset *= sizeof(GLhalf);
2244     else
2245       offset *= sizeof(uint8_t);
2246 
2247     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
2248     glActiveTexture(GL_TEXTURE0);
2249     glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
2250     if (mem.data_type == TYPE_HALF) {
2251       glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
2252     }
2253     else {
2254       glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
2255     }
2256     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
2257 
2258     if (transparent) {
2259       glEnable(GL_BLEND);
2260       glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
2261     }
2262 
2263     GLint shader_program;
2264     if (use_fallback_shader) {
2265       if (!bind_fallback_display_space_shader(dw, dh)) {
2266         return;
2267       }
2268       shader_program = fallback_shader_program;
2269     }
2270     else {
2271       draw_params.bind_display_space_shader_cb();
2272       glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
2273     }
2274 
2275     if (!vertex_buffer) {
2276       glGenBuffers(1, &vertex_buffer);
2277     }
2278 
2279     glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
2280     /* invalidate old contents -
2281      * avoids stalling if buffer is still waiting in queue to be rendered */
2282     glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
2283 
2284     vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
2285 
2286     if (vpointer) {
2287       /* texture coordinate - vertex pair */
2288       vpointer[0] = 0.0f;
2289       vpointer[1] = 0.0f;
2290       vpointer[2] = dx;
2291       vpointer[3] = dy;
2292 
2293       vpointer[4] = (float)w / (float)pmem.w;
2294       vpointer[5] = 0.0f;
2295       vpointer[6] = (float)width + dx;
2296       vpointer[7] = dy;
2297 
2298       vpointer[8] = (float)w / (float)pmem.w;
2299       vpointer[9] = (float)h / (float)pmem.h;
2300       vpointer[10] = (float)width + dx;
2301       vpointer[11] = (float)height + dy;
2302 
2303       vpointer[12] = 0.0f;
2304       vpointer[13] = (float)h / (float)pmem.h;
2305       vpointer[14] = dx;
2306       vpointer[15] = (float)height + dy;
2307 
2308       glUnmapBuffer(GL_ARRAY_BUFFER);
2309     }
2310 
2311     GLuint vertex_array_object;
2312     GLuint position_attribute, texcoord_attribute;
2313 
2314     glGenVertexArrays(1, &vertex_array_object);
2315     glBindVertexArray(vertex_array_object);
2316 
2317     texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
2318     position_attribute = glGetAttribLocation(shader_program, "pos");
2319 
2320     glEnableVertexAttribArray(texcoord_attribute);
2321     glEnableVertexAttribArray(position_attribute);
2322 
2323     glVertexAttribPointer(
2324         texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
2325     glVertexAttribPointer(position_attribute,
2326                           2,
2327                           GL_FLOAT,
2328                           GL_FALSE,
2329                           4 * sizeof(float),
2330                           (const GLvoid *)(sizeof(float) * 2));
2331 
2332     glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
2333 
2334     if (use_fallback_shader) {
2335       glUseProgram(0);
2336     }
2337     else {
2338       draw_params.unbind_display_space_shader_cb();
2339     }
2340 
2341     if (transparent) {
2342       glDisable(GL_BLEND);
2343     }
2344 
2345     glBindTexture(GL_TEXTURE_2D, 0);
2346 
2347     return;
2348   }
2349 
2350   Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
2351 }
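
/* Layout of the 16 floats written above, as one (u, v, x, y) tuple per corner
 * of the triangle fan:
 *
 *   corner 0: (0,    0,    dx,         dy)
 *   corner 1: (w/pw, 0,    dx + width, dy)
 *   corner 2: (w/pw, h/ph, dx + width, dy + height)
 *   corner 3: (0,    h/ph, dx,         dy + height)
 *
 * where pw/ph are the allocated PBO dimensions, so the texture coordinates only
 * cover the rendered sub-rectangle. */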
2352 
2353 void CUDADevice::thread_run(DeviceTask &task)
2354 {
2355   CUDAContextScope scope(this);
2356 
2357   if (task.type == DeviceTask::RENDER) {
2358     DeviceRequestedFeatures requested_features;
2359     if (use_split_kernel()) {
2360       if (split_kernel == NULL) {
2361         split_kernel = new CUDASplitKernel(this);
2362         split_kernel->load_kernels(requested_features);
2363       }
2364     }
2365 
2366     device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
2367 
2368     /* keep rendering tiles until done */
2369     RenderTile tile;
2370     DenoisingTask denoising(this, task);
2371 
2372     while (task.acquire_tile(this, tile, task.tile_types)) {
2373       if (tile.task == RenderTile::PATH_TRACE) {
2374         if (use_split_kernel()) {
2375           device_only_memory<uchar> void_buffer(this, "void_buffer");
2376           split_kernel->path_trace(task, tile, void_buffer, void_buffer);
2377         }
2378         else {
2379           render(task, tile, work_tiles);
2380         }
2381       }
2382       else if (tile.task == RenderTile::BAKE) {
2383         render(task, tile, work_tiles);
2384       }
2385       else if (tile.task == RenderTile::DENOISE) {
2386         tile.sample = tile.start_sample + tile.num_samples;
2387 
2388         denoise(tile, denoising);
2389 
2390         task.update_progress(&tile, tile.w * tile.h);
2391       }
2392 
2393       task.release_tile(tile);
2394 
2395       if (task.get_cancel()) {
2396         if (task.need_finish_queue == false)
2397           break;
2398       }
2399     }
2400 
2401     work_tiles.free();
2402   }
2403   else if (task.type == DeviceTask::SHADER) {
2404     shader(task);
2405 
2406     cuda_assert(cuCtxSynchronize());
2407   }
2408   else if (task.type == DeviceTask::DENOISE_BUFFER) {
2409     RenderTile tile;
2410     tile.x = task.x;
2411     tile.y = task.y;
2412     tile.w = task.w;
2413     tile.h = task.h;
2414     tile.buffer = task.buffer;
2415     tile.sample = task.sample + task.num_samples;
2416     tile.num_samples = task.num_samples;
2417     tile.start_sample = task.sample;
2418     tile.offset = task.offset;
2419     tile.stride = task.stride;
2420     tile.buffers = task.buffers;
2421 
2422     DenoisingTask denoising(this, task);
2423     denoise(tile, denoising);
2424     task.update_progress(&tile, tile.w * tile.h);
2425   }
2426 }
2427 
2428 void CUDADevice::task_add(DeviceTask &task)
2429 {
2430   CUDAContextScope scope(this);
2431 
2432   /* Load texture info. */
2433   load_texture_info();
2434 
2435   /* Synchronize all memory copies before executing task. */
2436   cuda_assert(cuCtxSynchronize());
2437 
2438   if (task.type == DeviceTask::FILM_CONVERT) {
2439     /* must be done in main thread due to opengl access */
2440     film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
2441   }
2442   else {
2443     task_pool.push([=] {
2444       DeviceTask task_copy = task;
2445       thread_run(task_copy);
2446     });
2447   }
2448 }
2449 
2450 void CUDADevice::task_wait()
2451 {
2452   task_pool.wait();
2453 }
2454 
2455 void CUDADevice::task_cancel()
2456 {
2457   task_pool.cancel();
2458 }
2459 
2460 /* Redefine the cuda_assert macro so it can be used outside of the CUDADevice class,
2461  * now that the definition of that class is complete.
2462  */
2463 #  undef cuda_assert
2464 #  define cuda_assert(stmt) \
2465     { \
2466       CUresult result = stmt; \
2467       if (result != CUDA_SUCCESS) { \
2468         const char *name = cuewErrorString(result); \
2469         device->set_error( \
2470             string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
2471       } \
2472     } \
2473     (void)0
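
/* Usage sketch for the redefined macro (mirrors the call sites below): any scope
 * with a CUDADevice pointer named `device` can write
 *
 *   cuda_assert(cuCtxPushCurrent(device->cuContext));
 *
 * and a failing CUresult is reported through device->set_error() instead of the
 * in-class error path. */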
2474 
2475 /* CUDA context scope. */
2476 
2477 CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
2478 {
2479   cuda_assert(cuCtxPushCurrent(device->cuContext));
2480 }
2481 
2482 CUDAContextScope::~CUDAContextScope()
2483 {
2484   cuda_assert(cuCtxPopCurrent(NULL));
2485 }
2486 
2487 /* split kernel */
2488 
2489 class CUDASplitKernelFunction : public SplitKernelFunction {
2490   CUDADevice *device;
2491   CUfunction func;
2492 
2493  public:
2494   CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
2495   {
2496   }
2497 
2498   /* enqueue the kernel, returns false if there is an error */
2499   bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
2500   {
2501     return enqueue(dim, NULL);
2502   }
2503 
2504   /* enqueue the kernel, returns false if there is an error */
2505   bool enqueue(const KernelDimensions &dim, void *args[])
2506   {
2507     if (device->have_error())
2508       return false;
2509 
2510     CUDAContextScope scope(device);
2511 
2512     /* we ignore dim.local_size for now, as this is faster */
2513     int threads_per_block;
2514     cuda_assert(
2515         cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
2516 
2517     int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
2518                   threads_per_block;
2519 
2520     cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
2521 
2522     cuda_assert(cuLaunchKernel(func,
2523                                xblocks,
2524                                1,
2525                                1, /* blocks */
2526                                threads_per_block,
2527                                1,
2528                                1, /* threads */
2529                                0,
2530                                0,
2531                                args,
2532                                0));
2533 
2534     return !device->have_error();
2535   }
2536 };
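
/* Example of the 2D-to-1D flattening in enqueue() above (assumed sizes): for a
 * global size of 1024x512 and a reported limit of 640 threads per block,
 *
 *   xblocks = (1024 * 512 + 639) / 640 = 820
 *
 * so the work domain is launched as an 820x1x1 grid of 640x1x1 blocks and the
 * kernel recovers its 2D coordinates from the linear thread index. */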
2537 
2538 CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
2539 {
2540 }
2541 
2542 uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
2543                                             device_memory & /*data*/,
2544                                             size_t num_threads)
2545 {
2546   CUDAContextScope scope(device);
2547 
2548   device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
2549   size_buffer.alloc(1);
2550   size_buffer.zero_to_device();
2551 
2552   uint threads = num_threads;
2553   CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer;
2554 
2555   struct args_t {
2556     uint *num_threads;
2557     CUdeviceptr *size;
2558   };
2559 
2560   args_t args = {&threads, &d_size};
2561 
2562   CUfunction state_buffer_size;
2563   cuda_assert(
2564       cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
2565 
2566   cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
2567 
2568   size_buffer.copy_from_device(0, 1, 1);
2569   size_t size = size_buffer[0];
2570   size_buffer.free();
2571 
2572   return size;
2573 }
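
/* Note on the argument passing above: cuLaunchKernel() expects an array of
 * pointers to the individual kernel parameters, and a struct whose members are
 * exactly those pointers has the same in-memory layout, so (void **)&args is
 * equivalent to the more common spelling
 *
 *   void *params[] = {&threads, &d_size};
 */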
2574 
2575 bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
2576                                                      RenderTile &rtile,
2577                                                      int num_global_elements,
2578                                                      device_memory & /*kernel_globals*/,
2579                                                      device_memory & /*kernel_data*/,
2580                                                      device_memory &split_data,
2581                                                      device_memory &ray_state,
2582                                                      device_memory &queue_index,
2583                                                      device_memory &use_queues_flag,
2584                                                      device_memory &work_pool_wgs)
2585 {
2586   CUDAContextScope scope(device);
2587 
2588   CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer;
2589   CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer;
2590   CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer;
2591   CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer;
2592   CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer;
2593 
2594   CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer;
2595 
2596   int end_sample = rtile.start_sample + rtile.num_samples;
2597   int queue_size = dim.global_size[0] * dim.global_size[1];
2598 
2599   struct args_t {
2600     CUdeviceptr *split_data_buffer;
2601     int *num_elements;
2602     CUdeviceptr *ray_state;
2603     int *start_sample;
2604     int *end_sample;
2605     int *sx;
2606     int *sy;
2607     int *sw;
2608     int *sh;
2609     int *offset;
2610     int *stride;
2611     CUdeviceptr *queue_index;
2612     int *queuesize;
2613     CUdeviceptr *use_queues_flag;
2614     CUdeviceptr *work_pool_wgs;
2615     int *num_samples;
2616     CUdeviceptr *buffer;
2617   };
2618 
2619   args_t args = {&d_split_data,
2620                  &num_global_elements,
2621                  &d_ray_state,
2622                  &rtile.start_sample,
2623                  &end_sample,
2624                  &rtile.x,
2625                  &rtile.y,
2626                  &rtile.w,
2627                  &rtile.h,
2628                  &rtile.offset,
2629                  &rtile.stride,
2630                  &d_queue_index,
2631                  &queue_size,
2632                  &d_use_queues_flag,
2633                  &d_work_pool_wgs,
2634                  &rtile.num_samples,
2635                  &d_buffer};
2636 
2637   CUfunction data_init;
2638   cuda_assert(
2639       cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
2640   if (device->have_error()) {
2641     return false;
2642   }
2643 
2644   CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
2645 
2646   return !device->have_error();
2647 }
2648 
2649 SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
2650                                                                 const DeviceRequestedFeatures &)
2651 {
2652   const CUDAContextScope scope(device);
2653 
2654   CUfunction func;
2655   const CUresult result = cuModuleGetFunction(
2656       &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data());
2657   if (result != CUDA_SUCCESS) {
2658     device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)",
2659                                     kernel_name.data(),
2660                                     cuewErrorString(result)));
2661     return NULL;
2662   }
2663 
2664   return new CUDASplitKernelFunction(device, func);
2665 }
2666 
2667 int2 CUDASplitKernel::split_kernel_local_size()
2668 {
2669   return make_int2(32, 1);
2670 }
2671 
2672 int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
2673                                                device_memory &data,
2674                                                DeviceTask & /*task*/)
2675 {
2676   CUDAContextScope scope(device);
2677   size_t free;
2678   size_t total;
2679 
2680   cuda_assert(cuMemGetInfo(&free, &total));
2681 
2682   VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
2683           << " bytes. (" << string_human_readable_size(free) << ").";
2684 
2685   size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
2686   size_t side = round_down((int)sqrt(num_elements), 32);
2687   int2 global_size = make_int2(side, round_down(num_elements / side, 16));
2688   VLOG(1) << "Global size: " << global_size << ".";
2689   return global_size;
2690 }
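
/* Worked example (assumed numbers) for the sizing above: with 4 GiB free, half
 * of it is offered to max_elements_for_max_buffer_size(). If that budget allows
 * num_elements = 2,000,000 states, then
 *
 *   side        = round_down((int)sqrt(2000000), 32) = round_down(1414, 32) = 1408
 *   global_size = make_int2(1408, round_down(2000000 / 1408, 16)) = (1408, 1408)
 *
 * i.e. a roughly square domain aligned to the 32x1 local size. */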
2691 
2692 CCL_NAMESPACE_END
2693 
2694 #endif
2695