1 // This is brl/bseg/boxm2/ocl/algo/boxm2_ocl_fuse_based_visibility.cxx
2 #include <fstream>
3 #include <iostream>
4 #include <algorithm>
5 #include "boxm2_ocl_fuse_based_visibility.h"
6 //:
7 // \file
8 // \brief A process for fusing models based on visibility ( fusion of 3-d models ( zeroth order ) )
9 //
10 // \author Vishal Jain
11 // \date Nov 13, 2013
12
13 #ifdef _MSC_VER
14 # include "vcl_msvc_warnings.h"
15 #endif
16 #include <boxm2/boxm2_data_base.h>
17 #include <boxm2/ocl/boxm2_ocl_util.h>
18 #include <boxm2/boxm2_util.h>
19 #include <boxm2/ocl/algo/boxm2_ocl_camera_converter.h>
20 #include "vil/vil_image_view.h"
21
22 //directory utility
23 #include "vul/vul_timer.h"
24 #include <vcl_where_root_dir.h>
25 #include <bocl/bocl_device.h>
26 #include <bocl/bocl_kernel.h>
27 #include <boct/boct_bit_tree.h>
28 #include "vnl/vnl_numeric_traits.h"
29
30 //: Map of kernels should persist between process executions
31 std::map<std::string,std::vector<bocl_kernel*> > boxm2_ocl_fuse_based_visibility::kernels_;
32
33 //Main public method, updates color model
fuse_based_visibility(boxm2_scene_sptr sceneA,const boxm2_scene_sptr & sceneB,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache)34 bool boxm2_ocl_fuse_based_visibility::fuse_based_visibility(boxm2_scene_sptr sceneA,
35 const boxm2_scene_sptr& sceneB,
36 const bocl_device_sptr& device,
37 const boxm2_opencl_cache_sptr& opencl_cache)
38 {
39
40
41 float transfer_time=0.0f;
42 float gpu_time=0.0f;
43 std::size_t local_threads[1]={64};
44 std::size_t global_threads[1]={64};
45
46 bocl_mem_sptr centerX = new bocl_mem(device->context(), boct_bit_tree::centerX, sizeof(cl_float)*585, "centersX lookup buffer");
47 bocl_mem_sptr centerY = new bocl_mem(device->context(), boct_bit_tree::centerY, sizeof(cl_float)*585, "centersY lookup buffer");
48 bocl_mem_sptr centerZ = new bocl_mem(device->context(), boct_bit_tree::centerZ, sizeof(cl_float)*585, "centersZ lookup buffer");
49 centerX->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
50 centerY->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
51 centerZ->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
52 // output buffer for debugging
53 float output_buff[1000];
54 bocl_mem_sptr output = new bocl_mem(device->context(), output_buff, sizeof(float)*1000, "output" );
55 output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR );
56 // bit lookup buffer
57 cl_uchar lookup_arr[256];
58 boxm2_ocl_util::set_bit_lookup(lookup_arr);
59 bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
60 lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
61 int status = 0;
62 cl_command_queue queue = clCreateCommandQueue(device->context(),*(device->device_id()),CL_QUEUE_PROFILING_ENABLE,&status);
63 //cache size sanity check
64 std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
65 std::cout<<"Update MBs in cache: "<<binCache/(1024.0*1024.0)<<std::endl;
66
67 std::string options = "";
68 // compile the kernel if not already compiled
69 std::vector<bocl_kernel*>& kernels = get_kernels(device, options);
70 std::vector<boxm2_block_id> blocks_A = sceneA->get_block_ids();
71 std::vector<boxm2_block_id> blocks_B = sceneB->get_block_ids();
72 std::cout<<sceneA->data_path()<<" "<<sceneB->data_path()<<std::endl;
73 auto iter_blks_A = blocks_A.begin();
74 auto iter_blks_B = blocks_B.begin();
75
76 int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
77
78 bocl_kernel * kern = boxm2_ocl_fuse_based_visibility::get_kernels(device,"")[0];
79 for (;iter_blks_A!=blocks_A.end() || iter_blks_B!=blocks_B.end(); iter_blks_A++,iter_blks_B++)
80 {
81 if((*iter_blks_A) != (*iter_blks_B))
82 {
83 std::cout<<"Blocks do not match "<<(*iter_blks_A)<<" "<<(*iter_blks_B)<<std::endl;
84 return false;
85 }
86 bocl_mem* blk_A = opencl_cache->get_block(sceneA, *iter_blks_A);
87 bocl_mem* alpha_A = opencl_cache->get_data<BOXM2_ALPHA>(sceneA, *iter_blks_A,0,false);
88 bocl_mem* vis_score_A = opencl_cache->get_data<BOXM2_VIS_SCORE>(sceneA, *iter_blks_A,0,true);
89 bocl_mem* app_A = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneA, *iter_blks_A,0,false);
90 boxm2_scene_info* info_buffer_A = sceneA->get_blk_metadata(*iter_blks_A);
91 info_buffer_A->data_buffer_length = (int) (alpha_A->num_bytes()/alphaTypeSize);
92 bocl_mem* blk_info_A = new bocl_mem(device->context(), info_buffer_A, sizeof(boxm2_scene_info), " Scene Info" );
93 blk_info_A->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
94
95 bocl_mem* blk_B = opencl_cache->get_block(sceneB, *iter_blks_B);
96 bocl_mem* alpha_B = opencl_cache->get_data<BOXM2_ALPHA>(sceneB, *iter_blks_B,0,false);
97 bocl_mem* app_B = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneB, *iter_blks_A,0,false);
98 bocl_mem* vis_score_B = opencl_cache->get_data<BOXM2_VIS_SCORE>(sceneB, *iter_blks_B,0,true);
99 boxm2_scene_info* info_buffer_B = sceneB->get_blk_metadata(*iter_blks_B);
100 info_buffer_B->data_buffer_length = (int) (alpha_B->num_bytes()/alphaTypeSize);
101 bocl_mem* blk_info_B = new bocl_mem(device->context(), info_buffer_B, sizeof(boxm2_scene_info), " Scene Info" );
102 blk_info_B->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
103 global_threads[0] = (unsigned) RoundUp(info_buffer_A->scene_dims[0]*info_buffer_A->scene_dims[1]*info_buffer_A->scene_dims[2],(int)local_threads[0]);
104
105 std::cout<<alpha_A->num_bytes()<<" "<<alpha_B->num_bytes()<<std::endl;
106 kern->set_arg(centerX.ptr());
107 kern->set_arg(centerY.ptr());
108 kern->set_arg(centerZ.ptr());
109 kern->set_arg(lookup.ptr());
110 kern->set_arg(blk_info_A);
111 kern->set_arg(blk_info_B);
112 kern->set_arg(blk_A);
113 kern->set_arg(alpha_A);
114 kern->set_arg(vis_score_A);
115 kern->set_arg(app_A);
116 kern->set_arg(blk_B);
117 kern->set_arg(alpha_B);
118 kern->set_arg(vis_score_B);
119 kern->set_arg(app_B);
120 kern->set_arg(output.ptr());
121 kern->set_local_arg(local_threads[0]*10*sizeof(cl_uchar) ); // cumsum buffer,
122 kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
123 kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
124 if(!kern->execute(queue, 1, local_threads, global_threads))
125 {
126 std::cout<<"Kernel Failed to Execute "<<std::endl;
127 return false;
128 }
129 int status = clFinish(queue);
130 check_val(status, MEM_FAILURE, "Fusion ( Based on Visibility ) EXECUTE FAILED: " + error_to_string(status));
131 gpu_time += kern->exec_time();
132 //clear render kernel args so it can reset em on next execution
133 kern->clear_args();
134 clFinish(queue);
135 alpha_A->read_to_buffer(queue);
136 app_A->read_to_buffer(queue);
137 clFinish(queue);
138
139 opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_ALPHA>::prefix());
140 opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_GAUSS_RGB>::prefix());
141
142 blk_info_B->release_memory();
143 delete info_buffer_B;
144 blk_info_A->release_memory();
145 delete info_buffer_A;
146
147 }
148 std::cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<std::endl;
149 clReleaseCommandQueue(queue);
150 return true;
151 }
152
153
154 //Returns vector of color update kernels (and caches them per device
get_kernels(const bocl_device_sptr & device,const std::string & opts)155 std::vector<bocl_kernel*>& boxm2_ocl_fuse_based_visibility::get_kernels(const bocl_device_sptr& device, const std::string& opts)
156 {
157 // compile kernels if not already compiled
158 std::string identifier = device->device_identifier() + opts;
159 if (kernels_.find(identifier) != kernels_.end())
160 return kernels_[identifier];
161
162 //otherwise compile the kernels
163 std::cout<<"=== boxm2_ocl_fuse_based_visibility_process::compiling kernels on device "<<identifier<<"==="<<std::endl;
164
165 std::vector<std::string> src_paths;
166 std::string source_dir = boxm2_ocl_util::ocl_src_root();
167 src_paths.push_back(source_dir + "scene_info.cl");
168 src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
169 src_paths.push_back(source_dir + "pixel_conversion.cl");
170 src_paths.push_back(source_dir + "statistics_library_functions.cl");
171 src_paths.push_back(source_dir + "atomics_util.cl");
172 src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
173 src_paths.push_back(source_dir + "fusion/fusion_kernels.cl");
174 //compilation options
175 const std::string& options = opts;
176 //populate vector of kernels
177 std::vector<bocl_kernel*> vec_kernels;
178 //may need DIFF LIST OF SOURCES FOR
179 auto* fuse = new bocl_kernel();
180 std::string update_opts = options + " -D VISIBILITY_BASED";
181 fuse->create_kernel(&device->context(), device->device_id(), src_paths, "fuse_blockwise_based_visibility", update_opts, "fusion::fuse_blockwise_based_visibility");
182 vec_kernels.push_back(fuse);
183 //store and return
184 kernels_[identifier] = vec_kernels;
185 return kernels_[identifier];
186 }
187
188
189 //: Map of kernels should persist between process executions
190 std::map<std::string,std::vector<bocl_kernel*> > boxm2_ocl_fuse_based_orientation::kernels_;
191
192 //Main public method, updates color model
fuse_based_orientation(boxm2_scene_sptr sceneA,const boxm2_scene_sptr & sceneB,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache)193 bool boxm2_ocl_fuse_based_orientation::fuse_based_orientation(boxm2_scene_sptr sceneA,
194 const boxm2_scene_sptr& sceneB,
195 const bocl_device_sptr& device,
196 const boxm2_opencl_cache_sptr& opencl_cache)
197 {
198
199
200 float transfer_time=0.0f;
201 float gpu_time=0.0f;
202 std::size_t local_threads[1]={64};
203 std::size_t global_threads[1]={64};
204
205 bocl_mem_sptr centerX = new bocl_mem(device->context(), boct_bit_tree::centerX, sizeof(cl_float)*585, "centersX lookup buffer");
206 bocl_mem_sptr centerY = new bocl_mem(device->context(), boct_bit_tree::centerY, sizeof(cl_float)*585, "centersY lookup buffer");
207 bocl_mem_sptr centerZ = new bocl_mem(device->context(), boct_bit_tree::centerZ, sizeof(cl_float)*585, "centersZ lookup buffer");
208 centerX->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
209 centerY->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
210 centerZ->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
211 // output buffer for debugging
212 float output_buff[1000];
213 bocl_mem_sptr output = new bocl_mem(device->context(), output_buff, sizeof(float)*1000, "output" );
214 output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR );
215 // bit lookup buffer
216 cl_uchar lookup_arr[256];
217 boxm2_ocl_util::set_bit_lookup(lookup_arr);
218 bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
219 lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
220 int status = 0;
221 cl_command_queue queue = clCreateCommandQueue(device->context(),*(device->device_id()),CL_QUEUE_PROFILING_ENABLE,&status);
222 //cache size sanity check
223 std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
224 std::cout<<"Update MBs in cache: "<<binCache/(1024.0*1024.0)<<std::endl;
225
226 std::string options = "";
227 // compile the kernel if not already compiled
228 std::vector<bocl_kernel*>& kernels = get_kernels(device, options);
229 std::vector<boxm2_block_id> blocks_A = sceneA->get_block_ids();
230 std::vector<boxm2_block_id> blocks_B = sceneB->get_block_ids();
231 std::cout<<sceneA->data_path()<<" "<<sceneB->data_path()<<std::endl;
232 auto iter_blks_A = blocks_A.begin();
233 auto iter_blks_B = blocks_B.begin();
234
235 int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
236
237 bocl_kernel * kern = boxm2_ocl_fuse_based_orientation::get_kernels(device,"")[0];
238 for (;iter_blks_A!=blocks_A.end() || iter_blks_B!=blocks_B.end(); iter_blks_A++,iter_blks_B++)
239 {
240 if((*iter_blks_A) != (*iter_blks_B))
241 {
242 std::cout<<"Blocks do not match "<<(*iter_blks_A)<<" "<<(*iter_blks_B)<<std::endl;
243 return false;
244 }
245 bocl_mem* blk_A = opencl_cache->get_block(sceneA, *iter_blks_A);
246 bocl_mem* alpha_A = opencl_cache->get_data<BOXM2_ALPHA>(sceneA, *iter_blks_A,0,false);
247 bocl_mem* vis_A = opencl_cache->get_data<BOXM2_AUX3>(sceneA, *iter_blks_A,0,true,"normaldot");
248 bocl_mem* exp_A = opencl_cache->get_data<BOXM2_EXPECTATION>(sceneA, *iter_blks_A,0,true,"normaldot");
249 bocl_mem* app_A = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneA, *iter_blks_A,0,false);
250 boxm2_scene_info* info_buffer_A = sceneA->get_blk_metadata(*iter_blks_A);
251 info_buffer_A->data_buffer_length = (int) (alpha_A->num_bytes()/alphaTypeSize);
252 bocl_mem* blk_info_A = new bocl_mem(device->context(), info_buffer_A, sizeof(boxm2_scene_info), " Scene Info" );
253 blk_info_A->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
254
255 bocl_mem* blk_B = opencl_cache->get_block(sceneB, *iter_blks_B);
256 bocl_mem* alpha_B = opencl_cache->get_data<BOXM2_ALPHA>(sceneB, *iter_blks_B,0,false);
257 bocl_mem* app_B = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneB, *iter_blks_B,0,false);
258 bocl_mem* vis_B = opencl_cache->get_data<BOXM2_AUX3>(sceneB, *iter_blks_B,0,true,"normaldot");
259 bocl_mem* exp_B = opencl_cache->get_data<BOXM2_EXPECTATION>(sceneB, *iter_blks_B,0,true,"normaldot");
260 boxm2_scene_info* info_buffer_B = sceneB->get_blk_metadata(*iter_blks_B);
261 info_buffer_B->data_buffer_length = (int) (alpha_B->num_bytes()/alphaTypeSize);
262 bocl_mem* blk_info_B = new bocl_mem(device->context(), info_buffer_B, sizeof(boxm2_scene_info), " Scene Info" );
263 blk_info_B->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
264 global_threads[0] = (unsigned) RoundUp(info_buffer_A->scene_dims[0]*info_buffer_A->scene_dims[1]*info_buffer_A->scene_dims[2],(int)local_threads[0]);
265
266 std::cout<<alpha_A->num_bytes()<<" "<<alpha_B->num_bytes()<<std::endl;
267 kern->set_arg(centerX.ptr());
268 kern->set_arg(centerY.ptr());
269 kern->set_arg(centerZ.ptr());
270 kern->set_arg(lookup.ptr());
271 kern->set_arg(blk_info_A);
272 kern->set_arg(blk_info_B);
273 kern->set_arg(blk_A);
274 kern->set_arg(alpha_A);
275 kern->set_arg(exp_A);
276 kern->set_arg(vis_A);
277 kern->set_arg(app_A);
278 kern->set_arg(blk_B);
279 kern->set_arg(alpha_B);
280 kern->set_arg(exp_B);
281 kern->set_arg(vis_B);
282 kern->set_arg(app_B);
283 kern->set_arg(output.ptr());
284 kern->set_local_arg(local_threads[0]*10*sizeof(cl_uchar) ); // cumsum buffer,
285 kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
286 kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
287 if(!kern->execute(queue, 1, local_threads, global_threads))
288 {
289 std::cout<<"Kernel Failed to Execute "<<std::endl;
290 return false;
291 }
292 int status = clFinish(queue);
293 check_val(status, MEM_FAILURE, "Fusion ( Based on Visibility ) EXECUTE FAILED: " + error_to_string(status));
294 gpu_time += kern->exec_time();
295 //clear render kernel args so it can reset em on next execution
296 kern->clear_args();
297 clFinish(queue);
298 alpha_A->read_to_buffer(queue);
299 app_A->read_to_buffer(queue);
300 clFinish(queue);
301
302 opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_ALPHA>::prefix());
303 opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_GAUSS_RGB>::prefix());
304
305 blk_info_B->release_memory();
306 delete info_buffer_B;
307 blk_info_A->release_memory();
308 delete info_buffer_A;
309
310 }
311 std::cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<std::endl;
312 clReleaseCommandQueue(queue);
313 return true;
314 }
315
316
317 //Returns vector of color update kernels (and caches them per device
get_kernels(const bocl_device_sptr & device,const std::string & opts)318 std::vector<bocl_kernel*>& boxm2_ocl_fuse_based_orientation::get_kernels(const bocl_device_sptr& device, const std::string& opts)
319 {
320 // compile kernels if not already compiled
321 std::string identifier = device->device_identifier() + opts;
322 if (kernels_.find(identifier) != kernels_.end())
323 return kernels_[identifier];
324
325 //otherwise compile the kernels
326 std::cout<<"=== boxm2_ocl_fuse_based_visibility_process::compiling kernels on device "<<identifier<<"==="<<std::endl;
327
328 std::vector<std::string> src_paths;
329 std::string source_dir = boxm2_ocl_util::ocl_src_root();
330 src_paths.push_back(source_dir + "scene_info.cl");
331 src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
332 src_paths.push_back(source_dir + "pixel_conversion.cl");
333 src_paths.push_back(source_dir + "statistics_library_functions.cl");
334 src_paths.push_back(source_dir + "atomics_util.cl");
335 src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
336 src_paths.push_back(source_dir + "fusion/fusion_kernels.cl");
337 //compilation options
338 const std::string& options = opts;
339 //populate vector of kernels
340 std::vector<bocl_kernel*> vec_kernels;
341 //may need DIFF LIST OF SOURCES FOR
342 auto* fuse = new bocl_kernel();
343 std::string update_opts = options + " -D ORIENTATION_BASED";
344 fuse->create_kernel(&device->context(), device->device_id(), src_paths, "fuse_blockwise_based_orientation", update_opts, "fusion::fuse_blockwise_based_orientation");
345 vec_kernels.push_back(fuse);
346 //store and return
347 kernels_[identifier] = vec_kernels;
348 return kernels_[identifier];
349 }
350
351
352
353
354 //: Map of kernels should persist between process executions
355 std::map<std::string,std::vector<bocl_kernel*> > boxm2_ocl_fuse_surface_density::kernels_;
356
357 //Main public method, updates color model
fuse_surface_density(boxm2_scene_sptr sceneA,const boxm2_scene_sptr & sceneB,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache)358 bool boxm2_ocl_fuse_surface_density::fuse_surface_density(boxm2_scene_sptr sceneA,
359 const boxm2_scene_sptr& sceneB,
360 const bocl_device_sptr& device,
361 const boxm2_opencl_cache_sptr& opencl_cache)
362 {
363
364
365 float transfer_time=0.0f;
366 float gpu_time=0.0f;
367 std::size_t local_threads[1]={64};
368 std::size_t global_threads[1]={64};
369
370 bocl_mem_sptr centerX = new bocl_mem(device->context(), boct_bit_tree::centerX, sizeof(cl_float)*585, "centersX lookup buffer");
371 bocl_mem_sptr centerY = new bocl_mem(device->context(), boct_bit_tree::centerY, sizeof(cl_float)*585, "centersY lookup buffer");
372 bocl_mem_sptr centerZ = new bocl_mem(device->context(), boct_bit_tree::centerZ, sizeof(cl_float)*585, "centersZ lookup buffer");
373 centerX->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
374 centerY->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
375 centerZ->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
376 // output buffer for debugging
377 float output_buff[1000];
378 bocl_mem_sptr output = new bocl_mem(device->context(), output_buff, sizeof(float)*1000, "output" );
379 output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR );
380 // bit lookup buffer
381 cl_uchar lookup_arr[256];
382 boxm2_ocl_util::set_bit_lookup(lookup_arr);
383 bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
384 lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
385 int status = 0;
386 cl_command_queue queue = clCreateCommandQueue(device->context(),*(device->device_id()),CL_QUEUE_PROFILING_ENABLE,&status);
387 //cache size sanity check
388 std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
389 std::cout<<"Update MBs in cache: "<<binCache/(1024.0*1024.0)<<std::endl;
390
391 std::string options = "";
392 // compile the kernel if not already compiled
393 std::vector<bocl_kernel*>& kernels = get_kernels(device, options);
394 std::vector<boxm2_block_id> blocks_A = sceneA->get_block_ids();
395 std::vector<boxm2_block_id> blocks_B = sceneB->get_block_ids();
396 std::cout<<sceneA->data_path()<<" "<<sceneB->data_path()<<std::endl;
397 auto iter_blks_A = blocks_A.begin();
398 auto iter_blks_B = blocks_B.begin();
399
400 int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
401
402 bocl_kernel * kern = boxm2_ocl_fuse_surface_density::get_kernels(device,"")[0];
403 for (;iter_blks_A!=blocks_A.end() || iter_blks_B!=blocks_B.end(); iter_blks_A++,iter_blks_B++)
404 {
405 if((*iter_blks_A) != (*iter_blks_B))
406 {
407 std::cout<<"Blocks do not match "<<(*iter_blks_A)<<" "<<(*iter_blks_B)<<std::endl;
408 return false;
409 }
410 bocl_mem* blk_A = opencl_cache->get_block(sceneA, *iter_blks_A);
411 bocl_mem* alpha_A = opencl_cache->get_data<BOXM2_ALPHA>(sceneA, *iter_blks_A,0,false);
412 bocl_mem* vis_A = opencl_cache->get_data<BOXM2_AUX3>(sceneA, *iter_blks_A,0,true,"surfacedensity");
413 bocl_mem* exp_A = opencl_cache->get_data<BOXM2_EXPECTATION>(sceneA, *iter_blks_A,0,true,"surfacedensity");
414 bocl_mem* app_A = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneA, *iter_blks_A,0,false);
415 boxm2_scene_info* info_buffer_A = sceneA->get_blk_metadata(*iter_blks_A);
416 info_buffer_A->data_buffer_length = (int) (alpha_A->num_bytes()/alphaTypeSize);
417 bocl_mem* blk_info_A = new bocl_mem(device->context(), info_buffer_A, sizeof(boxm2_scene_info), " Scene Info" );
418 blk_info_A->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
419
420 bocl_mem* blk_B = opencl_cache->get_block(sceneB, *iter_blks_B);
421 bocl_mem* alpha_B = opencl_cache->get_data<BOXM2_ALPHA>(sceneB, *iter_blks_B,0,false);
422 bocl_mem* app_B = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneB, *iter_blks_B,0,false);
423 bocl_mem* vis_B = opencl_cache->get_data<BOXM2_AUX3>(sceneB, *iter_blks_B,0,true,"surfacedensity");
424 bocl_mem* exp_B = opencl_cache->get_data<BOXM2_EXPECTATION>(sceneB, *iter_blks_B,0,true,"surfacedensity");
425 boxm2_scene_info* info_buffer_B = sceneB->get_blk_metadata(*iter_blks_B);
426 info_buffer_B->data_buffer_length = (int) (alpha_B->num_bytes()/alphaTypeSize);
427 bocl_mem* blk_info_B = new bocl_mem(device->context(), info_buffer_B, sizeof(boxm2_scene_info), " Scene Info" );
428 blk_info_B->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
429 global_threads[0] = (unsigned) RoundUp(info_buffer_A->scene_dims[0]*info_buffer_A->scene_dims[1]*info_buffer_A->scene_dims[2],(int)local_threads[0]);
430
431 std::cout<<alpha_A->num_bytes()<<" "<<alpha_B->num_bytes()<<std::endl;
432 kern->set_arg(centerX.ptr());
433 kern->set_arg(centerY.ptr());
434 kern->set_arg(centerZ.ptr());
435 kern->set_arg(lookup.ptr());
436 kern->set_arg(blk_info_A);
437 kern->set_arg(blk_info_B);
438 kern->set_arg(blk_A);
439 kern->set_arg(alpha_A);
440 kern->set_arg(exp_A);
441 kern->set_arg(vis_A);
442 kern->set_arg(app_A);
443 kern->set_arg(blk_B);
444 kern->set_arg(alpha_B);
445 kern->set_arg(exp_B);
446 kern->set_arg(vis_B);
447 kern->set_arg(app_B);
448 kern->set_arg(output.ptr());
449 kern->set_local_arg(local_threads[0]*10*sizeof(cl_uchar) ); // cumsum buffer,
450 kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
451 kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
452 if(!kern->execute(queue, 1, local_threads, global_threads))
453 {
454 std::cout<<"Kernel Failed to Execute "<<std::endl;
455 return false;
456 }
457 int status = clFinish(queue);
458 check_val(status, MEM_FAILURE, "Fusion ( Based on Visibility ) EXECUTE FAILED: " + error_to_string(status));
459 gpu_time += kern->exec_time();
460 //clear render kernel args so it can reset em on next execution
461 kern->clear_args();
462 clFinish(queue);
463 alpha_A->read_to_buffer(queue);
464 app_A->read_to_buffer(queue);
465 clFinish(queue);
466
467 opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_ALPHA>::prefix());
468 opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_GAUSS_RGB>::prefix());
469
470 blk_info_B->release_memory();
471 delete info_buffer_B;
472 blk_info_A->release_memory();
473 delete info_buffer_A;
474
475 }
476 std::cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<std::endl;
477 clReleaseCommandQueue(queue);
478 return true;
479 }
480
481
482 //Returns vector of color update kernels (and caches them per device
get_kernels(const bocl_device_sptr & device,const std::string & opts)483 std::vector<bocl_kernel*>& boxm2_ocl_fuse_surface_density::get_kernels(const bocl_device_sptr& device, const std::string& opts)
484 {
485 // compile kernels if not already compiled
486 std::string identifier = device->device_identifier() + opts;
487 if (kernels_.find(identifier) != kernels_.end())
488 return kernels_[identifier];
489
490 //otherwise compile the kernels
491 std::cout<<"=== boxm2_ocl_fuse_based_visibility_process::compiling kernels on device "<<identifier<<"==="<<std::endl;
492
493 std::vector<std::string> src_paths;
494 std::string source_dir = boxm2_ocl_util::ocl_src_root();
495 src_paths.push_back(source_dir + "scene_info.cl");
496 src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
497 src_paths.push_back(source_dir + "pixel_conversion.cl");
498 src_paths.push_back(source_dir + "statistics_library_functions.cl");
499 src_paths.push_back(source_dir + "atomics_util.cl");
500 src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
501 src_paths.push_back(source_dir + "fusion/fusion_kernels.cl");
502 //compilation options
503 const std::string& options = opts;
504 //populate vector of kernels
505 std::vector<bocl_kernel*> vec_kernels;
506 //may need DIFF LIST OF SOURCES FOR
507 auto* fuse = new bocl_kernel();
508 std::string update_opts = options + " -D SURFACE_DENSITY_BASED";
509 fuse->create_kernel(&device->context(), device->device_id(), src_paths, "fuse_blockwise_based_surface_density", update_opts, "fusion::fuse_blockwise_based_surface_density");
510 vec_kernels.push_back(fuse);
511 //store and return
512 kernels_[identifier] = vec_kernels;
513 return kernels_[identifier];
514 }
515