1 // This is brl/bseg/boxm2/ocl/algo/boxm2_ocl_fuse_based_visibility.cxx
2 #include <fstream>
3 #include <iostream>
4 #include <algorithm>
5 #include "boxm2_ocl_fuse_based_visibility.h"
6 //:
7 // \file
8 // \brief  A process for fusing models based on visibility ( fusion of 3-d models ( zeroth order ) )
9 //
10 // \author Vishal Jain
11 // \date Nov 13, 2013
12 
13 #ifdef _MSC_VER
14 #  include "vcl_msvc_warnings.h"
15 #endif
16 #include <boxm2/boxm2_data_base.h>
17 #include <boxm2/ocl/boxm2_ocl_util.h>
18 #include <boxm2/boxm2_util.h>
19 #include <boxm2/ocl/algo/boxm2_ocl_camera_converter.h>
20 #include "vil/vil_image_view.h"
21 
22 //directory utility
23 #include "vul/vul_timer.h"
24 #include <vcl_where_root_dir.h>
25 #include <bocl/bocl_device.h>
26 #include <bocl/bocl_kernel.h>
27 #include <boct/boct_bit_tree.h>
28 #include "vnl/vnl_numeric_traits.h"
29 
30 //: Map of kernels should persist between process executions
31 std::map<std::string,std::vector<bocl_kernel*> > boxm2_ocl_fuse_based_visibility::kernels_;
32 
33 //Main public method, updates color model
fuse_based_visibility(boxm2_scene_sptr sceneA,const boxm2_scene_sptr & sceneB,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache)34 bool boxm2_ocl_fuse_based_visibility::fuse_based_visibility(boxm2_scene_sptr         sceneA,
35                                                             const boxm2_scene_sptr&         sceneB,
36                                                             const bocl_device_sptr&         device,
37                                                             const boxm2_opencl_cache_sptr&  opencl_cache)
38 {
39 
40 
41   float transfer_time=0.0f;
42   float gpu_time=0.0f;
43   std::size_t local_threads[1]={64};
44   std::size_t global_threads[1]={64};
45 
46   bocl_mem_sptr centerX = new bocl_mem(device->context(), boct_bit_tree::centerX, sizeof(cl_float)*585, "centersX lookup buffer");
47   bocl_mem_sptr centerY = new bocl_mem(device->context(), boct_bit_tree::centerY, sizeof(cl_float)*585, "centersY lookup buffer");
48   bocl_mem_sptr centerZ = new bocl_mem(device->context(), boct_bit_tree::centerZ, sizeof(cl_float)*585, "centersZ lookup buffer");
49   centerX->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
50   centerY->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
51   centerZ->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
52   // output buffer for debugging
53   float output_buff[1000];
54   bocl_mem_sptr output = new bocl_mem(device->context(), output_buff, sizeof(float)*1000, "output" );
55   output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR );
56   // bit lookup buffer
57   cl_uchar lookup_arr[256];
58   boxm2_ocl_util::set_bit_lookup(lookup_arr);
59   bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
60   lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
61   int status = 0;
62   cl_command_queue queue = clCreateCommandQueue(device->context(),*(device->device_id()),CL_QUEUE_PROFILING_ENABLE,&status);
63   //cache size sanity check
64   std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
65   std::cout<<"Update MBs in cache: "<<binCache/(1024.0*1024.0)<<std::endl;
66 
67   std::string options = "";
68   // compile the kernel if not already compiled
69   std::vector<bocl_kernel*>& kernels = get_kernels(device, options);
70   std::vector<boxm2_block_id> blocks_A = sceneA->get_block_ids();
71   std::vector<boxm2_block_id> blocks_B = sceneB->get_block_ids();
72   std::cout<<sceneA->data_path()<<" "<<sceneB->data_path()<<std::endl;
73   auto iter_blks_A = blocks_A.begin();
74   auto iter_blks_B = blocks_B.begin();
75 
76   int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
77 
78   bocl_kernel * kern = boxm2_ocl_fuse_based_visibility::get_kernels(device,"")[0];
79   for (;iter_blks_A!=blocks_A.end() || iter_blks_B!=blocks_B.end(); iter_blks_A++,iter_blks_B++)
80   {
81       if((*iter_blks_A) != (*iter_blks_B))
82       {
83           std::cout<<"Blocks do  not match "<<(*iter_blks_A)<<" "<<(*iter_blks_B)<<std::endl;
84           return false;
85       }
86       bocl_mem* blk_A       = opencl_cache->get_block(sceneA, *iter_blks_A);
87       bocl_mem* alpha_A     = opencl_cache->get_data<BOXM2_ALPHA>(sceneA, *iter_blks_A,0,false);
88       bocl_mem* vis_score_A     = opencl_cache->get_data<BOXM2_VIS_SCORE>(sceneA, *iter_blks_A,0,true);
89       bocl_mem* app_A     = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneA, *iter_blks_A,0,false);
90       boxm2_scene_info* info_buffer_A = sceneA->get_blk_metadata(*iter_blks_A);
91       info_buffer_A->data_buffer_length = (int) (alpha_A->num_bytes()/alphaTypeSize);
92       bocl_mem* blk_info_A  = new bocl_mem(device->context(), info_buffer_A, sizeof(boxm2_scene_info), " Scene Info" );
93       blk_info_A->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
94 
95       bocl_mem* blk_B       = opencl_cache->get_block(sceneB, *iter_blks_B);
96       bocl_mem* alpha_B     = opencl_cache->get_data<BOXM2_ALPHA>(sceneB, *iter_blks_B,0,false);
97       bocl_mem* app_B       = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneB, *iter_blks_A,0,false);
98       bocl_mem* vis_score_B = opencl_cache->get_data<BOXM2_VIS_SCORE>(sceneB, *iter_blks_B,0,true);
99       boxm2_scene_info* info_buffer_B = sceneB->get_blk_metadata(*iter_blks_B);
100       info_buffer_B->data_buffer_length = (int) (alpha_B->num_bytes()/alphaTypeSize);
101       bocl_mem* blk_info_B  = new bocl_mem(device->context(), info_buffer_B, sizeof(boxm2_scene_info), " Scene Info" );
102       blk_info_B->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
103       global_threads[0] = (unsigned) RoundUp(info_buffer_A->scene_dims[0]*info_buffer_A->scene_dims[1]*info_buffer_A->scene_dims[2],(int)local_threads[0]);
104 
105       std::cout<<alpha_A->num_bytes()<<" "<<alpha_B->num_bytes()<<std::endl;
106       kern->set_arg(centerX.ptr());
107       kern->set_arg(centerY.ptr());
108       kern->set_arg(centerZ.ptr());
109       kern->set_arg(lookup.ptr());
110       kern->set_arg(blk_info_A);
111       kern->set_arg(blk_info_B);
112       kern->set_arg(blk_A);
113       kern->set_arg(alpha_A);
114       kern->set_arg(vis_score_A);
115       kern->set_arg(app_A);
116       kern->set_arg(blk_B);
117       kern->set_arg(alpha_B);
118       kern->set_arg(vis_score_B);
119       kern->set_arg(app_B);
120       kern->set_arg(output.ptr());
121       kern->set_local_arg(local_threads[0]*10*sizeof(cl_uchar) );    // cumsum buffer,
122       kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
123       kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
124       if(!kern->execute(queue, 1, local_threads, global_threads))
125       {
126           std::cout<<"Kernel Failed to Execute "<<std::endl;
127           return false;
128       }
129       int status = clFinish(queue);
130       check_val(status, MEM_FAILURE, "Fusion ( Based on Visibility ) EXECUTE FAILED: " + error_to_string(status));
131       gpu_time += kern->exec_time();
132       //clear render kernel args so it can reset em on next execution
133       kern->clear_args();
134       clFinish(queue);
135       alpha_A->read_to_buffer(queue);
136       app_A->read_to_buffer(queue);
137       clFinish(queue);
138 
139       opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_ALPHA>::prefix());
140        opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_GAUSS_RGB>::prefix());
141 
142        blk_info_B->release_memory();
143       delete info_buffer_B;
144       blk_info_A->release_memory();
145       delete info_buffer_A;
146 
147   }
148   std::cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<std::endl;
149   clReleaseCommandQueue(queue);
150   return true;
151 }
152 
153 
154 //Returns vector of color update kernels (and caches them per device
get_kernels(const bocl_device_sptr & device,const std::string & opts)155 std::vector<bocl_kernel*>& boxm2_ocl_fuse_based_visibility::get_kernels(const bocl_device_sptr& device, const std::string& opts)
156 {
157   // compile kernels if not already compiled
158   std::string identifier = device->device_identifier() + opts;
159   if (kernels_.find(identifier) != kernels_.end())
160     return kernels_[identifier];
161 
162   //otherwise compile the kernels
163   std::cout<<"=== boxm2_ocl_fuse_based_visibility_process::compiling kernels on device "<<identifier<<"==="<<std::endl;
164 
165   std::vector<std::string> src_paths;
166   std::string source_dir = boxm2_ocl_util::ocl_src_root();
167   src_paths.push_back(source_dir + "scene_info.cl");
168   src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
169   src_paths.push_back(source_dir + "pixel_conversion.cl");
170   src_paths.push_back(source_dir + "statistics_library_functions.cl");
171   src_paths.push_back(source_dir + "atomics_util.cl");
172   src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
173   src_paths.push_back(source_dir + "fusion/fusion_kernels.cl");
174   //compilation options
175   const std::string& options = opts;
176   //populate vector of kernels
177   std::vector<bocl_kernel*> vec_kernels;
178   //may need DIFF LIST OF SOURCES FOR
179   auto* fuse = new bocl_kernel();
180   std::string update_opts = options + " -D VISIBILITY_BASED";
181   fuse->create_kernel(&device->context(), device->device_id(), src_paths, "fuse_blockwise_based_visibility", update_opts, "fusion::fuse_blockwise_based_visibility");
182   vec_kernels.push_back(fuse);
183   //store and return
184   kernels_[identifier] = vec_kernels;
185   return kernels_[identifier];
186 }
187 
188 
189 //: Map of kernels should persist between process executions
190 std::map<std::string,std::vector<bocl_kernel*> > boxm2_ocl_fuse_based_orientation::kernels_;
191 
192 //Main public method, updates color model
fuse_based_orientation(boxm2_scene_sptr sceneA,const boxm2_scene_sptr & sceneB,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache)193 bool boxm2_ocl_fuse_based_orientation::fuse_based_orientation(boxm2_scene_sptr         sceneA,
194                                                                             const boxm2_scene_sptr&         sceneB,
195                                                                             const bocl_device_sptr&         device,
196                                                                             const boxm2_opencl_cache_sptr&  opencl_cache)
197 {
198 
199 
200   float transfer_time=0.0f;
201   float gpu_time=0.0f;
202   std::size_t local_threads[1]={64};
203   std::size_t global_threads[1]={64};
204 
205   bocl_mem_sptr centerX = new bocl_mem(device->context(), boct_bit_tree::centerX, sizeof(cl_float)*585, "centersX lookup buffer");
206   bocl_mem_sptr centerY = new bocl_mem(device->context(), boct_bit_tree::centerY, sizeof(cl_float)*585, "centersY lookup buffer");
207   bocl_mem_sptr centerZ = new bocl_mem(device->context(), boct_bit_tree::centerZ, sizeof(cl_float)*585, "centersZ lookup buffer");
208   centerX->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
209   centerY->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
210   centerZ->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
211   // output buffer for debugging
212   float output_buff[1000];
213   bocl_mem_sptr output = new bocl_mem(device->context(), output_buff, sizeof(float)*1000, "output" );
214   output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR );
215   // bit lookup buffer
216   cl_uchar lookup_arr[256];
217   boxm2_ocl_util::set_bit_lookup(lookup_arr);
218   bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
219   lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
220   int status = 0;
221   cl_command_queue queue = clCreateCommandQueue(device->context(),*(device->device_id()),CL_QUEUE_PROFILING_ENABLE,&status);
222   //cache size sanity check
223   std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
224   std::cout<<"Update MBs in cache: "<<binCache/(1024.0*1024.0)<<std::endl;
225 
226   std::string options = "";
227   // compile the kernel if not already compiled
228   std::vector<bocl_kernel*>& kernels = get_kernels(device, options);
229   std::vector<boxm2_block_id> blocks_A = sceneA->get_block_ids();
230   std::vector<boxm2_block_id> blocks_B = sceneB->get_block_ids();
231   std::cout<<sceneA->data_path()<<" "<<sceneB->data_path()<<std::endl;
232   auto iter_blks_A = blocks_A.begin();
233   auto iter_blks_B = blocks_B.begin();
234 
235   int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
236 
237   bocl_kernel * kern = boxm2_ocl_fuse_based_orientation::get_kernels(device,"")[0];
238   for (;iter_blks_A!=blocks_A.end() || iter_blks_B!=blocks_B.end(); iter_blks_A++,iter_blks_B++)
239   {
240       if((*iter_blks_A) != (*iter_blks_B))
241       {
242           std::cout<<"Blocks do  not match "<<(*iter_blks_A)<<" "<<(*iter_blks_B)<<std::endl;
243           return false;
244       }
245       bocl_mem* blk_A       = opencl_cache->get_block(sceneA, *iter_blks_A);
246       bocl_mem* alpha_A     = opencl_cache->get_data<BOXM2_ALPHA>(sceneA, *iter_blks_A,0,false);
247       bocl_mem* vis_A     = opencl_cache->get_data<BOXM2_AUX3>(sceneA, *iter_blks_A,0,true,"normaldot");
248       bocl_mem* exp_A     = opencl_cache->get_data<BOXM2_EXPECTATION>(sceneA, *iter_blks_A,0,true,"normaldot");
249       bocl_mem* app_A     = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneA, *iter_blks_A,0,false);
250       boxm2_scene_info* info_buffer_A = sceneA->get_blk_metadata(*iter_blks_A);
251       info_buffer_A->data_buffer_length = (int) (alpha_A->num_bytes()/alphaTypeSize);
252       bocl_mem* blk_info_A  = new bocl_mem(device->context(), info_buffer_A, sizeof(boxm2_scene_info), " Scene Info" );
253       blk_info_A->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
254 
255       bocl_mem* blk_B       = opencl_cache->get_block(sceneB, *iter_blks_B);
256       bocl_mem* alpha_B     = opencl_cache->get_data<BOXM2_ALPHA>(sceneB, *iter_blks_B,0,false);
257       bocl_mem* app_B       = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneB, *iter_blks_B,0,false);
258       bocl_mem* vis_B     = opencl_cache->get_data<BOXM2_AUX3>(sceneB, *iter_blks_B,0,true,"normaldot");
259       bocl_mem* exp_B     = opencl_cache->get_data<BOXM2_EXPECTATION>(sceneB, *iter_blks_B,0,true,"normaldot");
260       boxm2_scene_info* info_buffer_B = sceneB->get_blk_metadata(*iter_blks_B);
261       info_buffer_B->data_buffer_length = (int) (alpha_B->num_bytes()/alphaTypeSize);
262       bocl_mem* blk_info_B  = new bocl_mem(device->context(), info_buffer_B, sizeof(boxm2_scene_info), " Scene Info" );
263       blk_info_B->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
264       global_threads[0] = (unsigned) RoundUp(info_buffer_A->scene_dims[0]*info_buffer_A->scene_dims[1]*info_buffer_A->scene_dims[2],(int)local_threads[0]);
265 
266       std::cout<<alpha_A->num_bytes()<<" "<<alpha_B->num_bytes()<<std::endl;
267       kern->set_arg(centerX.ptr());
268       kern->set_arg(centerY.ptr());
269       kern->set_arg(centerZ.ptr());
270       kern->set_arg(lookup.ptr());
271       kern->set_arg(blk_info_A);
272       kern->set_arg(blk_info_B);
273       kern->set_arg(blk_A);
274       kern->set_arg(alpha_A);
275       kern->set_arg(exp_A);
276       kern->set_arg(vis_A);
277       kern->set_arg(app_A);
278       kern->set_arg(blk_B);
279       kern->set_arg(alpha_B);
280       kern->set_arg(exp_B);
281       kern->set_arg(vis_B);
282       kern->set_arg(app_B);
283       kern->set_arg(output.ptr());
284       kern->set_local_arg(local_threads[0]*10*sizeof(cl_uchar) );    // cumsum buffer,
285       kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
286       kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
287       if(!kern->execute(queue, 1, local_threads, global_threads))
288       {
289           std::cout<<"Kernel Failed to Execute "<<std::endl;
290           return false;
291       }
292       int status = clFinish(queue);
293       check_val(status, MEM_FAILURE, "Fusion ( Based on Visibility ) EXECUTE FAILED: " + error_to_string(status));
294       gpu_time += kern->exec_time();
295       //clear render kernel args so it can reset em on next execution
296       kern->clear_args();
297       clFinish(queue);
298       alpha_A->read_to_buffer(queue);
299       app_A->read_to_buffer(queue);
300       clFinish(queue);
301 
302       opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_ALPHA>::prefix());
303        opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_GAUSS_RGB>::prefix());
304 
305        blk_info_B->release_memory();
306       delete info_buffer_B;
307       blk_info_A->release_memory();
308       delete info_buffer_A;
309 
310   }
311   std::cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<std::endl;
312   clReleaseCommandQueue(queue);
313   return true;
314 }
315 
316 
317 //Returns vector of color update kernels (and caches them per device
get_kernels(const bocl_device_sptr & device,const std::string & opts)318 std::vector<bocl_kernel*>& boxm2_ocl_fuse_based_orientation::get_kernels(const bocl_device_sptr& device, const std::string& opts)
319 {
320   // compile kernels if not already compiled
321   std::string identifier = device->device_identifier() + opts;
322   if (kernels_.find(identifier) != kernels_.end())
323     return kernels_[identifier];
324 
325   //otherwise compile the kernels
326   std::cout<<"=== boxm2_ocl_fuse_based_visibility_process::compiling kernels on device "<<identifier<<"==="<<std::endl;
327 
328   std::vector<std::string> src_paths;
329   std::string source_dir = boxm2_ocl_util::ocl_src_root();
330   src_paths.push_back(source_dir + "scene_info.cl");
331   src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
332   src_paths.push_back(source_dir + "pixel_conversion.cl");
333   src_paths.push_back(source_dir + "statistics_library_functions.cl");
334   src_paths.push_back(source_dir + "atomics_util.cl");
335   src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
336   src_paths.push_back(source_dir + "fusion/fusion_kernels.cl");
337   //compilation options
338   const std::string& options = opts;
339   //populate vector of kernels
340   std::vector<bocl_kernel*> vec_kernels;
341   //may need DIFF LIST OF SOURCES FOR
342   auto* fuse = new bocl_kernel();
343   std::string update_opts = options + " -D ORIENTATION_BASED";
344   fuse->create_kernel(&device->context(), device->device_id(), src_paths, "fuse_blockwise_based_orientation", update_opts, "fusion::fuse_blockwise_based_orientation");
345   vec_kernels.push_back(fuse);
346   //store and return
347   kernels_[identifier] = vec_kernels;
348   return kernels_[identifier];
349 }
350 
351 
352 
353 
354 //: Map of kernels should persist between process executions
355 std::map<std::string,std::vector<bocl_kernel*> > boxm2_ocl_fuse_surface_density::kernels_;
356 
357 //Main public method, updates color model
fuse_surface_density(boxm2_scene_sptr sceneA,const boxm2_scene_sptr & sceneB,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache)358 bool boxm2_ocl_fuse_surface_density::fuse_surface_density(boxm2_scene_sptr         sceneA,
359                                                           const boxm2_scene_sptr&         sceneB,
360                                                           const bocl_device_sptr&         device,
361                                                           const boxm2_opencl_cache_sptr&  opencl_cache)
362 {
363 
364 
365   float transfer_time=0.0f;
366   float gpu_time=0.0f;
367   std::size_t local_threads[1]={64};
368   std::size_t global_threads[1]={64};
369 
370   bocl_mem_sptr centerX = new bocl_mem(device->context(), boct_bit_tree::centerX, sizeof(cl_float)*585, "centersX lookup buffer");
371   bocl_mem_sptr centerY = new bocl_mem(device->context(), boct_bit_tree::centerY, sizeof(cl_float)*585, "centersY lookup buffer");
372   bocl_mem_sptr centerZ = new bocl_mem(device->context(), boct_bit_tree::centerZ, sizeof(cl_float)*585, "centersZ lookup buffer");
373   centerX->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
374   centerY->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
375   centerZ->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
376   // output buffer for debugging
377   float output_buff[1000];
378   bocl_mem_sptr output = new bocl_mem(device->context(), output_buff, sizeof(float)*1000, "output" );
379   output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR );
380   // bit lookup buffer
381   cl_uchar lookup_arr[256];
382   boxm2_ocl_util::set_bit_lookup(lookup_arr);
383   bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
384   lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
385   int status = 0;
386   cl_command_queue queue = clCreateCommandQueue(device->context(),*(device->device_id()),CL_QUEUE_PROFILING_ENABLE,&status);
387   //cache size sanity check
388   std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
389   std::cout<<"Update MBs in cache: "<<binCache/(1024.0*1024.0)<<std::endl;
390 
391   std::string options = "";
392   // compile the kernel if not already compiled
393   std::vector<bocl_kernel*>& kernels = get_kernels(device, options);
394   std::vector<boxm2_block_id> blocks_A = sceneA->get_block_ids();
395   std::vector<boxm2_block_id> blocks_B = sceneB->get_block_ids();
396   std::cout<<sceneA->data_path()<<" "<<sceneB->data_path()<<std::endl;
397   auto iter_blks_A = blocks_A.begin();
398   auto iter_blks_B = blocks_B.begin();
399 
400   int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
401 
402   bocl_kernel * kern = boxm2_ocl_fuse_surface_density::get_kernels(device,"")[0];
403   for (;iter_blks_A!=blocks_A.end() || iter_blks_B!=blocks_B.end(); iter_blks_A++,iter_blks_B++)
404   {
405       if((*iter_blks_A) != (*iter_blks_B))
406       {
407           std::cout<<"Blocks do  not match "<<(*iter_blks_A)<<" "<<(*iter_blks_B)<<std::endl;
408           return false;
409       }
410       bocl_mem* blk_A       = opencl_cache->get_block(sceneA, *iter_blks_A);
411       bocl_mem* alpha_A     = opencl_cache->get_data<BOXM2_ALPHA>(sceneA, *iter_blks_A,0,false);
412       bocl_mem* vis_A     = opencl_cache->get_data<BOXM2_AUX3>(sceneA, *iter_blks_A,0,true,"surfacedensity");
413       bocl_mem* exp_A     = opencl_cache->get_data<BOXM2_EXPECTATION>(sceneA, *iter_blks_A,0,true,"surfacedensity");
414       bocl_mem* app_A     = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneA, *iter_blks_A,0,false);
415       boxm2_scene_info* info_buffer_A = sceneA->get_blk_metadata(*iter_blks_A);
416       info_buffer_A->data_buffer_length = (int) (alpha_A->num_bytes()/alphaTypeSize);
417       bocl_mem* blk_info_A  = new bocl_mem(device->context(), info_buffer_A, sizeof(boxm2_scene_info), " Scene Info" );
418       blk_info_A->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
419 
420       bocl_mem* blk_B       = opencl_cache->get_block(sceneB, *iter_blks_B);
421       bocl_mem* alpha_B     = opencl_cache->get_data<BOXM2_ALPHA>(sceneB, *iter_blks_B,0,false);
422       bocl_mem* app_B       = opencl_cache->get_data<BOXM2_GAUSS_RGB>(sceneB, *iter_blks_B,0,false);
423       bocl_mem* vis_B     = opencl_cache->get_data<BOXM2_AUX3>(sceneB, *iter_blks_B,0,true,"surfacedensity");
424       bocl_mem* exp_B     = opencl_cache->get_data<BOXM2_EXPECTATION>(sceneB, *iter_blks_B,0,true,"surfacedensity");
425       boxm2_scene_info* info_buffer_B = sceneB->get_blk_metadata(*iter_blks_B);
426       info_buffer_B->data_buffer_length = (int) (alpha_B->num_bytes()/alphaTypeSize);
427       bocl_mem* blk_info_B  = new bocl_mem(device->context(), info_buffer_B, sizeof(boxm2_scene_info), " Scene Info" );
428       blk_info_B->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
429       global_threads[0] = (unsigned) RoundUp(info_buffer_A->scene_dims[0]*info_buffer_A->scene_dims[1]*info_buffer_A->scene_dims[2],(int)local_threads[0]);
430 
431       std::cout<<alpha_A->num_bytes()<<" "<<alpha_B->num_bytes()<<std::endl;
432       kern->set_arg(centerX.ptr());
433       kern->set_arg(centerY.ptr());
434       kern->set_arg(centerZ.ptr());
435       kern->set_arg(lookup.ptr());
436       kern->set_arg(blk_info_A);
437       kern->set_arg(blk_info_B);
438       kern->set_arg(blk_A);
439       kern->set_arg(alpha_A);
440       kern->set_arg(exp_A);
441       kern->set_arg(vis_A);
442       kern->set_arg(app_A);
443       kern->set_arg(blk_B);
444       kern->set_arg(alpha_B);
445       kern->set_arg(exp_B);
446       kern->set_arg(vis_B);
447       kern->set_arg(app_B);
448       kern->set_arg(output.ptr());
449       kern->set_local_arg(local_threads[0]*10*sizeof(cl_uchar) );    // cumsum buffer,
450       kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
451       kern->set_local_arg(16*local_threads[0]*sizeof(unsigned char)); // local trees
452       if(!kern->execute(queue, 1, local_threads, global_threads))
453       {
454           std::cout<<"Kernel Failed to Execute "<<std::endl;
455           return false;
456       }
457       int status = clFinish(queue);
458       check_val(status, MEM_FAILURE, "Fusion ( Based on Visibility ) EXECUTE FAILED: " + error_to_string(status));
459       gpu_time += kern->exec_time();
460       //clear render kernel args so it can reset em on next execution
461       kern->clear_args();
462       clFinish(queue);
463       alpha_A->read_to_buffer(queue);
464       app_A->read_to_buffer(queue);
465       clFinish(queue);
466 
467       opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_ALPHA>::prefix());
468        opencl_cache->get_cpu_cache()->remove_data_base(sceneA,(*iter_blks_A),boxm2_data_traits<BOXM2_GAUSS_RGB>::prefix());
469 
470        blk_info_B->release_memory();
471       delete info_buffer_B;
472       blk_info_A->release_memory();
473       delete info_buffer_A;
474 
475   }
476   std::cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<std::endl;
477   clReleaseCommandQueue(queue);
478   return true;
479 }
480 
481 
482 //Returns vector of color update kernels (and caches them per device
get_kernels(const bocl_device_sptr & device,const std::string & opts)483 std::vector<bocl_kernel*>& boxm2_ocl_fuse_surface_density::get_kernels(const bocl_device_sptr& device, const std::string& opts)
484 {
485   // compile kernels if not already compiled
486   std::string identifier = device->device_identifier() + opts;
487   if (kernels_.find(identifier) != kernels_.end())
488     return kernels_[identifier];
489 
490   //otherwise compile the kernels
491   std::cout<<"=== boxm2_ocl_fuse_based_visibility_process::compiling kernels on device "<<identifier<<"==="<<std::endl;
492 
493   std::vector<std::string> src_paths;
494   std::string source_dir = boxm2_ocl_util::ocl_src_root();
495   src_paths.push_back(source_dir + "scene_info.cl");
496   src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
497   src_paths.push_back(source_dir + "pixel_conversion.cl");
498   src_paths.push_back(source_dir + "statistics_library_functions.cl");
499   src_paths.push_back(source_dir + "atomics_util.cl");
500   src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
501   src_paths.push_back(source_dir + "fusion/fusion_kernels.cl");
502   //compilation options
503   const std::string& options = opts;
504   //populate vector of kernels
505   std::vector<bocl_kernel*> vec_kernels;
506   //may need DIFF LIST OF SOURCES FOR
507   auto* fuse = new bocl_kernel();
508   std::string update_opts = options + " -D SURFACE_DENSITY_BASED";
509   fuse->create_kernel(&device->context(), device->device_id(), src_paths, "fuse_blockwise_based_surface_density", update_opts, "fusion::fuse_blockwise_based_surface_density");
510   vec_kernels.push_back(fuse);
511   //store and return
512   kernels_[identifier] = vec_kernels;
513   return kernels_[identifier];
514 }
515