1 // This is brl/bseg/boxm2/ocl/algo/boxm2_ocl_update_auxQ.cxx
2 #include <fstream>
3 #include <iostream>
4 #include <algorithm>
5 #include "boxm2_ocl_update_auxQ.h"
6 //:
7 // \file
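// \brief  Per-view update of the aux0..aux3 data for grey MOG appearance models (boxm2_ocl_update_auxQ), plus the boxm2_ocl_update_PusingQ product helpers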
9 //
10 // \author Vishal Jain
11 // \date Mar 25, 2011
12 
13 #ifdef _MSC_VER
14 #  include "vcl_msvc_warnings.h"
15 #endif
16 #include <boxm2/ocl/boxm2_opencl_cache.h>
17 #include <boxm2/boxm2_scene.h>
18 #include <boxm2/boxm2_block.h>
19 #include <boxm2/boxm2_data_base.h>
20 #include <boxm2/ocl/boxm2_ocl_util.h>
21 #include <boxm2/boxm2_util.h>
22 #include <boxm2/ocl/algo/boxm2_ocl_camera_converter.h>
23 #include "vil/vil_image_view.h"
24 
25 //directory utility
26 #include "vul/vul_timer.h"
27 #include <vcl_where_root_dir.h>
28 #include <bocl/bocl_device.h>
29 #include <bocl/bocl_kernel.h>
30 
//: Map of compiled kernels; a static member so the kernels persist between process executions.
// get_kernels() keys it by device identifier + compile options and fills an entry
// the first time that combination is requested.
std::map<std::string,std::vector<bocl_kernel*> > boxm2_ocl_update_auxQ::kernels_;
33 
34 //Main public method, updates color model
update_auxQ(const boxm2_scene_sptr & scene,bocl_device_sptr device,const boxm2_opencl_cache_sptr & opencl_cache,vpgl_camera_double_sptr cam,const vil_image_view_base_sptr & img,const std::string & app_ident,std::string view_ident,float resnearfactor,float resfarfactor)35 bool boxm2_ocl_update_auxQ::update_auxQ(const boxm2_scene_sptr&         scene,
36                               bocl_device_sptr         device,
37                               const boxm2_opencl_cache_sptr&  opencl_cache,
38                               vpgl_camera_double_sptr  cam,
39                               const vil_image_view_base_sptr& img,
40                               const std::string&               app_ident,
41                               std::string               view_ident,
42                               float resnearfactor ,
43                               float resfarfactor )
44 {
45   enum {
46     UPDATE_SEGLEN = 0,
47     UPDATE_PREINF = 1,
48     UPDATE_PROC   = 2,
49     UPDATE_BAYES  = 3,
50     CONVERT_AUX_INT_FLOAT = 4,
51   };
52   float transfer_time=0.0f;
53   float gpu_time=0.0f;
54   std::size_t local_threads[2]={8,8};
55   std::size_t global_threads[2]={8,8};
56   view_ident = view_ident +"_curr";
57   //cache size sanity check
58   std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
59   std::cout<<"Update MBs in cache: "<<binCache/(1024.0*1024.0)<<std::endl;
60 
61   //make correct data types are here
62   std::string data_type, num_obs_type,options;
63 
64   int appTypeSize;
65   bool isRGB = false;
66   if (!validate_appearances(scene, data_type, appTypeSize, num_obs_type, options, isRGB))
67       return false;
68   if (app_ident.size() > 0) {
69       data_type += "_" + app_ident;
70       num_obs_type += "_" + app_ident;
71   }
72   // create a command queue.
73   int status=0;
74   cl_command_queue queue = clCreateCommandQueue( device->context(),
75                                                  *(device->device_id()),
76                                                  CL_QUEUE_PROFILING_ENABLE,
77                                                  &status);
78   if (status!=0)
79     return false;
80   // compile the kernel if not already compiled
81   std::vector<bocl_kernel*>& kernels = get_kernels(device, options);
82 
83   //grab input image, establish cl_ni, cl_nj (so global size is divisible by local size)
84   vil_image_view_base_sptr float_img = boxm2_util::prepare_input_image(img, true);
85   auto* img_view = static_cast<vil_image_view<float>* >(float_img.ptr());
86   auto cl_ni=(unsigned)RoundUp(img_view->ni(),(int)local_threads[0]);
87   auto cl_nj=(unsigned)RoundUp(img_view->nj(),(int)local_threads[1]);
88   global_threads[0]=cl_ni;
89   global_threads[1]=cl_nj;
90   //set generic cam
91   auto* ray_origins    = new cl_float[4*cl_ni*cl_nj];
92   auto* ray_directions = new cl_float[4*cl_ni*cl_nj];
93   bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_origins, "ray_origins buffer");
94   bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_directions, "ray_directions buffer");
95   boxm2_ocl_camera_converter::compute_ray_image( device, queue, cam, cl_ni, cl_nj, ray_o_buff, ray_d_buff);
96 
97   float tnearfar[2] = { 0.0f, 1000000} ;
98 
99   if(cam->type_name() == "vpgl_perspective_camera")
100   {
101 
102       float f  = ((vpgl_perspective_camera<double> *)cam.ptr())->get_calibration().focal_length()*((vpgl_perspective_camera<double> *)cam.ptr())->get_calibration().x_scale();
103       std::cout<<"Focal Length " << f<<std::endl;
104       tnearfar[0] = f* scene->finest_resolution()/resnearfactor ;
105       tnearfar[1] = f* scene->finest_resolution()*resfarfactor ;
106       std::cout<<"Near and Far Clipping planes "<<tnearfar[0]<<" "<<tnearfar[1]<<std::endl;
107   }
108   bocl_mem_sptr tnearfar_mem_ptr = opencl_cache->alloc_mem(2*sizeof(float), tnearfar, "tnearfar  buffer");
109   tnearfar_mem_ptr->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
110   //Visibility, Preinf, Norm, and input image buffers
111   auto* vis_buff = new float[cl_ni*cl_nj];
112   auto* pre_buff = new float[cl_ni*cl_nj];
113   auto* norm_buff = new float[cl_ni*cl_nj];
114   auto* input_buff=new float[cl_ni*cl_nj];
115   for (unsigned i=0;i<cl_ni*cl_nj;i++)
116   {
117     vis_buff[i]=1.0f;
118     pre_buff[i]=0.0f;
119     norm_buff[i]=0.0f;
120   }
121   //copy input vals into image
122   int count=0;
123   for (unsigned int j=0;j<cl_nj;++j) {
124     for (unsigned int i=0;i<cl_ni;++i) {
125       input_buff[count] = 0.0f;
126       if ( i<img_view->ni() && j< img_view->nj() )
127         input_buff[count] = (*img_view)(i,j);
128       ++count;
129     }
130   }
131   //bocl_mem_sptr in_image=new bocl_mem(device->context(),input_buff,cl_ni*cl_nj*sizeof(float),"input image buffer");
132   bocl_mem_sptr in_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), input_buff, "input image buffer");
133   in_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
134 
135   //bocl_mem_sptr vis_image=new bocl_mem(device->context(),vis_buff,cl_ni*cl_nj*sizeof(float),"vis image buffer");
136   bocl_mem_sptr vis_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), vis_buff, "vis image buffer");
137   vis_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
138 
139   //bocl_mem_sptr pre_image=new bocl_mem(device->context(),pre_buff,cl_ni*cl_nj*sizeof(float),"pre image buffer");
140   bocl_mem_sptr pre_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), pre_buff, "pre image buffer");
141   pre_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
142 
143   //bocl_mem_sptr norm_image=new bocl_mem(device->context(),norm_buff,cl_ni*cl_nj*sizeof(float),"pre image buffer");
144   bocl_mem_sptr norm_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), norm_buff, "norm image buffer");
145   norm_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
146 
147   // Image Dimensions
148   int img_dim_buff[4];
149   img_dim_buff[0] = 0;
150   img_dim_buff[1] = 0;
151   img_dim_buff[2] = img_view->ni();
152   img_dim_buff[3] = img_view->nj();
153 
154   bocl_mem_sptr img_dim=new bocl_mem(device->context(), img_dim_buff, sizeof(int)*4, "image dims");
155   img_dim->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
156 
157   // Output Array
158   float output_arr[100];
159   for (float & i : output_arr) i = 0.0f;
160   bocl_mem_sptr  cl_output=new bocl_mem(device->context(), output_arr, sizeof(float)*100, "output buffer");
161   cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
162 
163   // bit lookup buffer
164   cl_uchar lookup_arr[256];
165   boxm2_ocl_util::set_bit_lookup(lookup_arr);
166   bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
167   lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
168 
169   // app density used for proc_norm_image
170   float app_buffer[4]={1.0,0.0,0.0,0.0};
171   bocl_mem_sptr app_density = new bocl_mem(device->context(), app_buffer, sizeof(cl_float4), "app density buffer");
172   app_density->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
173 
174   // set arguments
175   std::vector<boxm2_block_id> vis_order;
176   //if(cam->type_name() == "vpgl_perspective_camera")
177   //    vis_order= scene->get_vis_blocks_opt((vpgl_perspective_camera<double>*)cam.ptr(),img_view->ni(),img_view->nj());
178   //else
179       vis_order= scene->get_vis_blocks(cam);
180   std::vector<boxm2_block_id>::iterator id;
181   for (unsigned int i=0; i<kernels.size(); ++i)
182   {
183     if ( i == UPDATE_PROC ) {
184       bocl_kernel * proc_kern=kernels[i];
185 
186       proc_kern->set_arg( norm_image.ptr() );
187       proc_kern->set_arg( vis_image.ptr() );
188       proc_kern->set_arg( pre_image.ptr());
189       proc_kern->set_arg( img_dim.ptr() );
190 
191       //execute kernel
192       proc_kern->execute( queue, 2, local_threads, global_threads);
193       int status = clFinish(queue);
194       if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
195         return false;
196       proc_kern->clear_args();
197       norm_image->read_to_buffer(queue);
198       vil_image_view<float> norm(img_view->ni(),img_view->nj());
199 
200       int count=0;
201       for (unsigned int j=0;j<cl_nj;++j) {
202           for (unsigned int i=0;i<cl_ni;++i) {
203               if ( i<img_view->ni() && j< img_view->nj() )
204                   norm(i,j) = norm_buff[count] ;
205               ++count;
206           }
207       }
208       continue;
209     }
210 
211     //set masked values
212     vis_image->read_to_buffer(queue);
213     for (id = vis_order.begin(); id != vis_order.end(); ++id)
214     {
215       //choose correct render kernel
216       boxm2_block_metadata mdata = scene->get_block_metadata(*id);
217       bocl_kernel* kern = kernels[i];
218 
219       //write the image values to the buffer
220       vul_timer transfer;
221       bocl_mem* blk       = opencl_cache->get_block(scene,*id);
222       bocl_mem* blk_info  = opencl_cache->loaded_block_info();
223       bocl_mem* alpha     = opencl_cache->get_data<BOXM2_ALPHA>(scene,*id,0,false);
224       auto* info_buffer = (boxm2_scene_info*) blk_info->cpu_buffer();
225       int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
226       info_buffer->data_buffer_length = (int) (alpha->num_bytes()/alphaTypeSize);
227       blk_info->write_to_buffer((queue));
228 
229       int nobsTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_NUM_OBS>::prefix());
230       int appTypeSize =  (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_MOG3_GREY>::prefix());
231       // data type string may contain an identifier so determine the buffer size
232       bocl_mem* mog       = opencl_cache->get_data(scene,*id,data_type,alpha->num_bytes()/alphaTypeSize*appTypeSize,false);    //info_buffer->data_buffer_length*boxm2_data_info::datasize(data_type));
233       bocl_mem* num_obs   = opencl_cache->get_data(scene,*id,num_obs_type,alpha->num_bytes()/alphaTypeSize*nobsTypeSize,false);//,info_buffer->data_buffer_length*boxm2_data_info::datasize(num_obs_type));
234 
235       //grab an appropriately sized AUX data buffer
236       int auxTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_AUX0>::prefix());
237       bocl_mem *aux0   = opencl_cache->get_data(scene,*id, boxm2_data_traits<BOXM2_AUX0>::prefix(view_ident),info_buffer->data_buffer_length*auxTypeSize,false);
238       auxTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_AUX1>::prefix());
239       bocl_mem *aux1   = opencl_cache->get_data(scene,*id, boxm2_data_traits<BOXM2_AUX1>::prefix(view_ident),info_buffer->data_buffer_length*auxTypeSize,false);
240       auxTypeSize = boxm2_data_info::datasize(boxm2_data_traits<BOXM2_AUX2>::prefix());
241       bocl_mem *aux2   = opencl_cache->get_data(scene,*id, boxm2_data_traits<BOXM2_AUX2>::prefix(view_ident),info_buffer->data_buffer_length*auxTypeSize,false);
242 
243       auxTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_AUX3>::prefix());
244       bocl_mem *aux3   = opencl_cache->get_data(scene,*id, boxm2_data_traits<BOXM2_AUX3>::prefix(view_ident),info_buffer->data_buffer_length*auxTypeSize,false);
245 
246       transfer_time += (float) transfer.all();
247       if (i==UPDATE_SEGLEN)
248       {
249         aux0->zero_gpu_buffer(queue);
250         aux1->zero_gpu_buffer(queue);
251         kern->set_arg( blk_info );
252         kern->set_arg( blk );
253         kern->set_arg( alpha );
254         kern->set_arg( aux0 );
255         kern->set_arg( aux1 );
256         kern->set_arg( lookup.ptr() );
257         kern->set_arg( ray_o_buff.ptr() );
258         kern->set_arg( ray_d_buff.ptr() );
259         kern->set_arg( tnearfar_mem_ptr.ptr() );
260         kern->set_arg( img_dim.ptr() );
261         kern->set_arg( in_image.ptr() );
262         kern->set_arg( cl_output.ptr() );
263         kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_uchar16) );//local tree,
264         kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_uchar4) ); //ray bundle,
265         kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_int) );    //cell pointers,
266         kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_float4) ); //cached aux,
267         kern->set_local_arg( local_threads[0]*local_threads[1]*10*sizeof(cl_uchar) ); //cumsum buffer, imindex buffer
268         //execute kernel
269         kern->execute(queue, 2, local_threads, global_threads);
270         int status = clFinish(queue);
271         if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
272           return false;
273         gpu_time += kern->exec_time();
274         //clear render kernel args so it can reset em on next execution
275         kern->clear_args();
276         aux0->read_to_buffer(queue);
277         aux1->read_to_buffer(queue);
278       }
279       else if (i==UPDATE_PREINF)
280       {
281 
282         kern->set_arg( blk_info );
283         kern->set_arg( blk );
284         kern->set_arg( alpha );
285         kern->set_arg( mog );
286         kern->set_arg( num_obs );
287         kern->set_arg( aux0 );
288         kern->set_arg( aux1 );
289         kern->set_arg( lookup.ptr() );
290         kern->set_arg( ray_o_buff.ptr() );
291         kern->set_arg( ray_d_buff.ptr() );
292         kern->set_arg( tnearfar_mem_ptr.ptr() );
293         kern->set_arg( img_dim.ptr() );
294         kern->set_arg( vis_image.ptr() );
295         kern->set_arg( pre_image.ptr() );
296         kern->set_arg( cl_output.ptr() );
297         kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_uchar16) );//local tree,
298         kern->set_local_arg( local_threads[0]*local_threads[1]*10*sizeof(cl_uchar) ); //cumsum buffer, imindex buffer
299         //execute kernel
300         kern->execute(queue, 2, local_threads, global_threads);
301         int status = clFinish(queue);
302         if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
303           return false;
304         gpu_time += kern->exec_time();
305         //clear render kernel args so it can reset em on next execution
306         kern->clear_args();
307         //write info to disk
308       }
309       else if (i==UPDATE_BAYES)
310       {
311       aux2->zero_gpu_buffer(queue);
312       aux3->zero_gpu_buffer(queue);
313         kern->set_arg( blk_info );
314         kern->set_arg( blk );
315         kern->set_arg( alpha );
316         kern->set_arg( mog );
317         kern->set_arg( num_obs );
318         kern->set_arg( aux0 );
319         kern->set_arg( aux1 );
320         kern->set_arg( aux2 );
321         kern->set_arg( aux3 );
322         kern->set_arg( lookup.ptr() );
323         kern->set_arg( ray_o_buff.ptr() );
324         kern->set_arg( ray_d_buff.ptr() );
325         kern->set_arg( tnearfar_mem_ptr.ptr() );
326         kern->set_arg( img_dim.ptr() );
327         kern->set_arg( vis_image.ptr() );
328         kern->set_arg( pre_image.ptr() );
329         kern->set_arg( norm_image.ptr() );
330         kern->set_arg( cl_output.ptr() );
331         kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_uchar16) );//local tree,
332         kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_short2) ); //ray bundle,
333         kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_int) );    //cell pointers,
334         kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_float) ); //cached aux,
335         kern->set_local_arg( local_threads[0]*local_threads[1]*10*sizeof(cl_uchar) ); //cumsum buffer, imindex buffer
336                 //execute kernel
337         kern->execute(queue, 2, local_threads, global_threads);
338         int status = clFinish(queue);
339         if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
340           return false;
341         gpu_time += kern->exec_time();
342         //clear render kernel args so it can reset em on next execution
343         kern->clear_args();
344 
345 
346       }
347       else if (i==CONVERT_AUX_INT_FLOAT)
348       {
349           std::size_t ltr[2];
350           ltr[0] = 64;
351           ltr[1] = 1;
352           std::size_t gt[2];
353           gt[0] = RoundUp(info_buffer->data_buffer_length,ltr[0]);
354           gt[1] = 1;
355           kern->set_arg( blk_info );
356           kern->set_arg( aux0 );
357           kern->set_arg( aux1 );
358           kern->set_arg( aux2 );
359           kern->set_arg( aux3 );
360 
361           //execute kernel
362           kern->execute(queue, 2, ltr, gt);
363           int status = clFinish(queue);
364           check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status));
365           gpu_time += kern->exec_time();
366 
367           //clear render kernel args so it can reset em on next execution
368           kern->clear_args();
369           //write info to disk
370           aux0->read_to_buffer(queue);
371           aux1->read_to_buffer(queue);
372           aux2->read_to_buffer(queue);
373           aux3->read_to_buffer(queue);
374 
375           opencl_cache->deep_remove_data(scene,*id,boxm2_data_traits<BOXM2_AUX0>::prefix(view_ident),true);
376           opencl_cache->deep_remove_data(scene,*id,boxm2_data_traits<BOXM2_AUX1>::prefix(view_ident),true);
377           opencl_cache->deep_remove_data(scene,*id,boxm2_data_traits<BOXM2_AUX2>::prefix(view_ident),true);
378           opencl_cache->deep_remove_data(scene,*id,boxm2_data_traits<BOXM2_AUX3>::prefix(view_ident),true);
379 
380       }
381       //read image out to buffer (from gpu)
382       in_image->read_to_buffer(queue);
383       vis_image->read_to_buffer(queue);
384       pre_image->read_to_buffer(queue);
385       cl_output->read_to_buffer(queue);
386       clFinish(queue);
387     }
388   }
389 
390   delete [] vis_buff;
391   delete [] pre_buff;
392   delete [] norm_buff;
393   delete [] input_buff;
394   delete [] ray_origins;
395   delete [] ray_directions;
396   opencl_cache->unref_mem(in_image.ptr());
397   opencl_cache->unref_mem(vis_image.ptr());
398   opencl_cache->unref_mem(pre_image.ptr());
399   opencl_cache->unref_mem(norm_image.ptr());
400   opencl_cache->unref_mem(ray_o_buff.ptr());
401   opencl_cache->unref_mem(ray_d_buff.ptr());
402   opencl_cache->unref_mem(tnearfar_mem_ptr.ptr());
403   std::cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<std::endl;
404   clReleaseCommandQueue(queue);
405   return true;
406 }
407 
408 
409 //Returns vector of color update kernels (and caches them per device
get_kernels(const bocl_device_sptr & device,const std::string & opts)410 std::vector<bocl_kernel*>& boxm2_ocl_update_auxQ::get_kernels(const bocl_device_sptr& device, const std::string& opts)
411 {
412   // compile kernels if not already compiled
413   std::string identifier = device->device_identifier() + opts;
414   if (kernels_.find(identifier) != kernels_.end())
415     return kernels_[identifier];
416 
417   //otherwise compile the kernels
418   std::cout<<"=== boxm2_ocl_update_process::compiling kernels on device "<<identifier<<"==="<<std::endl;
419 
420   std::vector<std::string> src_paths;
421   std::string source_dir = boxm2_ocl_util::ocl_src_root();
422   src_paths.push_back(source_dir + "scene_info.cl");
423   src_paths.push_back(source_dir + "pixel_conversion.cl");
424   src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
425   src_paths.push_back(source_dir + "backproject.cl");
426   src_paths.push_back(source_dir + "atomics_util.cl");
427   src_paths.push_back(source_dir + "statistics_library_functions.cl");
428   src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
429     src_paths.push_back(source_dir + "bit/update_kernels.cl");
430   std::vector<std::string> non_ray_src = std::vector<std::string>(src_paths);
431 
432     src_paths.push_back(source_dir + "update_functors.cl");
433   src_paths.push_back(source_dir + "bit/cast_ray_bit.cl");
434 
435   //compilation options
436   const std::string& options = /*"-D ATOMIC_FLOAT " +*/ opts;
437 
438   //populate vector of kernels
439   std::vector<bocl_kernel*> vec_kernels;
440 
441   //seg len pass
442   auto* seg_len = new bocl_kernel();
443   std::string seg_opts = options + " -D SEGLEN  -D STEP_CELL=step_cell_seglen(aux_args,data_ptr,llid,d)";
444   seg_len->create_kernel(&device->context(), device->device_id(), src_paths, "seg_len_main", seg_opts, "update::seg_len");
445   vec_kernels.push_back(seg_len);
446 
447 
448   auto* pre_inf = new bocl_kernel();
449   std::string pre_opts = options + " -D PREINF  -D STEP_CELL=step_cell_preinf(aux_args,data_ptr,llid,d)";
450   pre_inf->create_kernel(&device->context(), device->device_id(), src_paths, "pre_inf_main", pre_opts, "update::pre_inf");
451   vec_kernels.push_back(pre_inf);
452 
453   //may need DIFF LIST OF SOURCES FOR THIS GUY
454   auto* proc_img = new bocl_kernel();
455   std::string proc_opts = options + " -D PROC_NORM ";
456   proc_img->create_kernel(&device->context(), device->device_id(), non_ray_src, "proc_norm_image", proc_opts, "update::proc_norm_image");
457   vec_kernels.push_back(proc_img);
458 
459   //push back cast_ray_bit
460   auto* bayes_main = new bocl_kernel();
461   std::string bayes_opt = options + " -D BAYES  -D STEP_CELL=step_cell_bayes(aux_args,data_ptr,llid,d)";
462   bayes_main->create_kernel(&device->context(), device->device_id(), src_paths, "bayes_main", bayes_opt, "update::bayes_main");
463   vec_kernels.push_back(bayes_main);
464 
465   std::vector<std::string> src_paths_4;
466   src_paths_4.push_back(source_dir + "scene_info.cl");
467   src_paths_4.push_back(source_dir + "bit/batch_update_kernels.cl");
468   //convert aux buffer int values to float (just divide by SEGLENFACTOR
469   auto* convert_aux_int_float = new bocl_kernel();
470   convert_aux_int_float->create_kernel(&device->context(),device->device_id(), src_paths_4, "convert_aux_int_to_float", opts+" -D CONVERT_AUX ", "batch_update::convert_aux_int_to_float");
471   vec_kernels.push_back(convert_aux_int_float);
472 
473   //store and return
474   kernels_[identifier] = vec_kernels;
475   return kernels_[identifier];
476 }
477 //makes sure appearance types correspond correctly
validate_appearances(const boxm2_scene_sptr & scene,std::string & data_type,int & appTypeSize,std::string & num_obs_type,std::string & options,bool &)478 bool boxm2_ocl_update_auxQ::validate_appearances(const boxm2_scene_sptr& scene,
479                                             std::string& data_type,
480                                             int& appTypeSize,
481                                             std::string& num_obs_type,
482                                             std::string& options,
483                                             bool&  /*isRGB*/)
484 {
485   std::vector<std::string> apps = scene->appearances();
486   bool foundDataType = false, foundNumObsType = false;
487   for (const auto & app : apps) {
488     if ( app == boxm2_data_traits<BOXM2_MOG3_GREY>::prefix() )
489     {
490       data_type = app;
491       foundDataType = true;
492       options=" -D MOG_TYPE_8";
493       appTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_MOG3_GREY>::prefix());
494     }
495     else if ( app == boxm2_data_traits<BOXM2_MOG3_GREY_16>::prefix() )
496     {
497       data_type = app;
498       foundDataType = true;
499       options=" -D MOG_TYPE_16";
500       appTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_MOG3_GREY_16>::prefix());
501     }
502     else if ( app == boxm2_data_traits<BOXM2_NUM_OBS>::prefix() )
503     {
504       num_obs_type = app;
505       foundNumObsType = true;
506     }
507   }
508   if (!foundDataType) {
509     std::cout<<"BOXM2_OPENCL_UPDATE_PROCESS ERROR: scene doesn't have BOXM2_MOG3_GREY or BOXM2_MOG3_GREY_16 data type"<<std::endl;
510     return false;
511   }
512   if (!foundNumObsType) {
513     std::cout<<"BOXM2_OPENCL_UPDATE_PROCESS ERROR: scene doesn't have BOXM2_NUM_OBS type"<<std::endl;
514     return false;
515   }
516   return true;
517 }
518 
519 
//: Static kernel map of boxm2_ocl_update_PusingQ (same type as boxm2_ocl_update_auxQ::kernels_).
// NOTE(review): the get_kernels() that fills it is defined outside this part of the file;
// presumably it is keyed the same way (device identifier + options) - confirm.
std::map<std::string, std::vector<bocl_kernel*> > boxm2_ocl_update_PusingQ::kernels_;
521 
init_product(boxm2_scene_sptr scene,const boxm2_cache_sptr & cache)522 bool boxm2_ocl_update_PusingQ::init_product(boxm2_scene_sptr scene, const boxm2_cache_sptr& cache)
523 {
524     std::vector<boxm2_block_id> vis_order = scene->get_block_ids();
525     std::vector<boxm2_block_id>::iterator id;
526     for (id = vis_order.begin(); id != vis_order.end(); ++id)
527     {
528         boxm2_data_base *  aux3 = cache->get_data_base(scene, *id,boxm2_data_traits<BOXM2_AUX3>::prefix(),0,false);
529         auto *   aux3_data = reinterpret_cast<boxm2_data_traits<BOXM2_AUX3>::datatype*> ( aux3->data_buffer());
530         std::fill_n(aux3_data,aux3->buffer_length()/boxm2_data_traits<BOXM2_AUX3>::datasize(),1);
531         cache->remove_data_base(scene,*id,boxm2_data_traits<BOXM2_AUX3>::prefix());
532     }
533     return true;
534 }
535 
accumulate_product(const boxm2_scene_sptr & scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,const std::string & identifier)536 bool boxm2_ocl_update_PusingQ::accumulate_product(const boxm2_scene_sptr&         scene,
537                                                   const bocl_device_sptr&         device,
538                                                   const boxm2_opencl_cache_sptr&  opencl_cache,
539                                                   const std::string& identifier)
540 {
541   float transfer_time=0.0f;
542   float gpu_time=0.0f;
543   //cache size sanity check
544   std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
545   std::cout<<"Update MBs in cache: "<<binCache/(1024.0*1024.0)<<std::endl;
546   // create a command queue.
547   int status=0;
548   cl_command_queue queue = clCreateCommandQueue( device->context(),*(device->device_id()),CL_QUEUE_PROFILING_ENABLE,&status);
549   if (status!=0)
550       return false;
551 
552   // compile the kernel if not already compiled
553   std::vector<bocl_kernel*>& kernels = get_kernels(device,"");
554   // bit lookup buffer
555   cl_uchar lookup_arr[256];
556   boxm2_ocl_util::set_bit_lookup(lookup_arr);
557   bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
558   lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
559   // set arguments
560   std::vector<boxm2_block_id> vis_order = scene->get_block_ids();
561   std::vector<boxm2_block_id>::iterator id;
562   bocl_kernel * kern=kernels[0];
563   for (id = vis_order.begin(); id != vis_order.end(); ++id)
564   {
565       //choose correct render kernel
566       boxm2_block_metadata mdata = scene->get_block_metadata(*id);
567       //write the image values to the buffer
568       vul_timer transfer;
569       bocl_mem* blk       = opencl_cache->get_block(scene,*id);
570       bocl_mem* blk_info  = opencl_cache->loaded_block_info();
571       bocl_mem* alpha     = opencl_cache->get_data<BOXM2_ALPHA>(scene,*id,0,false);
572       auto* info_buffer = (boxm2_scene_info*) blk_info->cpu_buffer();
573       int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
574       info_buffer->data_buffer_length = (int) (alpha->num_bytes()/alphaTypeSize);
575       blk_info->write_to_buffer((queue));
576       //grab an appropriately sized AUX data buffer
577       int auxTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_AUX3>::prefix());
578       bocl_mem *aux3_curr   = opencl_cache->get_data(scene,*id, boxm2_data_traits<BOXM2_AUX3>::prefix(identifier+"_curr"),info_buffer->data_buffer_length*auxTypeSize,false);
579       bocl_mem *aux3_prev   = opencl_cache->get_data(scene,*id, boxm2_data_traits<BOXM2_AUX3>::prefix(identifier+"_prev"),info_buffer->data_buffer_length*auxTypeSize,false);
580       bocl_mem *aux0_curr   = opencl_cache->get_data(scene,*id, boxm2_data_traits<BOXM2_AUX0>::prefix(identifier+"_curr"),info_buffer->data_buffer_length*auxTypeSize,false);
581       transfer_time += (float) transfer.all();
582       //set workspace
583       std::size_t ltr[] = {64};
584       std::size_t gtr[] = { RoundUp(info_buffer->data_buffer_length, ltr[0]) };
585       kern->set_arg( blk_info );
586       kern->set_arg( aux3_prev );
587       kern->set_arg( aux3_curr );
588       kern->set_arg( aux0_curr );
589       kern->set_arg( alpha );
590       //execute kernel
591       kern->execute(queue, 1, ltr, gtr);
592       int status = clFinish(queue);
593       check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status));
594       gpu_time += kern->exec_time();
595       //clear render kernel args so it can reset em on next execution
596       kern->clear_args();
597       aux3_prev->read_to_buffer(queue);
598       alpha->read_to_buffer(queue);
599       opencl_cache->deep_remove_data(scene,*id,boxm2_data_traits<BOXM2_AUX3>::prefix(identifier+"_prev"),true);
600       opencl_cache->deep_remove_data(scene,*id,boxm2_data_traits<BOXM2_AUX3>::prefix(identifier+"_curr"),false);
601   }
602   //read image out to buffer (from gpu)
603   clFinish(queue);
604   std::cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<std::endl;
605   clReleaseCommandQueue(queue);
606   return true;
607 }
compute_probability(const boxm2_scene_sptr & scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache)608 bool boxm2_ocl_update_PusingQ::compute_probability(const boxm2_scene_sptr&         scene,
609                                                    const bocl_device_sptr&         device,
610                                                    const boxm2_opencl_cache_sptr&  opencl_cache)
611 
612 {
613   float transfer_time=0.0f;
614   float gpu_time=0.0f;
615   //cache size sanity check
616   std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
617   std::cout<<"Update MBs in cache: "<<binCache/(1024.0*1024.0)<<std::endl;
618   // create a command queue.
619   int status=0;
620   cl_command_queue queue = clCreateCommandQueue( device->context(),*(device->device_id()),CL_QUEUE_PROFILING_ENABLE,&status);
621   if (status!=0)
622       return false;
623 
624 
625 
626   // compile the kernel if not already compiled
627   std::vector<bocl_kernel*>& kernels = get_kernels(device,"");
628   // bit lookup buffer
629   cl_uchar lookup_arr[256];
630   boxm2_ocl_util::set_bit_lookup(lookup_arr);
631   bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
632   lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
633   // set arguments
634   std::vector<boxm2_block_id> vis_order = scene->get_block_ids();
635   std::vector<boxm2_block_id>::iterator id;
636   bocl_kernel * kern=kernels[1];
637   for (id = vis_order.begin(); id != vis_order.end(); ++id)
638   {
639       //choose correct render kernel
640       boxm2_block_metadata mdata = scene->get_block_metadata(*id);
641       float pinit_buf[1];
642       pinit_buf[0] = mdata.p_init_;
643       bocl_mem * pinit=new bocl_mem(device->context(), pinit_buf, sizeof(float), "pinit");
644       pinit->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
645       //write the image values to the buffer
646       vul_timer transfer;
647       bocl_mem* blk       = opencl_cache->get_block(scene,*id);
648       bocl_mem* blk_info  = opencl_cache->loaded_block_info();
649       bocl_mem* alpha     = opencl_cache->get_data<BOXM2_ALPHA>(scene,*id,0,false);
650       auto* info_buffer = (boxm2_scene_info*) blk_info->cpu_buffer();
651       int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
652       // check for invalid parameters
653       if( alphaTypeSize == 0 ) //This should never happen, it will result in division by zero later
654       {
655           std::cout << "ERROR: alphaTypeSize == 0 in " << __FILE__ << __LINE__ << std::endl;
656           return false;
657       }
658 
659       info_buffer->data_buffer_length = (int) (alpha->num_bytes()/alphaTypeSize);
660       blk_info->write_to_buffer((queue));
661       //grab an appropriately sized AUX data buffer
662       int auxTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_AUX3>::prefix());
663       bocl_mem *aux3_product   = opencl_cache->get_data(scene,*id, boxm2_data_traits<BOXM2_AUX3>::prefix(),info_buffer->data_buffer_length*auxTypeSize,true);
664       transfer_time += (float) transfer.all();
665 
666       //set workspace
667       std::size_t ltr[] = {4, 4, 4};
668       std::size_t gtr[] = { RoundUp(mdata.sub_block_num_.x(), ltr[0]),
669                            RoundUp(mdata.sub_block_num_.y(), ltr[1]),
670                            RoundUp(mdata.sub_block_num_.z(), ltr[2])};
671 
672       kern->set_arg( blk_info );
673       kern->set_arg( blk );
674       kern->set_arg( alpha );
675       kern->set_arg( aux3_product );
676       kern->set_arg( pinit );
677       kern->set_arg( lookup.ptr() );
678       kern->set_local_arg( ltr[0]*ltr[1]*ltr[2]*10*sizeof(cl_uchar) );
679       kern->set_local_arg( ltr[0]*ltr[1]*ltr[2]*sizeof(cl_uchar16) );
680       //execute kernel
681       kern->execute(queue, 3, ltr, gtr);
682       int status = clFinish(queue);
683       check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status));
684       gpu_time += kern->exec_time();
685 
686       //clear render kernel args so it can reset em on next execution
687       kern->clear_args();
688 
689       alpha->read_to_buffer(queue);
690       clFinish(queue);
691       //pinit->release_memory();
692 
693       //delete pinit;
694       //opencl_cache->deep_remove_data(*id,boxm2_data_traits<BOXM2_ALPHA>::prefix(),true);
695   }
696   //read image out to buffer (from gpu)
697   clFinish(queue);
698   std::cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<std::endl;
699   clReleaseCommandQueue(queue);
700 
701   return true;
702 }
703 
get_kernels(const bocl_device_sptr & device,const std::string & opts)704 std::vector<bocl_kernel*>& boxm2_ocl_update_PusingQ::get_kernels(const bocl_device_sptr& device, const std::string& opts)
705 {
706   // compile kernels if not already compiled
707   std::string identifier = device->device_identifier() + opts;
708   if (kernels_.find(identifier) != kernels_.end())
709     return kernels_[identifier];
710 
711   //otherwise compile the kernels
712   std::cout<<"=== boxm2_ocl_update_auxQ_process::compiling kernels on device "<<identifier<<"==="<<std::endl;
713   std::vector<std::string> src_paths;
714   std::string source_dir = boxm2_ocl_util::ocl_src_root();
715   src_paths.push_back(source_dir + "scene_info.cl");
716   src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
717   src_paths.push_back(source_dir + "bit/update_kernels.cl");
718   std::vector<std::string> non_ray_src = std::vector<std::string>(src_paths);
719 
720   //populate vector of kernels
721   std::vector<bocl_kernel*> vec_kernels;
722 
723 
724   //push back cast_ray_bit
725   auto* apply_beta = new bocl_kernel();
726   std::string apply_beta_opts = opts + " -D APPLYBETA";
727   apply_beta->create_kernel(&device->context(), device->device_id(), non_ray_src, "apply_beta", apply_beta_opts, "update::apply_beta");
728   vec_kernels.push_back(apply_beta);
729 
730   auto* compute_product_Q = new bocl_kernel();
731   std::string product_q = opts + " -D PRODUCTQ";
732   compute_product_Q->create_kernel(&device->context(), device->device_id(), non_ray_src, "compute_product_Q", product_q, "update::compute_product_Q");
733   vec_kernels.push_back(compute_product_Q);
734   auto* update_P = new bocl_kernel();
735   std::string update_q_opts = opts + " -D UPDATEP";
736   update_P->create_kernel(&device->context(), device->device_id(), non_ray_src, "update_P_using_Q", update_q_opts, "update::update_P_using_Q");
737   vec_kernels.push_back(update_P);
738 
739   //store and return
740   kernels_[identifier] = vec_kernels;
741   return kernels_[identifier];
742 }
743