1 #include <iostream>
2 #include <algorithm>
3 #include <sstream>
4 #include <iomanip>
5 #include "boxm2_multi_pre_vis_inf.h"
6 #include <boxm2_multi_util.h>
7 
8 #ifdef _MSC_VER
9 #  include "vcl_msvc_warnings.h"
10 #endif
11 #include <boxm2/boxm2_scene.h>
12 #include <boxm2/boxm2_util.h>
13 #include <bocl/bocl_manager.h>
14 #include <boxm2/ocl/boxm2_ocl_util.h>
15 #include <boxm2/ocl/boxm2_opencl_cache1.h>
16 #include <boxm2/ocl/algo/boxm2_ocl_camera_converter.h>
17 
18 #include <bocl/bocl_mem.h>
19 #include <bocl/bocl_device.h>
20 #include <bocl/bocl_kernel.h>
21 #include <brdb/brdb_value.h>
22 #include <brdb/brdb_selection.h>
23 #include <bprb/bprb_batch_process_manager.h>
24 #include <bprb/bprb_parameters.h>
25 #include <bprb/bprb_macros.h>
26 #include <bprb/bprb_func_process.h>
27 #include "vil/vil_image_view.h"
28 #include "vil/vil_save.h"
29 #include "vpgl/vpgl_camera_double_sptr.h"
30 #include "vul/vul_timer.h"
31 
32 std::map<std::string, std::vector<bocl_kernel*> > boxm2_multi_pre_vis_inf::kernels_;
33 
34 
35 //-------------------------------------------------------------
36 // pre_vis_inf
37 //-------------------------------------------------------------
pre_vis_inf(boxm2_multi_cache & cache,const vil_image_view<float> & img,const vpgl_camera_double_sptr & cam,float * norm_img,boxm2_multi_update_helper & helper)38 float boxm2_multi_pre_vis_inf::pre_vis_inf( boxm2_multi_cache&              cache,
39                                             const vil_image_view<float>&    img,
40                                             const vpgl_camera_double_sptr&         cam,
41                                             float*                          norm_img,
42                                             boxm2_multi_update_helper&      helper)
43 {
44   std::cout<<"  -- boxm2_pre_vis_inf map --"<<std::endl;
45   //verify appearance model
46   std::size_t lthreads[2] = {8,8};
47   std::string data_type, options;
48   int apptypesize;
49   if ( !boxm2_multi_util::get_scene_appearances(cache.get_scene(), data_type, options, apptypesize) )
50     return 0.0f;
51 
52   //setup image size
53   int ni=img.ni(),
54       nj=img.nj();
55   unsigned cl_ni=RoundUp(ni,lthreads[0]);
56   unsigned cl_nj=RoundUp(nj,lthreads[1]);
57   std::size_t gThreads[] = {cl_ni,cl_nj};
58 
59   //vis inf and pre inf buffers
60   auto* visImg = new float[cl_ni*cl_nj];
61   auto* preImg = new float[cl_ni*cl_nj];
62   std::fill(visImg, visImg+cl_ni*cl_nj, 1.0f);
63   std::fill(preImg, preImg+cl_ni*cl_nj, 0.0f);
64 
65   //-------------------------------------------------------
66   //prepare buffers for each device
67   //-------------------------------------------------------
68   std::vector<cl_command_queue>& queues = helper.queues_;
69   std::vector<bocl_mem_sptr>& out_imgs = helper.outputs_,
70                              img_dims = helper.img_dims_,
71                              ray_ds = helper.ray_ds_,
72                              ray_os = helper.ray_os_,
73                              lookups = helper.lookups_,
74                              tnearfarptrs=  helper.tnearfarptrs_;
75   std::vector<boxm2_opencl_cache1*>& ocl_caches = helper.vis_caches_;
76   std::vector<bocl_mem_sptr> vis_mems, pre_mems, visInfMems, preInfMems;
77   for (auto ocl_cache : ocl_caches) {
78     //grab sub scene and it's cache
79     //pre/vis images
80     auto* vis_buff = new float[cl_ni*cl_nj];
81     std::fill(vis_buff, vis_buff+cl_ni*cl_nj, 1.0f);
82     bocl_mem_sptr vis_image = ocl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), vis_buff,"vis image buffer");
83     vis_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
84     vis_mems.push_back(vis_image);
85 
86     auto* pre_buff = new float[cl_ni*cl_nj];
87     std::fill(pre_buff, pre_buff+cl_ni*cl_nj, 0.0f);
88     bocl_mem_sptr pre_image = ocl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), pre_buff,"pre image buffer");
89     pre_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
90     pre_mems.push_back(pre_image);
91   }
92 
93   //initialize per group images (vis/pre)
94   std::vector<boxm2_multi_cache_group*> grp = helper.group_orders_; //cache.get_vis_groups(cam);
95   vul_timer t; t.mark();
96   float gpu_time = 0.0f, cpu_time = 0.0f;
97 
98   //----------------------------------------------------------------
99   // Call per block/per scene update (to ensure cpu-> gpu cache works
100   //---------------------------------------------------------------
101   for (auto & grpId : grp)
102   {
103     boxm2_multi_cache_group& group = *grpId;
104     std::vector<boxm2_block_id>& ids = group.ids();
105     std::vector<int> indices = group.order_from_cam(cam);
106     for (int i : indices) {
107       //grab sub scene and it's cache
108       boxm2_opencl_cache1* ocl_cache = ocl_caches[i];
109       boxm2_scene_sptr    sub_scene = ocl_cache->get_scene();
110       bocl_device_sptr    device    = ocl_cache->get_device();
111 
112       // compile the kernel/retrieve cached kernel for this device
113       std::vector<bocl_kernel*> kerns = get_kernels(device, options);
114 
115       //Run block store aux
116       boxm2_block_id id = ids[i]; //vis_order[blk];
117 
118       //set visibility to one, set pre to zero
119       vis_mems[i]->fill(queues[i], 1.0f, "float");
120       pre_mems[i]->zero_gpu_buffer(queues[i]);
121       cpu_time += pre_vis_per_block(id, sub_scene, ocl_cache, queues[i], data_type, kerns[0],
122                                     vis_mems[i], pre_mems[i], img_dims[i],
123                                     ray_os[i], ray_ds[i],tnearfarptrs[i],
124                                     out_imgs[i], lookups[i], lthreads, gThreads);
125     }
126 
127     //finish queues before moving on
128     for (int i : indices) {
129 
130 #if 1
131 
132       vul_timer cpuTimer; cpuTimer.mark();
133       //first store vis/pre images for first member of group
134       std::memcpy(group.get_vis(i), visImg, cl_ni*cl_nj*sizeof(float));
135       std::memcpy(group.get_pre(i), preImg, cl_ni*cl_nj*sizeof(float));
136       //next update the vis and pre images
137       clFinish(queues[i]);
138 
139       vis_mems[i]->read_to_buffer(queues[i]);
140       pre_mems[i]->read_to_buffer(queues[i]);
141       auto* v = (float*) vis_mems[i]->cpu_buffer();
142       auto* p = (float*) pre_mems[i]->cpu_buffer();
143       for (int jj=(int)0; jj<(int)cl_nj; ++jj)
144         for (int ii=(int)0; ii<(int)cl_ni; ++ii) {
145           int index = jj*cl_ni + ii;
146       preImg[index]  = preImg[index] + p[index]*visImg[index];
147           visImg[index] *= v[index];
148         }
149       cpu_time += cpuTimer.all();
150 #else
151 
152       //first store vis/pre images for first member of group
153       float* p = (float*) preInfMems[i]->enqueue_map(queues[i]);
154       float* v = (float*) visInfMems[i]->enqueue_map(queues[i]);
155       clFinish(queues[i]);
156       std::memcpy(group.get_vis(i), v, cl_ni*cl_nj*sizeof(float));
157       std::memcpy(group.get_pre(i), p/*preImg*/, cl_ni*cl_nj*sizeof(float));
158 
159       bocl_device_sptr device = ocl_caches[i]->get_device();
160       bocl_kernel*     kern   = get_kernels(device, options)[2];
161 
162       preInfMems[i]->enqueue_unmap(queues[i], p);
163       visInfMems[i]->enqueue_unmap(queues[i], v);
164       kern->set_arg( preInfMems[i].ptr() );
165       kern->set_arg( visInfMems[i].ptr() );
166       kern->set_arg( pre_mems[i].ptr() );
167       kern->set_arg( vis_mems[i].ptr() );
168       kern->set_arg( img_dims[i].ptr() );
169       kern->execute(queues[i], 2, lthreads, gThreads);
170       clFinish(queues[i]);
171       kern->clear_args();
172 #endif
173     }
174 
175 
176 
177   }
178   gpu_time = t.all();
179 
180   t.mark();
181   //---- This instead of the reduce step ----
182   //Norm image create on CPU
183   for (unsigned int c=0; c<cl_ni*cl_nj; ++c)
184     norm_img[c] = visImg[c] + preImg[c];
185 
186   //grab accurate GPU time (includes transfers)
187   cpu_time += t.all();
188   gpu_time -= cpu_time;
189 
190 #if 1
191   vil_image_view<float> nimg(ni,nj), vimg(ni,nj), pimg(ni,nj);
192   int c=0;
193 
194   for (size_t j=0; j<cl_nj; ++j)
195     for (size_t i=0; i<cl_ni; ++i)
196       {
197       if ( i < ni && j < nj )
198         {
199         nimg(i,j) = norm_img[c];
200         vimg(i,j) = visImg[c];
201         pimg(i,j) = preImg[c];
202         }
203       c++;
204     }
205   vil_save(nimg, "e:/norm_image.tiff");
206   vil_save(vimg, "e:/vis_image.tiff");
207   vil_save(pimg, "e:/pre_image.tiff");
208 #endif
209 
210   //-------------------------------------
211   //clean up
212   //-------------------------------------
213   delete[] visImg;
214   delete[] preImg;
215 
216   for (unsigned int i=0; i<queues.size(); ++i) {
217     boxm2_opencl_cache1* ocl_cache = ocl_caches[i];
218     auto* v = (float*) vis_mems[i]->cpu_buffer();
219     auto* p = (float*) pre_mems[i]->cpu_buffer();
220     delete[] v;
221     delete[] p;
222 
223     //free vis mem, pre mem
224     ocl_cache->unref_mem(vis_mems[i].ptr());
225     ocl_cache->unref_mem(pre_mems[i].ptr());
226   }
227   return gpu_time;
228 }
229 
230 
pre_vis_per_block(const boxm2_block_id & id,const boxm2_scene_sptr & scene,boxm2_opencl_cache1 * opencl_cache,cl_command_queue & queue,const std::string & data_type,bocl_kernel * kern,bocl_mem_sptr & vis_image,bocl_mem_sptr & pre_image,bocl_mem_sptr & img_dim,bocl_mem_sptr & ray_o_buff,bocl_mem_sptr & ray_d_buff,bocl_mem_sptr & tnearfarptr,bocl_mem_sptr & cl_output,bocl_mem_sptr & lookup,std::size_t * lthreads,std::size_t * gThreads)231 float boxm2_multi_pre_vis_inf::pre_vis_per_block(const boxm2_block_id& id,
232                                                  const boxm2_scene_sptr&      scene,
233                                                  boxm2_opencl_cache1*   opencl_cache,
234                                                  cl_command_queue&     queue,
235                                                  const std::string&            data_type,
236                                                  bocl_kernel*          kern,
237                                                  bocl_mem_sptr&        vis_image,
238                                                  bocl_mem_sptr&        pre_image,
239                                                  bocl_mem_sptr&        img_dim,
240                                                  bocl_mem_sptr&        ray_o_buff,
241                                                  bocl_mem_sptr&        ray_d_buff,
242                                                  bocl_mem_sptr&        tnearfarptr,
243                                                  bocl_mem_sptr&        cl_output,
244                                                  bocl_mem_sptr&        lookup,
245                                                  std::size_t*           lthreads,
246                                                  std::size_t*           gThreads)
247 {
248   vul_timer ttime; ttime.mark();
249 
250   //choose correct render kernel
251   boxm2_block_metadata mdata = scene->get_block_metadata(id);
252 
253   //write the image values to the buffer
254   bocl_mem* blk       = opencl_cache->get_block(id);
255   bocl_mem* blk_info  = opencl_cache->loaded_block_info();
256   bocl_mem* alpha     = opencl_cache->get_data<BOXM2_ALPHA>(id,0,false);
257 
258   //calc data buffer length (write it in blk_info)
259   auto dataLen = (std::size_t) (alpha->num_bytes()/boxm2_data_traits<BOXM2_ALPHA>::datasize());
260   auto* info_buffer = (boxm2_scene_info*) blk_info->cpu_buffer();
261   info_buffer->data_buffer_length = (int) dataLen;
262   blk_info->write_to_buffer(queue);
263 
264   //grab MOG
265   int apptypesize = (int) boxm2_data_info::datasize(data_type);
266   bocl_mem* mog  = opencl_cache->get_data(id,data_type, dataLen*apptypesize,false);
267 
268   //grab NumObs
269   std::string numObsType = boxm2_data_traits<BOXM2_NUM_OBS>::prefix();
270   int nobsTypeSize = (int) boxm2_data_info::datasize(numObsType);
271   bocl_mem* num_obs = opencl_cache->get_data(id,numObsType, dataLen*nobsTypeSize,false);
272 
273   //grab an appropriately sized AUX data buffer
274   bocl_mem *aux0 = opencl_cache->get_data<BOXM2_AUX0>(id, dataLen*boxm2_data_traits<BOXM2_AUX0>::datasize());
275   bocl_mem *aux1 = opencl_cache->get_data<BOXM2_AUX1>(id, dataLen*boxm2_data_traits<BOXM2_AUX1>::datasize());
276 
277   //--------- set args and execute ------------
278   kern->set_arg( blk_info );
279   kern->set_arg( blk );
280   kern->set_arg( alpha );
281   kern->set_arg( mog );
282   kern->set_arg( num_obs );
283   kern->set_arg( aux0 );
284   kern->set_arg( aux1 );
285   kern->set_arg( lookup.ptr() );
286   kern->set_arg( ray_o_buff.ptr() );
287   kern->set_arg( ray_d_buff.ptr() );
288   kern->set_arg( tnearfarptr.ptr() );
289   kern->set_arg( img_dim.ptr() );
290   kern->set_arg( vis_image.ptr() );
291   kern->set_arg( pre_image.ptr() );
292   kern->set_arg( cl_output.ptr() );
293   kern->set_local_arg( lthreads[0]*lthreads[1]*sizeof(cl_uchar16) );//local tree,
294   kern->set_local_arg( lthreads[0]*lthreads[1]*10*sizeof(cl_uchar) ); //cumsum buffer, imindex buffer
295   float transfer_time = ttime.all();
296 
297 
298   //execute kernel
299   kern->execute(queue, 2, lthreads, gThreads);
300   //clear render kernel args so it can reset em on next execution
301   kern->clear_args();
302   return transfer_time;
303 }
304 
305 
306 
307 //-----------------------------------------------------------------
308 // returns vector of bocl_kernels for this specific device
309 //-----------------------------------------------------------------
get_kernels(const bocl_device_sptr & device,const std::string & opts)310 std::vector<bocl_kernel*>& boxm2_multi_pre_vis_inf::get_kernels(const bocl_device_sptr& device, const std::string& opts)
311 {
312   // check to see if this device has compiled kernels already
313   std::string identifier = device->device_identifier()+opts;
314   if (kernels_.find(identifier) != kernels_.end())
315     return kernels_[identifier];
316 
317   //if not, compile and cache them
318   std::cout<<"===========Compiling multi update kernels===========\n"
319           <<"  for device: "<<device->device_identifier()<<std::endl;
320 
321   //gather all render sources... seems like a lot for rendering...
322   std::vector<std::string> src_paths;
323   std::string source_dir = boxm2_ocl_util::ocl_src_root();
324   src_paths.push_back(source_dir + "scene_info.cl");
325   src_paths.push_back(source_dir + "cell_utils.cl");
326   src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
327   src_paths.push_back(source_dir + "backproject.cl");
328   src_paths.push_back(source_dir + "statistics_library_functions.cl");
329   src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
330   src_paths.push_back(source_dir + "bit/update_kernels.cl");
331 
332   std::vector<std::string> non_ray_src = std::vector<std::string>(src_paths);
333   src_paths.push_back(source_dir + "update_functors.cl");
334   src_paths.push_back(source_dir + "bit/cast_ray_bit.cl");
335 
336   //compilation options
337   std::string options = opts+"";
338   //create all passes
339   auto* pre_inf = new bocl_kernel();
340   std::string pre_opts = options + " -D PREINF -D STEP_CELL=step_cell_preinf(aux_args,data_ptr,llid,d) ";
341   pre_inf->create_kernel(&device->context(),device->device_id(), src_paths, "pre_inf_main", pre_opts, "update::pre_inf");
342 
343   //may need DIFF LIST OF SOURCES FOR THIS GUY
344   auto* proc_img = new bocl_kernel();
345   std::string proc_opts = options + " -D PROC_NORM ";
346   proc_img->create_kernel(&device->context(),device->device_id(), non_ray_src, "proc_norm_image", proc_opts, "update::proc_norm_image");
347 
348   auto* combine_pre_vis = new bocl_kernel();
349   std::string comb_opts = options + " -D COMBINE_PRE_VIS ";
350   combine_pre_vis->create_kernel(&device->context(), device->device_id(), non_ray_src, "combine_pre_vis", comb_opts, "update::combine_pre_vis");
351 
352   //vector of kernels:
353   std::vector<bocl_kernel*> kerns(3);
354   kerns[0] = pre_inf;
355   kerns[1] = proc_img;
356   kerns[2] = combine_pre_vis;
357 
358   //cache in map
359   kernels_[identifier] = kerns;
360   return kernels_[identifier];
361 }
362 
363 
write_imgs_out(std::map<bocl_device *,float * > & img_map,int ni,int nj,const std::string & name)364 void boxm2_multi_pre_vis_inf::write_imgs_out(std::map<bocl_device*, float*>& img_map, int ni, int nj, const std::string& name)
365 {
366   std::map<bocl_device*, float*>::iterator iter;
367   for (iter=img_map.begin(); iter!=img_map.end(); ++iter) {
368     float* img = iter->second;
369     int count=0;
370     vil_image_view<float> outImg(ni,nj);
371     for (int j=0; j<nj; ++j)
372       for (int i=0; i<ni; ++i)
373         outImg(i,j) = img[count++];
374 
375     std::string outName = name + iter->first->device_identifier() + ".tiff";
376     vil_save(outImg, outName.c_str());
377   }
378 }
379