1 // This is brl/bseg/boxm2/ocl/pro/processes/boxm2_ocl_render_expected_z_image_process.cxx
2 //:
3 // \file
4 // \brief  A process for rendering the scene.
5 //
6 // \author Daniel Crispell
7 // \date November 11, 2011
8 
9 #include <fstream>
10 #include <iostream>
11 #include <algorithm>
12 #include <bprb/bprb_func_process.h>
13 
14 #ifdef _MSC_VER
15 #  include "vcl_msvc_warnings.h"
16 #endif
17 #include <boxm2/ocl/boxm2_opencl_cache.h>
18 #include <boxm2/boxm2_scene.h>
19 #include <boxm2/boxm2_block.h>
20 #include <boxm2/boxm2_data_base.h>
21 #include <boxm2/ocl/boxm2_ocl_util.h>
22 #include "vil/vil_image_view.h"
23 //brdb stuff
24 #include <brdb/brdb_value.h>
25 
26 //directory utility
27 #include <vcl_where_root_dir.h>
28 #include <bocl/bocl_device.h>
29 #include <bocl/bocl_kernel.h>
30 #include <boxm2/ocl/algo/boxm2_ocl_render_expected_image_function.h>
31 #include "vul/vul_timer.h"
32 
33 #include <boxm2/ocl/algo/boxm2_ocl_camera_converter.h>
34 
35 namespace boxm2_ocl_render_expected_z_image_process_globals
36 {
37   constexpr unsigned n_inputs_ = 7;
38   constexpr unsigned n_outputs_ = 2;
39   std::size_t lthreads[2]={8,8};
40 
41   static std::map<std::string,std::vector<bocl_kernel*> > kernels;
compile_kernel(const bocl_device_sptr & device,std::vector<bocl_kernel * > & vec_kernels)42   void compile_kernel(const bocl_device_sptr& device,std::vector<bocl_kernel*> & vec_kernels)
43   {
44     //gather all render sources... seems like a lot for rendering...
45     std::vector<std::string> src_paths;
46     std::string source_dir = boxm2_ocl_util::ocl_src_root();
47     src_paths.push_back(source_dir + "scene_info.cl");
48     src_paths.push_back(source_dir + "pixel_conversion.cl");
49     src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
50     src_paths.push_back(source_dir + "backproject.cl");
51     src_paths.push_back(source_dir + "statistics_library_functions.cl");
52     src_paths.push_back(source_dir + "expected_functor.cl");
53     src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
54     src_paths.push_back(source_dir + "bit/render_bit_scene.cl");
55     src_paths.push_back(source_dir + "bit/cast_ray_bit.cl");
56 
57     //set kernel options
58     std::string options = " -D RENDER_Z_IMAGE ";
59     options += " -D DETERMINISTIC ";
60     options += " -D STEP_CELL=step_cell_render_z(cell_minz*linfo->block_len,aux_args.alpha,data_ptr,d*linfo->block_len,vis,aux_args.exp_z,aux_args.exp_z_sqr,aux_args.probsum)";
61 
62     //have kernel construct itself using the context and device
63     auto * ray_trace_kernel = new bocl_kernel();
64 
65     ray_trace_kernel->create_kernel( &device->context(),
66                                      device->device_id(),
67                                      src_paths,
68                                      "render_z_image",   //kernel name
69                                      options,              //options
70                                      "boxm2 opencl render z image"); //kernel identifier (for error checking)
71     vec_kernels.push_back(ray_trace_kernel);
72   }
73 }
74 
boxm2_ocl_render_expected_z_image_process_cons(bprb_func_process & pro)75 bool boxm2_ocl_render_expected_z_image_process_cons(bprb_func_process& pro)
76 {
77   using namespace boxm2_ocl_render_expected_z_image_process_globals;
78 
79   //process takes 6 inputs
80   std::vector<std::string> input_types_(n_inputs_);
81   input_types_[0] = "bocl_device_sptr";
82   input_types_[1] = "boxm2_scene_sptr";
83   input_types_[2] = "boxm2_opencl_cache_sptr";
84   input_types_[3] = "vpgl_camera_double_sptr";
85   input_types_[4] = "unsigned";
86   input_types_[5] = "unsigned";
87   input_types_[6] = "bool"; // if True, scale z values to [0, 255] and save as vil_image_view<vxl_byte>
88 
89 
90   // process has 1 output:
91   // output[0]: scene sptr
92   std::vector<std::string>  output_types_(n_outputs_);
93   output_types_[0] = "vil_image_view_base_sptr";
94   output_types_[1] = "vil_image_view_base_sptr";
95 
96   return pro.set_input_types(input_types_) && pro.set_output_types(output_types_);
97 }
98 
boxm2_ocl_render_expected_z_image_process(bprb_func_process & pro)99 bool boxm2_ocl_render_expected_z_image_process(bprb_func_process& pro)
100 {
101   using namespace boxm2_ocl_render_expected_z_image_process_globals;
102 
103   if ( pro.n_inputs() < n_inputs_ ) {
104     std::cout << pro.name() << ": The input number should be " << n_inputs_<< std::endl;
105     return false;
106   }
107   float transfer_time=0.0f;
108   float gpu_time=0.0f;
109 
110   //get the inputs
111   bocl_device_sptr device= pro.get_input<bocl_device_sptr>(0);
112   boxm2_scene_sptr scene =pro.get_input<boxm2_scene_sptr>(1);
113 
114   boxm2_opencl_cache_sptr opencl_cache= pro.get_input<boxm2_opencl_cache_sptr>(2);
115   vpgl_camera_double_sptr cam= pro.get_input<vpgl_camera_double_sptr>(3);
116   auto ni=pro.get_input<unsigned>(4);
117   auto nj=pro.get_input<unsigned>(5);
118   bool normalize_z_values = pro.get_input<bool>(6);
119 
120   std::string identifier=device->device_identifier();
121 
122   // create a command queue.
123   int status=0;
124   cl_command_queue queue = clCreateCommandQueue(device->context(),
125                                                 *(device->device_id()),
126                                                 CL_QUEUE_PROFILING_ENABLE,
127                                                 &status);
128   if (status!=0)
129     return false;
130 
131   // compile the kernel
132   if (kernels.find(identifier)==kernels.end())
133   {
134     std::cout<<"===========Compiling kernels==========="<<std::endl;
135     std::vector<bocl_kernel*> ks;
136     compile_kernel(device,ks);
137     kernels[identifier]=ks;
138   }
139 
140 #if 0
141   // create all buffers
142   cl_float cam_buffer[48];
143   boxm2_ocl_util::set_persp_camera(cam, cam_buffer);
144   bocl_mem_sptr persp_cam=new bocl_mem(device->context(), cam_buffer, 3*sizeof(cl_float16), "persp cam buffer");
145   persp_cam->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
146 #endif
147 
148   std::size_t local_threads[2]={8,8};
149   unsigned cl_ni=RoundUp(ni,local_threads[0]);
150   unsigned cl_nj=RoundUp(nj,local_threads[1]);
151 
152   auto* buff = new float[cl_ni*cl_nj];
153   for (unsigned i=0;i<cl_ni*cl_nj;i++) buff[i]=0.0f;
154   auto* var_buff = new float[cl_ni*cl_nj];
155   for (unsigned i=0;i<cl_ni*cl_nj;i++) var_buff[i]=0.0f;
156   auto* vis_buff = new float[cl_ni*cl_nj];
157   for (unsigned i=0;i<cl_ni*cl_nj;i++) vis_buff[i]=1.0f;
158   auto* prob_buff = new float[cl_ni*cl_nj];
159   for (unsigned i=0;i<cl_ni*cl_nj;i++) prob_buff[i]=0.0f;
160 
161   bocl_mem_sptr exp_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), buff, "exp z image buffer");
162   exp_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
163 
164   bocl_mem_sptr var_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), var_buff, "z var image buffer");
165   var_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
166 
167   bocl_mem_sptr vis_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), vis_buff, "vis image buffer");
168   vis_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
169 
170   bocl_mem_sptr prob_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), prob_buff, "vis x omega image buffer");
171   prob_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
172 
173   //set generic cam
174   auto* ray_origins = new cl_float[4*cl_ni*cl_nj];
175   auto* ray_directions = new cl_float[4*cl_ni*cl_nj];
176   bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_origins, "ray_origins buffer");
177   bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_directions, "ray_directions buffer");
178   boxm2_ocl_camera_converter::compute_ray_image( device, queue, cam, cl_ni, cl_nj, ray_o_buff, ray_d_buff);
179 
180   // Image Dimensions
181   int img_dim_buff[4];
182   img_dim_buff[0] = 0;
183   img_dim_buff[1] = 0;
184   img_dim_buff[2] = ni;
185   img_dim_buff[3] = nj;
186   bocl_mem_sptr exp_img_dim=new bocl_mem(device->context(), img_dim_buff, sizeof(int)*4, "image dims");
187   exp_img_dim->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
188 
189   // Output Array
190   float output_arr[100];
191   for (float & i : output_arr) i = 0.0f;
192   bocl_mem_sptr  cl_output=new bocl_mem(device->context(), output_arr, sizeof(float)*100, "output buffer");
193   cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
194 
195   // bit lookup buffer
196   cl_uchar lookup_arr[256];
197   boxm2_ocl_util::set_bit_lookup(lookup_arr);
198   bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
199   lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
200 
201   //2. set workgroup size
202   std::size_t lThreads[] = {8, 8};
203   std::size_t gThreads[] = {cl_ni,cl_nj};
204 
205   // set arguments
206   std::vector<boxm2_block_id> vis_order = scene->get_vis_blocks(cam);
207   std::vector<boxm2_block_id>::iterator id;
208   for (id = vis_order.begin(); id != vis_order.end(); ++id)
209   {
210     //choose correct render kernel
211     boxm2_block_metadata mdata = scene->get_block_metadata(*id);
212     bocl_kernel* kern =  kernels[identifier][0];
213 
214     //write the image values to the buffer
215     vul_timer transfer;
216     bocl_mem* blk = opencl_cache->get_block(scene,*id);
217     bocl_mem* alpha = opencl_cache->get_data<BOXM2_ALPHA>(scene,*id);
218     bocl_mem * blk_info = opencl_cache->loaded_block_info();
219     transfer_time          += (float) transfer.all();
220 
221     ////3. SET args
222     kern->set_arg( blk_info );
223     kern->set_arg( blk );
224     kern->set_arg( alpha );
225     kern->set_arg( ray_o_buff.ptr() );
226     kern->set_arg( ray_d_buff.ptr() );
227     kern->set_arg( exp_image.ptr() );
228     kern->set_arg( var_image.ptr() );
229     kern->set_arg( exp_img_dim.ptr());
230     kern->set_arg( cl_output.ptr() );
231     kern->set_arg( lookup.ptr() );
232     kern->set_arg( vis_image.ptr() );
233     kern->set_arg( prob_image.ptr() );
234 
235     //local tree , cumsum buffer, imindex buffer
236     kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_uchar16) );
237     kern->set_local_arg( local_threads[0]*local_threads[1]*10*sizeof(cl_uchar) );
238     kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_int) );
239 
240     //execute kernel
241     kern->execute(queue, 2, lThreads, gThreads);
242     clFinish(queue);
243     gpu_time += kern->exec_time();
244 
245     cl_output->read_to_buffer(queue);
246 
247     // clear render kernel args so it can reset em on next execution
248     kern->clear_args();
249   }
250 #if 0
251   // normalize
252   {
253     bocl_kernel* normalize_kern= kernels[identifier][1];
254     normalize_kern->set_arg( exp_image.ptr() );
255     normalize_kern->set_arg( var_image.ptr() );
256     normalize_kern->set_arg( prob_image.ptr() );
257     normalize_kern->set_arg( exp_img_dim.ptr());
258     normalize_kern->execute( queue, 2, local_threads, gThreads);
259     clFinish(queue);
260     gpu_time += normalize_kern->exec_time();
261 
262     //clear render kernel args so it can reset em on next execution
263     normalize_kern->clear_args();
264     exp_image->read_to_buffer(queue);
265     var_image->read_to_buffer(queue);
266     vis_image->read_to_buffer(queue);
267   }
268 #else
269   exp_image->read_to_buffer(queue);
270   var_image->read_to_buffer(queue);
271   vis_image->read_to_buffer(queue);
272 #endif
273 
274   opencl_cache->unref_mem(ray_o_buff.ptr());
275   opencl_cache->unref_mem(ray_d_buff.ptr());
276   opencl_cache->unref_mem(exp_image.ptr());
277   opencl_cache->unref_mem(vis_image.ptr());
278   opencl_cache->unref_mem(var_image.ptr());
279   opencl_cache->unref_mem(prob_image.ptr());
280 
281 
282   clReleaseCommandQueue(queue);
283 
284 
285   auto* z_image = new vil_image_view<float>(ni,nj);
286   auto* mask = new vil_image_view<vxl_byte>(ni,nj);
287   vil_image_view<float> vis_vil_image(ni,nj);
288 
289   float max_z = -vnl_numeric_traits<float>::maxval;
290   float min_z = vnl_numeric_traits<float>::maxval;
291 
292   std::vector<float> zval_vector; // for sorting and normalization
293 
294   for (unsigned c=0;c<nj;c++)
295     for (unsigned r=0;r<ni;r++)
296     {
297       float vis = vis_buff[c*cl_ni+r];
298       vis_vil_image(r,c) = vis;
299       float z = 0.0f;
300       bool good_z = false;
301       if (vis < 0.95) {
302         z = buff[c*cl_ni+r] / (1.0f - vis);
303         good_z = true;
304         zval_vector.push_back(z);
305         max_z = std::max(max_z,z);
306         min_z = std::min(min_z,z);
307       }
308       (*z_image)(r,c) = z;
309       (*mask)(r,c) = good_z ? vxl_byte(255) : vxl_byte(0);
310     }
311     std::cout << " min_z = " << min_z << "  max_z = " << max_z << std::endl;
312   if (normalize_z_values) {
313     std::sort(zval_vector.begin(),zval_vector.end());
314     unsigned int ngood = zval_vector.size();
315     float min_z = zval_vector[(ngood-1)*0.01];
316     float max_z = zval_vector[(ngood-1)*0.99];
317     std::cout << " min_z (pcent) = " << min_z << "  max_z (pcent) = " << max_z << std::endl;
318     double scale = 255.0 / (max_z - min_z);
319     double offset = -scale * min_z;
320 
321     auto* z_image_byte = new vil_image_view<vxl_byte>(ni,nj);
322     for (unsigned int c=0; c<nj; ++c) {
323       for (unsigned int r=0; r<ni; ++r) {
324         if ((*mask)(r,c)) {
325           float zval = (*z_image)(r,c);
326           float zval_byte = std::max(0.0f,std::min(255.0f,float(zval*scale + offset)));
327           (*z_image_byte)(r,c) = vxl_byte(zval_byte);
328         }
329         else {
330           (*z_image_byte)(r,c) = vxl_byte(0);
331         }
332       }
333     }
334     pro.set_output_val<vil_image_view_base_sptr>(0,z_image_byte);
335   }
336   else {
337     // store scene smart pointer
338     pro.set_output_val<vil_image_view_base_sptr>(0, z_image);
339   }
340 
341   pro.set_output_val<vil_image_view_base_sptr>(1, mask);
342   return true;
343 }
344