1 // This is brl/bseg/boxm2/ocl/pro/processes/boxm2_ocl_make_inside_voxels_empty_process.cxx
2 #include <iostream>
3 #include <fstream>
4 #include <bprb/bprb_func_process.h>
5 //:
6 // \file
7 // \brief  A process for making the inside cells empty
8 //
9 // TODO: implement a vis_sphere initializer kernel.
10 // \author Ali Osman Ulusoy
11 // \date Oct 10, 2011
12 
13 #ifdef _MSC_VER
14 #  include "vcl_msvc_warnings.h"
15 #endif
16 #include <boxm2/ocl/boxm2_opencl_cache.h>
17 #include <boxm2/boxm2_scene.h>
18 #include <boxm2/boxm2_block.h>
19 #include <boxm2/boxm2_data_base.h>
20 #include <boxm2/ocl/boxm2_ocl_util.h>
21 #include <boxm2/ocl/algo/boxm2_ocl_camera_converter.h>
22 
23 //brdb stuff
24 #include <brdb/brdb_value.h>
25 
//timing utility and OpenCL device/kernel wrappers
27 #include "vul/vul_timer.h"
28 #include <bocl/bocl_device.h>
29 #include <bocl/bocl_kernel.h>
30 
31 namespace boxm2_ocl_make_inside_voxels_empty_process_globals
32 {
33   constexpr unsigned n_inputs_ = 4;
34   constexpr unsigned n_outputs_ = 0;
35   enum {
36       COMPUTE_VIS = 0,
37       DECIDE_INSIDE = 1
38   };
39 
compile_kernel(const bocl_device_sptr & device,std::vector<bocl_kernel * > & vec_kernels,const std::string & opts)40   void compile_kernel(const bocl_device_sptr& device,std::vector<bocl_kernel*> & vec_kernels,const std::string& opts)
41   {
42     //gather all render sources... seems like a lot for rendering...
43     std::vector<std::string> src_paths;
44     std::string source_dir = boxm2_ocl_util::ocl_src_root();
45     src_paths.push_back(source_dir + "scene_info.cl");
46     src_paths.push_back(source_dir + "cell_utils.cl");
47     src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
48     src_paths.push_back(source_dir + "basic/sort_vector.cl");
49     src_paths.push_back(source_dir + "backproject.cl");
50     src_paths.push_back(source_dir + "statistics_library_functions.cl");
51     src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
52     src_paths.push_back(source_dir + "bit/compute_vis.cl");
53     src_paths.push_back(source_dir + "bit/cast_ray_bit.cl");
54 
55     //compilation options
56     std::string options = opts+ "-D INTENSITY ";
57 
58     auto* compute_vis = new bocl_kernel();
59     std::string seg_opts = options + "-D COMPVIS -D STEP_CELL=step_cell_computevis(aux_args,data_ptr,llid,d)";
60     compute_vis->create_kernel(&device->context(),device->device_id(), src_paths, "compute_vis", seg_opts, "compute_vis");
61     vec_kernels.push_back(compute_vis);
62 
63     auto* decide_inside_cell = new bocl_kernel();
64     decide_inside_cell->create_kernel(&device->context(),device->device_id(), src_paths, "decide_inside_cell", seg_opts, "decide_inside_cell");
65     vec_kernels.push_back(decide_inside_cell);
66     return ;
67   }
68 
69   static std::map<std::string,std::vector<bocl_kernel*> > kernels;
70 }
71 
boxm2_ocl_make_inside_voxels_empty_process_cons(bprb_func_process & pro)72 bool boxm2_ocl_make_inside_voxels_empty_process_cons(bprb_func_process& pro)
73 {
74   using namespace boxm2_ocl_make_inside_voxels_empty_process_globals;
75 
76   //process takes 4 inputs
77   std::vector<std::string> input_types_(n_inputs_);
78   input_types_[0] = "bocl_device_sptr";
79   input_types_[1] = "boxm2_scene_sptr";
80   input_types_[2] = "boxm2_opencl_cache_sptr";
81   input_types_[3] = "bool";
82 
83   // process has no outputs
84   std::vector<std::string>  output_types_(n_outputs_);
85   bool good = pro.set_input_types(input_types_) && pro.set_output_types(output_types_);
86 
87   return good;
88 }
89 
90 
boxm2_ocl_make_inside_voxels_empty_process(bprb_func_process & pro)91 bool boxm2_ocl_make_inside_voxels_empty_process(bprb_func_process& pro)
92 {
93   using namespace boxm2_ocl_make_inside_voxels_empty_process_globals;
94   std::size_t local_threads[2]={8,8};
95   std::size_t global_threads[2]={8,8};
96 
97   //sanity check inputs
98   if ( pro.n_inputs() < n_inputs_ ) {
99     std::cout << pro.name() << ": The input number should be " << n_inputs_<< std::endl;
100     return false;
101   }
102   float transfer_time=0.0f;
103   float gpu_time=0.0f;
104 
105   //get the inputs
106   unsigned i = 0;
107   bocl_device_sptr         device = pro.get_input<bocl_device_sptr>(i++);
108   boxm2_scene_sptr         scene = pro.get_input<boxm2_scene_sptr>(i++);
109   boxm2_opencl_cache_sptr  opencl_cache = pro.get_input<boxm2_opencl_cache_sptr>(i++);
110   bool use_sum = false;    use_sum = pro.get_input<bool>(i++);
111 
112   //cache size sanity check
113   long binCache = opencl_cache.ptr()->bytes_in_cache();
114   std::cout<<"Update MBs in cache: "<<binCache/(1024.0*1024.0)<<std::endl;
115 
116   //make correct data types are here
117   std::string data_type,num_obs_type,options;
118 
119   if (use_sum) {
120     options="-D USESUM ";
121     std::cout << "Using sum to compute visibility" << std::endl;
122   }
123 
124 
125   // create a command queue.
126   int status=0;
127   cl_command_queue queue = clCreateCommandQueue( device->context(),
128                                                  *(device->device_id()),
129                                                  CL_QUEUE_PROFILING_ENABLE,
130                                                  &status);
131   if (status!=0)
132     return false;
133 
134   // compile the kernel if not already compiled
135   std::string identifier=device->device_identifier()+options;
136   if (kernels.find(identifier)==kernels.end()) {
137     std::cout<<"===========Compiling kernels==========="<<std::endl;
138     std::vector<bocl_kernel*> ks;
139     compile_kernel(device,ks,options);
140     kernels[identifier]=ks;
141   }
142 
143   // bit lookup buffer
144   cl_uchar lookup_arr[256];
145   boxm2_ocl_util::set_bit_lookup(lookup_arr);
146   bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
147   lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
148 
149   // dodecahedron directions lookup buffer
150   cl_float4 dodecahedron_dir[12];
151   boxm2_ocl_util::set_dodecahedron_dir_lookup(dodecahedron_dir);
152   bocl_mem_sptr dodecahedron_dir_lookup=new bocl_mem(device->context(), dodecahedron_dir, sizeof(cl_float4)*12, "dodecahedron directions lookup buffer");
153   dodecahedron_dir_lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
154 
155   cl_bool contain_point[1];
156   bocl_mem_sptr contain_point_mem =new bocl_mem(device->context(), contain_point, sizeof(cl_bool), "contains point buffer");
157   contain_point_mem->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
158 
159   cl_uint datasize[1];
160   bocl_mem_sptr datasize_mem =new bocl_mem(device->context(), datasize, sizeof(cl_uint), "data buffer size");
161   datasize_mem->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
162 
163   //zip through each block
164   std::map<boxm2_block_id, boxm2_block_metadata> blocks = scene->blocks();
165   std::map<boxm2_block_id, boxm2_block_metadata>::iterator blk_iter;
166   for (unsigned int i=0; i<kernels[identifier].size(); ++i)
167   {
168       //remove all the alphas and points from opencl cache
169       if (i == DECIDE_INSIDE) {
170           for (blk_iter = blocks.begin(); blk_iter != blocks.end(); ++blk_iter)
171           {
172             boxm2_block_id id = blk_iter->first;
173             //opencl_cache->shallow_remove_data(id,boxm2_data_traits<BOXM2_ALPHA>::prefix());
174             opencl_cache->shallow_remove_data(scene,id,boxm2_data_traits<BOXM2_POINT>::prefix());
175           }
176       }
177 
178       for (blk_iter = blocks.begin(); blk_iter != blocks.end(); ++blk_iter)
179       {
180         boxm2_block_id id = blk_iter->first;
181         std::cout << "Processing block: " << id << std::endl;
182 
183         //get kernel
184         bocl_kernel* kern =  kernels[identifier][i];
185 
186         vul_timer transfer;
187 
188         //load normals
189         bocl_mem* normals = opencl_cache->get_data<BOXM2_NORMAL>(scene,blk_iter->first,0,false);
190         std::size_t normalsTypeSize = boxm2_data_info::datasize(boxm2_data_traits<BOXM2_NORMAL>::prefix());
191 
192         //load block info
193         datasize[0] = (unsigned)(normals->num_bytes()/normalsTypeSize);
194         datasize_mem->write_to_buffer((queue));
195 
196         transfer_time += (float) transfer.all();
197         if (i==COMPUTE_VIS) {
198 
199             //array to store visibilities computed around a sphere
200             //ask for a new BOXM2_VIS_SPHERE data so that it gets initialized properly.
201             std::size_t visTypeSize = boxm2_data_info::datasize(boxm2_data_traits<BOXM2_VIS_SPHERE>::prefix());
202             bocl_mem *vis_sphere = opencl_cache->get_data_new<BOXM2_VIS_SPHERE>(scene,blk_iter->first, (normals->num_bytes()/normalsTypeSize)*visTypeSize, false);
203 
204             //zip through each block
205             std::map<boxm2_block_id, boxm2_block_metadata>::iterator blk_iter_inner;
206             for (blk_iter_inner = blocks.begin(); blk_iter_inner != blocks.end(); ++blk_iter_inner) {
207 
208               transfer.mark();
209               boxm2_block_id id_inner = blk_iter_inner->first;
210               //std::cout << "--Loading block " << id_inner << std::endl;
211 
212               //load tree and alpha
213               boxm2_block_metadata mdata = blk_iter_inner->second;
214               vul_timer transfer;
215               bocl_mem* blk = opencl_cache->get_block(scene,blk_iter_inner->first);
216               bocl_mem* blk_info = opencl_cache->loaded_block_info();
217               bocl_mem* alpha = opencl_cache->get_data<BOXM2_ALPHA>(scene,blk_iter_inner->first,0,false);
218               auto* info_buffer = (boxm2_scene_info*) blk_info->cpu_buffer();
219               int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
220               info_buffer->data_buffer_length = (int) (alpha->num_bytes()/alphaTypeSize);
221               blk_info->write_to_buffer((queue));
222 
223               bocl_mem* points = opencl_cache->get_data<BOXM2_POINT>(scene,blk_iter->first,0,false);
224 
225               if (id == id_inner)
226                 contain_point[0] = true;
227               else
228                 contain_point[0] = false;
229               contain_point_mem->write_to_buffer(queue);
230 
231               transfer_time += (float) transfer.all();
232 
233               local_threads[0] = 64;
234               local_threads[1] = 1;
235               global_threads[0] = RoundUp((normals->num_bytes()/normalsTypeSize), local_threads[0]);
236               global_threads[1]=1;
237 
238               kern->set_arg( datasize_mem.ptr() );
239               kern->set_arg( blk_info );
240               kern->set_arg( dodecahedron_dir_lookup.ptr());
241               kern->set_arg( blk );
242               kern->set_arg( lookup.ptr()  );
243               kern->set_arg( alpha  );
244               kern->set_arg( points );
245               kern->set_arg( normals );
246               kern->set_arg( vis_sphere);
247               kern->set_arg( contain_point_mem.ptr());
248               kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_uchar16) );//local tree,
249               kern->set_local_arg( local_threads[0]*local_threads[1]*10*sizeof(cl_uchar) ); //cumsum buffer, imindex buffer
250 
251               //execute kernel
252               kern->execute(queue, 2, local_threads, global_threads);
253               int status = clFinish(queue);
254               check_val(status, MEM_FAILURE, "VISIBIITY EXECUTE FAILED: " + error_to_string(status));
255               gpu_time += kern->exec_time();
256 
257               //clear render kernel args so it can reset em on next execution
258               kern->clear_args();
259             }
260 
261           //read from gpu
262           vis_sphere->read_to_buffer(queue);
263           int status = clFinish(queue);
264           check_val(status, MEM_FAILURE, "READ VIS_SPHERE FAILED: " + error_to_string(status));
265         }
266         else if (i == DECIDE_INSIDE) {
267           transfer.mark();
268 
269           //load tree
270           boxm2_block_metadata mdata = blk_iter->second;
271           vul_timer transfer;
272           /* bocl_mem* blk = */ opencl_cache->get_block(scene,blk_iter->first);
273           bocl_mem* blk_info = opencl_cache->loaded_block_info();
274           auto* info_buffer = (boxm2_scene_info*) blk_info->cpu_buffer();
275           info_buffer->data_buffer_length = (int) (normals->num_bytes()/normalsTypeSize);
276           blk_info->write_to_buffer((queue));
277           bocl_mem* alpha = opencl_cache->get_data<BOXM2_ALPHA>(scene,blk_iter->first,0,false);
278 
279           //load visibilities
280           bocl_mem* vis_sphere = opencl_cache->get_data<BOXM2_VIS_SPHERE>(scene,blk_iter->first,0,false);
281 
282           //array to store final visibility score of a point
283           bocl_mem* vis = opencl_cache->get_data<BOXM2_VIS_SCORE>(scene,blk_iter->first, (normals->num_bytes()/normalsTypeSize)
284                                                   *boxm2_data_info::datasize(boxm2_data_traits<BOXM2_VIS_SCORE>::prefix()),false);
285 
286           transfer_time += (float) transfer.all();
287 
288           local_threads[0] = 128;
289           local_threads[1] = 1;
290           global_threads[0] = RoundUp((normals->num_bytes()/normalsTypeSize), local_threads[0]);
291           global_threads[1]=1;
292 
293           kern->set_arg( blk_info );
294           kern->set_arg( alpha );
295           kern->set_arg( vis );
296           kern->set_arg( vis_sphere);
297           //execute kernel
298           kern->execute(queue, 2, local_threads, global_threads);
299           int status = clFinish(queue);
300           check_val(status, MEM_FAILURE, "DECIDE NORMAL DIR EXECUTE FAILED: " + error_to_string(status));
301           gpu_time += kern->exec_time();
302 
303           //read normals and vis from gpu
304           alpha->read_to_buffer(queue);
305           status = clFinish(queue);
306           check_val(status, MEM_FAILURE, "READ NORMALS FAILED: " + error_to_string(status));
307 
308           //clear render kernel args so it can reset em on next execution
309           kern->clear_args();
310       }
311 
312       //shallow remove from ocl cache unnecessary items from ocl cache.
313       opencl_cache->shallow_remove_data(scene,id,boxm2_data_traits<BOXM2_VIS_SPHERE>::prefix());
314     }
315   }
316 
317   std::cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<std::endl;
318   clReleaseCommandQueue(queue);
319   return true;
320 }
321