1 // This is brl/bseg/boxm2/ocl/pro/processes/boxm2_ocl_render_expected_z_image_process.cxx
2 //:
3 // \file
4 // \brief A process for rendering the scene.
5 //
6 // \author Daniel Crispell
7 // \date November 11, 2011
8
9 #include <fstream>
10 #include <iostream>
11 #include <algorithm>
12 #include <bprb/bprb_func_process.h>
13
14 #ifdef _MSC_VER
15 # include "vcl_msvc_warnings.h"
16 #endif
17 #include <boxm2/ocl/boxm2_opencl_cache.h>
18 #include <boxm2/boxm2_scene.h>
19 #include <boxm2/boxm2_block.h>
20 #include <boxm2/boxm2_data_base.h>
21 #include <boxm2/ocl/boxm2_ocl_util.h>
22 #include "vil/vil_image_view.h"
23 //brdb stuff
24 #include <brdb/brdb_value.h>
25
26 //directory utility
27 #include <vcl_where_root_dir.h>
28 #include <bocl/bocl_device.h>
29 #include <bocl/bocl_kernel.h>
30 #include <boxm2/ocl/algo/boxm2_ocl_render_expected_image_function.h>
31 #include "vul/vul_timer.h"
32
33 #include <boxm2/ocl/algo/boxm2_ocl_camera_converter.h>
34
35 namespace boxm2_ocl_render_expected_z_image_process_globals
36 {
37 constexpr unsigned n_inputs_ = 7;
38 constexpr unsigned n_outputs_ = 2;
39 std::size_t lthreads[2]={8,8};
40
41 static std::map<std::string,std::vector<bocl_kernel*> > kernels;
compile_kernel(const bocl_device_sptr & device,std::vector<bocl_kernel * > & vec_kernels)42 void compile_kernel(const bocl_device_sptr& device,std::vector<bocl_kernel*> & vec_kernels)
43 {
44 //gather all render sources... seems like a lot for rendering...
45 std::vector<std::string> src_paths;
46 std::string source_dir = boxm2_ocl_util::ocl_src_root();
47 src_paths.push_back(source_dir + "scene_info.cl");
48 src_paths.push_back(source_dir + "pixel_conversion.cl");
49 src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
50 src_paths.push_back(source_dir + "backproject.cl");
51 src_paths.push_back(source_dir + "statistics_library_functions.cl");
52 src_paths.push_back(source_dir + "expected_functor.cl");
53 src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
54 src_paths.push_back(source_dir + "bit/render_bit_scene.cl");
55 src_paths.push_back(source_dir + "bit/cast_ray_bit.cl");
56
57 //set kernel options
58 std::string options = " -D RENDER_Z_IMAGE ";
59 options += " -D DETERMINISTIC ";
60 options += " -D STEP_CELL=step_cell_render_z(cell_minz*linfo->block_len,aux_args.alpha,data_ptr,d*linfo->block_len,vis,aux_args.exp_z,aux_args.exp_z_sqr,aux_args.probsum)";
61
62 //have kernel construct itself using the context and device
63 auto * ray_trace_kernel = new bocl_kernel();
64
65 ray_trace_kernel->create_kernel( &device->context(),
66 device->device_id(),
67 src_paths,
68 "render_z_image", //kernel name
69 options, //options
70 "boxm2 opencl render z image"); //kernel identifier (for error checking)
71 vec_kernels.push_back(ray_trace_kernel);
72 }
73 }
74
boxm2_ocl_render_expected_z_image_process_cons(bprb_func_process & pro)75 bool boxm2_ocl_render_expected_z_image_process_cons(bprb_func_process& pro)
76 {
77 using namespace boxm2_ocl_render_expected_z_image_process_globals;
78
79 //process takes 6 inputs
80 std::vector<std::string> input_types_(n_inputs_);
81 input_types_[0] = "bocl_device_sptr";
82 input_types_[1] = "boxm2_scene_sptr";
83 input_types_[2] = "boxm2_opencl_cache_sptr";
84 input_types_[3] = "vpgl_camera_double_sptr";
85 input_types_[4] = "unsigned";
86 input_types_[5] = "unsigned";
87 input_types_[6] = "bool"; // if True, scale z values to [0, 255] and save as vil_image_view<vxl_byte>
88
89
90 // process has 1 output:
91 // output[0]: scene sptr
92 std::vector<std::string> output_types_(n_outputs_);
93 output_types_[0] = "vil_image_view_base_sptr";
94 output_types_[1] = "vil_image_view_base_sptr";
95
96 return pro.set_input_types(input_types_) && pro.set_output_types(output_types_);
97 }
98
boxm2_ocl_render_expected_z_image_process(bprb_func_process & pro)99 bool boxm2_ocl_render_expected_z_image_process(bprb_func_process& pro)
100 {
101 using namespace boxm2_ocl_render_expected_z_image_process_globals;
102
103 if ( pro.n_inputs() < n_inputs_ ) {
104 std::cout << pro.name() << ": The input number should be " << n_inputs_<< std::endl;
105 return false;
106 }
107 float transfer_time=0.0f;
108 float gpu_time=0.0f;
109
110 //get the inputs
111 bocl_device_sptr device= pro.get_input<bocl_device_sptr>(0);
112 boxm2_scene_sptr scene =pro.get_input<boxm2_scene_sptr>(1);
113
114 boxm2_opencl_cache_sptr opencl_cache= pro.get_input<boxm2_opencl_cache_sptr>(2);
115 vpgl_camera_double_sptr cam= pro.get_input<vpgl_camera_double_sptr>(3);
116 auto ni=pro.get_input<unsigned>(4);
117 auto nj=pro.get_input<unsigned>(5);
118 bool normalize_z_values = pro.get_input<bool>(6);
119
120 std::string identifier=device->device_identifier();
121
122 // create a command queue.
123 int status=0;
124 cl_command_queue queue = clCreateCommandQueue(device->context(),
125 *(device->device_id()),
126 CL_QUEUE_PROFILING_ENABLE,
127 &status);
128 if (status!=0)
129 return false;
130
131 // compile the kernel
132 if (kernels.find(identifier)==kernels.end())
133 {
134 std::cout<<"===========Compiling kernels==========="<<std::endl;
135 std::vector<bocl_kernel*> ks;
136 compile_kernel(device,ks);
137 kernels[identifier]=ks;
138 }
139
140 #if 0
141 // create all buffers
142 cl_float cam_buffer[48];
143 boxm2_ocl_util::set_persp_camera(cam, cam_buffer);
144 bocl_mem_sptr persp_cam=new bocl_mem(device->context(), cam_buffer, 3*sizeof(cl_float16), "persp cam buffer");
145 persp_cam->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
146 #endif
147
148 std::size_t local_threads[2]={8,8};
149 unsigned cl_ni=RoundUp(ni,local_threads[0]);
150 unsigned cl_nj=RoundUp(nj,local_threads[1]);
151
152 auto* buff = new float[cl_ni*cl_nj];
153 for (unsigned i=0;i<cl_ni*cl_nj;i++) buff[i]=0.0f;
154 auto* var_buff = new float[cl_ni*cl_nj];
155 for (unsigned i=0;i<cl_ni*cl_nj;i++) var_buff[i]=0.0f;
156 auto* vis_buff = new float[cl_ni*cl_nj];
157 for (unsigned i=0;i<cl_ni*cl_nj;i++) vis_buff[i]=1.0f;
158 auto* prob_buff = new float[cl_ni*cl_nj];
159 for (unsigned i=0;i<cl_ni*cl_nj;i++) prob_buff[i]=0.0f;
160
161 bocl_mem_sptr exp_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), buff, "exp z image buffer");
162 exp_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
163
164 bocl_mem_sptr var_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), var_buff, "z var image buffer");
165 var_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
166
167 bocl_mem_sptr vis_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), vis_buff, "vis image buffer");
168 vis_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
169
170 bocl_mem_sptr prob_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), prob_buff, "vis x omega image buffer");
171 prob_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
172
173 //set generic cam
174 auto* ray_origins = new cl_float[4*cl_ni*cl_nj];
175 auto* ray_directions = new cl_float[4*cl_ni*cl_nj];
176 bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_origins, "ray_origins buffer");
177 bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_directions, "ray_directions buffer");
178 boxm2_ocl_camera_converter::compute_ray_image( device, queue, cam, cl_ni, cl_nj, ray_o_buff, ray_d_buff);
179
180 // Image Dimensions
181 int img_dim_buff[4];
182 img_dim_buff[0] = 0;
183 img_dim_buff[1] = 0;
184 img_dim_buff[2] = ni;
185 img_dim_buff[3] = nj;
186 bocl_mem_sptr exp_img_dim=new bocl_mem(device->context(), img_dim_buff, sizeof(int)*4, "image dims");
187 exp_img_dim->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
188
189 // Output Array
190 float output_arr[100];
191 for (float & i : output_arr) i = 0.0f;
192 bocl_mem_sptr cl_output=new bocl_mem(device->context(), output_arr, sizeof(float)*100, "output buffer");
193 cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
194
195 // bit lookup buffer
196 cl_uchar lookup_arr[256];
197 boxm2_ocl_util::set_bit_lookup(lookup_arr);
198 bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
199 lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
200
201 //2. set workgroup size
202 std::size_t lThreads[] = {8, 8};
203 std::size_t gThreads[] = {cl_ni,cl_nj};
204
205 // set arguments
206 std::vector<boxm2_block_id> vis_order = scene->get_vis_blocks(cam);
207 std::vector<boxm2_block_id>::iterator id;
208 for (id = vis_order.begin(); id != vis_order.end(); ++id)
209 {
210 //choose correct render kernel
211 boxm2_block_metadata mdata = scene->get_block_metadata(*id);
212 bocl_kernel* kern = kernels[identifier][0];
213
214 //write the image values to the buffer
215 vul_timer transfer;
216 bocl_mem* blk = opencl_cache->get_block(scene,*id);
217 bocl_mem* alpha = opencl_cache->get_data<BOXM2_ALPHA>(scene,*id);
218 bocl_mem * blk_info = opencl_cache->loaded_block_info();
219 transfer_time += (float) transfer.all();
220
221 ////3. SET args
222 kern->set_arg( blk_info );
223 kern->set_arg( blk );
224 kern->set_arg( alpha );
225 kern->set_arg( ray_o_buff.ptr() );
226 kern->set_arg( ray_d_buff.ptr() );
227 kern->set_arg( exp_image.ptr() );
228 kern->set_arg( var_image.ptr() );
229 kern->set_arg( exp_img_dim.ptr());
230 kern->set_arg( cl_output.ptr() );
231 kern->set_arg( lookup.ptr() );
232 kern->set_arg( vis_image.ptr() );
233 kern->set_arg( prob_image.ptr() );
234
235 //local tree , cumsum buffer, imindex buffer
236 kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_uchar16) );
237 kern->set_local_arg( local_threads[0]*local_threads[1]*10*sizeof(cl_uchar) );
238 kern->set_local_arg( local_threads[0]*local_threads[1]*sizeof(cl_int) );
239
240 //execute kernel
241 kern->execute(queue, 2, lThreads, gThreads);
242 clFinish(queue);
243 gpu_time += kern->exec_time();
244
245 cl_output->read_to_buffer(queue);
246
247 // clear render kernel args so it can reset em on next execution
248 kern->clear_args();
249 }
250 #if 0
251 // normalize
252 {
253 bocl_kernel* normalize_kern= kernels[identifier][1];
254 normalize_kern->set_arg( exp_image.ptr() );
255 normalize_kern->set_arg( var_image.ptr() );
256 normalize_kern->set_arg( prob_image.ptr() );
257 normalize_kern->set_arg( exp_img_dim.ptr());
258 normalize_kern->execute( queue, 2, local_threads, gThreads);
259 clFinish(queue);
260 gpu_time += normalize_kern->exec_time();
261
262 //clear render kernel args so it can reset em on next execution
263 normalize_kern->clear_args();
264 exp_image->read_to_buffer(queue);
265 var_image->read_to_buffer(queue);
266 vis_image->read_to_buffer(queue);
267 }
268 #else
269 exp_image->read_to_buffer(queue);
270 var_image->read_to_buffer(queue);
271 vis_image->read_to_buffer(queue);
272 #endif
273
274 opencl_cache->unref_mem(ray_o_buff.ptr());
275 opencl_cache->unref_mem(ray_d_buff.ptr());
276 opencl_cache->unref_mem(exp_image.ptr());
277 opencl_cache->unref_mem(vis_image.ptr());
278 opencl_cache->unref_mem(var_image.ptr());
279 opencl_cache->unref_mem(prob_image.ptr());
280
281
282 clReleaseCommandQueue(queue);
283
284
285 auto* z_image = new vil_image_view<float>(ni,nj);
286 auto* mask = new vil_image_view<vxl_byte>(ni,nj);
287 vil_image_view<float> vis_vil_image(ni,nj);
288
289 float max_z = -vnl_numeric_traits<float>::maxval;
290 float min_z = vnl_numeric_traits<float>::maxval;
291
292 std::vector<float> zval_vector; // for sorting and normalization
293
294 for (unsigned c=0;c<nj;c++)
295 for (unsigned r=0;r<ni;r++)
296 {
297 float vis = vis_buff[c*cl_ni+r];
298 vis_vil_image(r,c) = vis;
299 float z = 0.0f;
300 bool good_z = false;
301 if (vis < 0.95) {
302 z = buff[c*cl_ni+r] / (1.0f - vis);
303 good_z = true;
304 zval_vector.push_back(z);
305 max_z = std::max(max_z,z);
306 min_z = std::min(min_z,z);
307 }
308 (*z_image)(r,c) = z;
309 (*mask)(r,c) = good_z ? vxl_byte(255) : vxl_byte(0);
310 }
311 std::cout << " min_z = " << min_z << " max_z = " << max_z << std::endl;
312 if (normalize_z_values) {
313 std::sort(zval_vector.begin(),zval_vector.end());
314 unsigned int ngood = zval_vector.size();
315 float min_z = zval_vector[(ngood-1)*0.01];
316 float max_z = zval_vector[(ngood-1)*0.99];
317 std::cout << " min_z (pcent) = " << min_z << " max_z (pcent) = " << max_z << std::endl;
318 double scale = 255.0 / (max_z - min_z);
319 double offset = -scale * min_z;
320
321 auto* z_image_byte = new vil_image_view<vxl_byte>(ni,nj);
322 for (unsigned int c=0; c<nj; ++c) {
323 for (unsigned int r=0; r<ni; ++r) {
324 if ((*mask)(r,c)) {
325 float zval = (*z_image)(r,c);
326 float zval_byte = std::max(0.0f,std::min(255.0f,float(zval*scale + offset)));
327 (*z_image_byte)(r,c) = vxl_byte(zval_byte);
328 }
329 else {
330 (*z_image_byte)(r,c) = vxl_byte(0);
331 }
332 }
333 }
334 pro.set_output_val<vil_image_view_base_sptr>(0,z_image_byte);
335 }
336 else {
337 // store scene smart pointer
338 pro.set_output_val<vil_image_view_base_sptr>(0, z_image);
339 }
340
341 pro.set_output_val<vil_image_view_base_sptr>(1, mask);
342 return true;
343 }
344