1 // This is brl/bseg/boxm2/ocl/algo/boxm2_ocl_update_heightmap_factor.cxx
2 #include <fstream>
3 #include <iostream>
4 #include <algorithm>
5 #include "boxm2_ocl_update_heightmap_factor.h"
6 //:
7 // \file
// \brief OpenCL processes that compute the heightmap "pre"/"post" cell data and update the heightmap factor of a boxm2 scene
9 //
10 // \author Vishal Jain
11 // \date Mar 25, 2011
12
13 #ifdef _MSC_VER
14 # include "vcl_msvc_warnings.h"
15 #endif
16 #include <boxm2/ocl/boxm2_opencl_cache.h>
17 #include <boxm2/boxm2_scene.h>
18 #include <boxm2/boxm2_block.h>
19 #include <boxm2/boxm2_data_base.h>
20 #include <boxm2/ocl/boxm2_ocl_util.h>
21 #include <boxm2/boxm2_util.h>
22 #include <boxm2/ocl/algo/boxm2_ocl_camera_converter.h>
23 #include "vil/vil_image_view.h"
24
25 //directory utility
26 #include "vul/vul_timer.h"
27 #include <vcl_where_root_dir.h>
28 #include <bocl/bocl_device.h>
29 #include <bocl/bocl_kernel.h>
30 #include <boct/boct_bit_tree.h>
31 #include "vnl/vnl_random.h"
//: Compiled-kernel caches, keyed by device identifier plus option string; they persist between process executions so each kernel set is built only once per key
33 std::map<std::string, std::vector<bocl_kernel*> > boxm2_ocl_compute_heightmap_pre_post::pre_kernels_;
34 std::map<std::string, std::vector<bocl_kernel*> > boxm2_ocl_compute_heightmap_pre_post::post_kernels_;
35 std::map<std::string, std::vector<bocl_kernel*> > boxm2_ocl_update_heightmap_factor::update_heightmap_factor_kernels_;
36 std::map<std::string, std::vector<bocl_kernel*> > boxm2_ocl_smooth_heightmap_pdata::smooth_heightmap_pdata_kernels_;
37 //Main public method, updates color model
update_pre(const boxm2_scene_sptr & scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,const vil_image_view_base_sptr & ximg,const vil_image_view_base_sptr & yimg,float,float)38 bool boxm2_ocl_compute_heightmap_pre_post::update_pre(const boxm2_scene_sptr& scene,
39 const bocl_device_sptr& device,
40 const boxm2_opencl_cache_sptr& opencl_cache,
41 const vil_image_view_base_sptr& ximg,
42 const vil_image_view_base_sptr& yimg,
43 float /*resnearfactor*/,
44 float /*resfarfactor*/)
45 {
46 enum {
47 UPDATE_PRE = 0,
48 NORMALIZE_PRE = 1
49 };
50 float transfer_time = 0.0f;
51 float gpu_time = 0.0f;
52 std::size_t local_threads[2] = { 8, 8 };
53 std::size_t global_threads[2] = { 8, 8 };
54
55 //cache size sanity check
56 std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
57 std::cout << "Update MBs in cache: " << binCache / (1024.0*1024.0) << std::endl;
58
59 //make correct data types are here
60 std::string data_type, num_obs_type, options;
61 // create a command queue.
62 int status = 0;
63 cl_command_queue queue = clCreateCommandQueue(device->context(), *(device->device_id()), CL_QUEUE_PROFILING_ENABLE, &status);
64 if (status != 0)
65 return false;
66
67 //grab input image, establish cl_ni, cl_nj (so global size is divisible by local size)
68 auto* ximg_view = static_cast<vil_image_view<float>*>(ximg.ptr());
69 auto* yimg_view = static_cast<vil_image_view<float>*>(yimg.ptr());
70
71 auto cl_ni = (unsigned)RoundUp(ximg->ni(), (int)local_threads[0]);
72 auto cl_nj = (unsigned)RoundUp(ximg->nj(), (int)local_threads[1]);
73 global_threads[0] = cl_ni;
74 global_threads[1] = cl_nj;
75 //set generic cam
76 auto* ray_origins = new cl_float[4 * cl_ni*cl_nj];
77 auto* ray_directions = new cl_float[4 * cl_ni*cl_nj];
78 vgl_box_3d<double> bbox = scene->bounding_box();
79 float z = bbox.max_z();
80 int count = 0;
81 for (unsigned int j = 0; j < cl_nj; ++j) {
82 for (unsigned int i = 0; i < cl_ni; ++i) {
83 if (i < ximg->ni() && j < ximg->nj())
84 {
85 ray_origins[count * 4 + 0] = (*ximg_view)(i, j);
86 ray_origins[count * 4 + 1] = (*yimg_view)(i, j);
87 ray_origins[count * 4 + 2] = z + 1.0f;
88 ray_origins[count * 4 + 3] = 0.0f;
89 ray_directions[count * 4 + 0] = 0.0;
90 ray_directions[count * 4 + 1] = 0.0;
91 ray_directions[count * 4 + 2] = -1.0;
92 ray_directions[count * 4 + 3] = 0.0f;
93 }
94 ++count;
95 }
96 }
97 bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_origins, "ray_origins buffer");
98 ray_o_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
99 bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_directions, "ray_directions buffer");
100 ray_d_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
101
102 float tnearfar[2] = { 0.0f, 1000000 };
103
104 bocl_mem_sptr tnearfar_mem_ptr = opencl_cache->alloc_mem(2 * sizeof(float), tnearfar, "tnearfar buffer");
105 tnearfar_mem_ptr->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
106 //Visibility, Preinf, Norm, and input image buffers
107 auto* vis_buff = new float[cl_ni*cl_nj];
108 auto* pre_buff = new float[cl_ni*cl_nj];
109
110 for (unsigned i = 0; i < cl_ni*cl_nj; i++)
111 {
112 vis_buff[i] = 1.0f;
113 pre_buff[i] = 0.0f;
114 }
115
116 bocl_mem_sptr vis_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), vis_buff, "vis image buffer");
117 vis_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
118
119 //bocl_mem_sptr pre_image=new bocl_mem(device->context(),pre_buff,cl_ni*cl_nj*sizeof(float),"pre image buffer");
120 bocl_mem_sptr pre_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), pre_buff, "pre image buffer");
121 pre_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
122
123 // Image Dimensions
124 int img_dim_buff[4];
125 img_dim_buff[0] = 0;
126 img_dim_buff[1] = 0;
127 img_dim_buff[2] = ximg->ni();
128 img_dim_buff[3] = ximg->nj();
129
130 bocl_mem_sptr img_dim = new bocl_mem(device->context(), img_dim_buff, sizeof(int) * 4, "image dims");
131 img_dim->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
132
133 // Output Array
134 float output_arr[100];
135 for (float & i : output_arr) i = 0.0f;
136 bocl_mem_sptr cl_output = new bocl_mem(device->context(), output_arr, sizeof(float) * 100, "output buffer");
137 cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
138
139 // bit lookup buffer
140 cl_uchar lookup_arr[256];
141 boxm2_ocl_util::set_bit_lookup(lookup_arr);
142 bocl_mem_sptr lookup = new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar) * 256, "bit lookup buffer");
143 lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
144
145 // compile the kernel if not already compiled
146 std::vector<bocl_kernel*>& kernels = get_pre_kernels(device, options);
147 // set arguments
148 std::vector<boxm2_block_id> vis_order = scene->get_block_ids();
149 std::vector<boxm2_block_id>::iterator id;
150
151 for (id = vis_order.begin(); id != vis_order.end(); ++id)
152 {
153 for (unsigned int i = 0; i < kernels.size(); ++i)
154 {
155 //choose correct render kernel
156 boxm2_block_metadata mdata = scene->get_block_metadata(*id);
157 bocl_kernel* kern = kernels[i];
158 //write the image values to the buffer
159 vul_timer transfer;
160 bocl_mem* blk = opencl_cache->get_block(scene, *id);
161 bocl_mem* blk_info = opencl_cache->loaded_block_info();
162 bocl_mem* alpha = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_ALPHA>::prefix());
163 auto* info_buffer = (boxm2_scene_info*)blk_info->cpu_buffer();
164 int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
165 info_buffer->data_buffer_length = (int)(alpha->num_bytes() / alphaTypeSize);
166 //grab an appropriately sized AUX data buffer
167 bocl_mem *aux0 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("seglen_h"), 0, false);
168 bocl_mem *aux1_ph_smooth = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), 0, true);
169 bocl_mem *aux2 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pre_h"), 0, false);
170 bocl_mem *aux3 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX2>::prefix("vis_h"), 0, false);
171 transfer_time += (float)transfer.all();
172 if (i == UPDATE_PRE)
173 {
174 aux0->zero_gpu_buffer(queue);
175 aux2->zero_gpu_buffer(queue);
176 aux3->zero_gpu_buffer(queue);
177 kern->set_arg(blk_info);
178 kern->set_arg(blk);
179 kern->set_arg(alpha);
180 kern->set_arg(aux0);
181 kern->set_arg(aux1_ph_smooth);
182 kern->set_arg(aux2);
183 kern->set_arg(aux3);
184 kern->set_arg(lookup.ptr());
185 kern->set_arg(ray_o_buff.ptr());
186 kern->set_arg(ray_d_buff.ptr());
187 kern->set_arg(tnearfar_mem_ptr.ptr());
188 kern->set_arg(img_dim.ptr());
189 kern->set_arg(vis_image.ptr());
190 kern->set_arg(pre_image.ptr());
191 kern->set_arg(cl_output.ptr());
192 kern->set_local_arg(local_threads[0] * local_threads[1] * sizeof(cl_uchar16));//local tree,
193 kern->set_local_arg(local_threads[0] * local_threads[1] * 10 * sizeof(cl_uchar)); //cumsum buffer, imindex buffer
194 //execute kernel
195 kern->execute(queue, 2, local_threads, global_threads);
196 int status = clFinish(queue);
197 if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
198 return false;
199 gpu_time += kern->exec_time();
200 //clear render kernel args so it can reset em on next execution
201 kern->clear_args();
202 //write info to disk
203 }
204 else if (i == NORMALIZE_PRE)
205 {
206 blk_info->write_to_buffer((queue));
207
208 std::size_t lt[1], gt[1];
209 lt[0] = 64;
210 gt[0] = RoundUp(info_buffer->data_buffer_length, lt[0]);
211
212 kern->set_arg(blk_info);
213 kern->set_arg(aux0);
214 kern->set_arg(aux2);
215 kern->set_arg(aux3);
216 kern->execute(queue, 1, lt, gt);
217 int status = clFinish(queue);
218 if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
219 return false;
220 gpu_time += kern->exec_time();
221 aux2->read_to_buffer(queue);
222 aux3->read_to_buffer(queue);
223 //clear render kernel args so it can reset em on next execution
224 kern->clear_args();
225 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("seglen_h"), false);
226 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), false);
227 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pre_h"), true);
228 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX2>::prefix("vis_h"), true);
229 }
230 //read image out to buffer (from gpu)
231 vis_image->read_to_buffer(queue);
232 pre_image->read_to_buffer(queue);
233 clFinish(queue);
234 }
235 }
236
237 delete[] vis_buff;
238 delete[] pre_buff;
239 delete[] ray_origins;
240 delete[] ray_directions;
241 //opencl_cache->unref_mem(hmean_image.ptr());
242 //opencl_cache->unref_mem(hvar_image.ptr());
243 opencl_cache->unref_mem(vis_image.ptr());
244 opencl_cache->unref_mem(pre_image.ptr());
245 opencl_cache->unref_mem(ray_o_buff.ptr());
246 opencl_cache->unref_mem(ray_d_buff.ptr());
247 opencl_cache->unref_mem(tnearfar_mem_ptr.ptr());
248 std::cout << "Gpu time " << gpu_time << " transfer time " << transfer_time << std::endl;
249 clReleaseCommandQueue(queue);
250 return true;
251 }
252
253 //Main public method, updates color model
update_post(const boxm2_scene_sptr & scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,const vil_image_view_base_sptr & ximg,const vil_image_view_base_sptr & yimg,float,float)254 bool boxm2_ocl_compute_heightmap_pre_post::update_post(const boxm2_scene_sptr& scene,
255 const bocl_device_sptr& device,
256 const boxm2_opencl_cache_sptr& opencl_cache,
257 const vil_image_view_base_sptr& ximg,
258 const vil_image_view_base_sptr& yimg,
259 float /*resnearfactor*/,
260 float /*resfarfactor*/)
261 {
262 enum {
263 UPDATE_POST = 0,
264 NORMALIZE_POST = 1
265 };
266 float transfer_time = 0.0f;
267 float gpu_time = 0.0f;
268 std::size_t local_threads[2] = { 8, 8 };
269 std::size_t global_threads[2] = { 8, 8 };
270
271 //cache size sanity check
272 std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
273 std::cout << "Update MBs in cache: " << binCache / (1024.0*1024.0) << std::endl;
274
275 //make correct data types are here
276 std::string data_type, num_obs_type, options;
277
278 // create a command queue.
279 int status = 0;
280 cl_command_queue queue = clCreateCommandQueue(device->context(),
281 *(device->device_id()),
282 CL_QUEUE_PROFILING_ENABLE,
283 &status);
284 if (status != 0)
285 return false;
286
287 auto* ximg_view = static_cast<vil_image_view<float>*>(ximg.ptr());
288 auto* yimg_view = static_cast<vil_image_view<float>*>(yimg.ptr());
289 auto cl_ni = (unsigned)RoundUp(ximg->ni(), (int)local_threads[0]);
290 auto cl_nj = (unsigned)RoundUp(ximg->nj(), (int)local_threads[1]);
291 global_threads[0] = cl_ni;
292 global_threads[1] = cl_nj;
293 //set generic cam
294 auto* ray_origins = new cl_float[4 * cl_ni*cl_nj];
295 auto* ray_directions = new cl_float[4 * cl_ni*cl_nj];
296 vgl_box_3d<double> bbox = scene->bounding_box();
297 float z = bbox.max_z();
298 int count = 0;
299 for (unsigned int j = 0; j < cl_nj; ++j) {
300 for (unsigned int i = 0; i < cl_ni; ++i) {
301 if (i < ximg->ni() && j < ximg->nj())
302 {
303 ray_origins[count * 4 + 0] = (*ximg_view)(i, j);
304 ray_origins[count * 4 + 1] = (*yimg_view)(i, j);
305 ray_origins[count * 4 + 2] = z + 1.0f;
306 ray_origins[count * 4 + 3] = 0.0f;
307 ray_directions[count * 4 + 0] = 0.0;
308 ray_directions[count * 4 + 1] = 0.0;
309 ray_directions[count * 4 + 2] = -1.0;
310 ray_directions[count * 4 + 3] = 0.0f;
311 }
312 ++count;
313 }
314 }
315 bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_origins, "ray_origins buffer");
316 ray_o_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
317 bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_directions, "ray_directions buffer");
318 ray_d_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
319 float tnearfar[2] = { 0.0f, 1000000 };
320 bocl_mem_sptr tnearfar_mem_ptr = opencl_cache->alloc_mem(2 * sizeof(float), tnearfar, "tnearfar buffer");
321 tnearfar_mem_ptr->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
322 //Visibility, Preinf, Norm, and input image buffers
323 auto* vis_buff = new float[cl_ni*cl_nj];
324 auto* post_buff = new float[cl_ni*cl_nj];
325
326 for (unsigned i = 0; i < cl_ni*cl_nj; i++)
327 {
328 vis_buff[i] = 1.0f;
329 post_buff[i] = 0.0f;
330 }
331
332
333 bocl_mem_sptr vis_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), vis_buff, "vis image buffer");
334 vis_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
335 bocl_mem_sptr post_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), post_buff, "pre image buffer");
336 post_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
337
338 // Image Dimensions
339 int img_dim_buff[4];
340 img_dim_buff[0] = 0;
341 img_dim_buff[1] = 0;
342 img_dim_buff[2] = ximg->ni();
343 img_dim_buff[3] = ximg->nj();
344
345 bocl_mem_sptr img_dim = new bocl_mem(device->context(), img_dim_buff, sizeof(int) * 4, "image dims");
346 img_dim->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
347
348 // Output Array
349 float output_arr[100];
350 for (float & i : output_arr) i = 0.0f;
351 bocl_mem_sptr cl_output = new bocl_mem(device->context(), output_arr, sizeof(float) * 100, "output buffer");
352 cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
353
354 // bit lookup buffer
355 cl_uchar lookup_arr[256];
356 boxm2_ocl_util::set_bit_lookup(lookup_arr);
357 bocl_mem_sptr lookup = new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar) * 256, "bit lookup buffer");
358 lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
359
360 // compile the kernel if not already compiled
361 std::vector<bocl_kernel*>& kernels = get_post_kernels(device, options);
362 // set arguments
363 std::vector<boxm2_block_id> vis_order;
364 vis_order = scene->get_block_ids();
365 std::vector<boxm2_block_id>::iterator id;
366 for (unsigned int i = 0; i < kernels.size(); ++i)
367 {
368 bocl_kernel* kern = kernels[i];
369 for (id = vis_order.begin(); id != vis_order.end(); ++id)
370 {
371 //choose correct render kernel
372 boxm2_block_metadata mdata = scene->get_block_metadata(*id);
373 //write the image values to the buffer
374 vul_timer transfer;
375 bocl_mem* blk = opencl_cache->get_block(scene, *id);
376 bocl_mem* blk_info = opencl_cache->loaded_block_info();
377 bocl_mem* alpha = opencl_cache->get_data<BOXM2_ALPHA>(scene, *id, 0, false);
378 auto* info_buffer = (boxm2_scene_info*)blk_info->cpu_buffer();
379 int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
380 info_buffer->data_buffer_length = (int)(alpha->num_bytes() / alphaTypeSize);
381 // data type string may contain an identifier so determine the buffer size
382 //grab an appropriately sized AUX data buffer
383 bocl_mem *aux0 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("post_seglen_h"), 0, false);
384 bocl_mem *aux1 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("post_h"), 0, false);
385 bocl_mem *aux1_ph_smooth = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), 0, false);
386 transfer_time += (float)transfer.all();
387
388 if (i == UPDATE_POST)
389 {
390 aux0->zero_gpu_buffer(queue);
391 aux1->zero_gpu_buffer(queue);
392 kern->set_arg(blk_info);
393 kern->set_arg(blk);
394 kern->set_arg(alpha);
395 kern->set_arg(aux0);
396 kern->set_arg(aux1_ph_smooth);
397 kern->set_arg(aux1);
398 kern->set_arg(lookup.ptr());
399 kern->set_arg(ray_o_buff.ptr());
400 kern->set_arg(ray_d_buff.ptr());
401 kern->set_arg(tnearfar_mem_ptr.ptr());
402 kern->set_arg(img_dim.ptr());
403 kern->set_arg(vis_image.ptr());
404 kern->set_arg(post_image.ptr());
405 kern->set_arg(cl_output.ptr());
406 kern->set_local_arg(local_threads[0] * local_threads[1] * sizeof(cl_uchar16));//local tree,
407 kern->set_local_arg(local_threads[0] * local_threads[1] * 10 * sizeof(cl_uchar)); //cumsum buffer, imindex buffer
408 //execute kernel
409 kern->execute(queue, 2, local_threads, global_threads);
410 int status = clFinish(queue);
411 if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
412 return false;
413 gpu_time += kern->exec_time();
414 clFinish(queue);
415 //clear render kernel args so it can reset em on next execution
416 kern->clear_args();
417 //write info to disk
418
419 }
420 else if (i == NORMALIZE_POST)
421 {
422 blk_info->write_to_buffer((queue));
423
424 std::size_t lt[1], gt[1];
425 lt[0] = 64;
426 gt[0] = RoundUp(info_buffer->data_buffer_length, lt[0]);
427
428 kern->set_arg(blk_info);
429 kern->set_arg(aux0);
430 kern->set_arg(aux1);
431 kern->execute(queue, 1, lt, gt);
432 int status = clFinish(queue);
433 if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
434 return false;
435 gpu_time += kern->exec_time();
436 aux1->read_to_buffer(queue);
437
438 //clear render kernel args so it can reset em on next execution
439 kern->clear_args();
440 aux1->read_to_buffer(queue);
441 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("post_seglen_h"), false);
442 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("post_h"), true);
443 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), false);
444
445 }
446 //read image out to buffer (from gpu)
447 vis_image->read_to_buffer(queue);
448 post_image->read_to_buffer(queue);
449 cl_output->read_to_buffer(queue);
450 clFinish(queue);
451 }
452 }
453 delete[] vis_buff;
454 delete[] post_buff;
455 delete[] ray_origins;
456 delete[] ray_directions;
457
458 opencl_cache->unref_mem(vis_image.ptr());
459 opencl_cache->unref_mem(post_image.ptr());
460 opencl_cache->unref_mem(ray_o_buff.ptr());
461 opencl_cache->unref_mem(ray_d_buff.ptr());
462 opencl_cache->unref_mem(tnearfar_mem_ptr.ptr());
463 std::cout << "Gpu time " << gpu_time << " transfer time " << transfer_time << std::endl;
464 clReleaseCommandQueue(queue);
465
466 return true;
467 }
468
469 //Main public method, updates color model
compute_pre_post(const boxm2_scene_sptr & scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,const vil_image_view_base_sptr & hmap_mean,const vil_image_view_base_sptr & hmap_var,const vil_image_view_base_sptr & ximg,const vil_image_view_base_sptr & yimg,int smoothingradius,float resnearfactor,float resfarfactor)470 bool boxm2_ocl_compute_heightmap_pre_post::compute_pre_post(const boxm2_scene_sptr& scene,
471 const bocl_device_sptr& device,
472 const boxm2_opencl_cache_sptr& opencl_cache,
473 const vil_image_view_base_sptr& hmap_mean,
474 const vil_image_view_base_sptr& hmap_var,
475 const vil_image_view_base_sptr& ximg,
476 const vil_image_view_base_sptr& yimg,
477 int smoothingradius,
478 float resnearfactor,
479 float resfarfactor)
480 {
481 boxm2_ocl_smooth_heightmap_pdata::compute_smooth_heightmap_pdata(scene, device, opencl_cache, hmap_mean, hmap_var, ximg, yimg, smoothingradius);
482 boxm2_ocl_compute_heightmap_pre_post::update_pre(scene, device, opencl_cache, ximg, yimg, resnearfactor, resfarfactor);
483 boxm2_ocl_compute_heightmap_pre_post::update_post(scene, device, opencl_cache, ximg, yimg, resnearfactor, resfarfactor);
484 return true;
485 }
update_heightmap_factor(const boxm2_scene_sptr & scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,bool add)486 bool boxm2_ocl_update_heightmap_factor::update_heightmap_factor(const boxm2_scene_sptr& scene,
487 const bocl_device_sptr& device,
488 const boxm2_opencl_cache_sptr& opencl_cache,
489 bool add)
490 {
491 float transfer_time = 0.0f;
492 float gpu_time = 0.0f;
493 std::size_t local_threads[1] = { 64 };
494 std::size_t global_threads[1] = { 64 };
495 //cache size sanity check
496 std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
497 std::cout << "Update MBs in cache: " << binCache / (1024.0*1024.0) << std::endl;
498
499 //make correct data types are here
500 std::string data_type, num_obs_type, options;
501 int does_add_buf = add ? 1 : 0;
502 bocl_mem_sptr does_add = new bocl_mem(device->context(), &does_add_buf, sizeof(int) * 1, "add (1) or subtract (0)");
503 does_add->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
504
505 // create a command queue.
506 int status = 0;
507 cl_command_queue queue = clCreateCommandQueue(device->context(),*(device->device_id()),CL_QUEUE_PROFILING_ENABLE,&status);
508 if (status != 0)
509 return false;
510 // compile the kernel if not already compiled
511 bocl_kernel * kern = get_update_heightmap_factor_kernels(device, options)[0];
512 std::vector<boxm2_block_id> blks_order;
513 blks_order = scene->get_block_ids();
514 std::vector<boxm2_block_id>::iterator id;
515
516 for (id = blks_order.begin(); id != blks_order.end(); ++id)
517 {
518 //choose correct render kernel
519 boxm2_block_metadata mdata = scene->get_block_metadata(*id);
520 //write the image values to the buffer
521 vul_timer transfer;
522 bocl_mem* blk = opencl_cache->get_block(scene, *id);
523 bocl_mem* blk_info = opencl_cache->loaded_block_info();
524 bocl_mem* alpha = opencl_cache->get_data<BOXM2_ALPHA>(scene, *id, 0, false);
525 auto* info_buffer = (boxm2_scene_info*)blk_info->cpu_buffer();
526 int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
527 info_buffer->data_buffer_length = (int)(alpha->num_bytes() / alphaTypeSize);
528 blk_info->write_to_buffer((queue));
529 local_threads[0] = 64;
530 global_threads[0] = RoundUp(info_buffer->data_buffer_length, local_threads[0]);
531 //grab an appropriately sized AUX data buffer
532 bocl_mem *aux1_pre = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pre_h"), 0, true);
533 bocl_mem *aux2_pre = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX2>::prefix("vis_h"), 0, true);
534 bocl_mem *aux1 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("post_h"), 0, false);
535 bocl_mem *aux1_ph_smooth = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), 0, false);
536 bocl_mem *aux0_hf = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("hf"), 0, false);
537
538 transfer_time += (float)transfer.all();
539 kern->set_arg(blk_info);
540 kern->set_arg(does_add.ptr());
541 kern->set_arg(aux1_pre);
542 kern->set_arg(aux2_pre);
543 kern->set_arg(aux1);
544 kern->set_arg(aux1_ph_smooth);
545 kern->set_arg(aux0_hf);
546 //execute kernel
547 kern->execute(queue, 1, local_threads, global_threads);
548 int status = clFinish(queue);
549 if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
550 return false;
551 gpu_time += kern->exec_time();
552 //clear render kernel args so it can reset em on next execution
553 aux0_hf->read_to_buffer(queue);
554 kern->clear_args();
555 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pre_h"), false);
556 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX2>::prefix("vis_h"), false);
557 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX2>::prefix("post_h"), false);
558 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX3>::prefix("post_h"), false);
559 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("hf"), true);
560 }
561 clFinish(queue);
562 opencl_cache->unref_mem(does_add.ptr());
563 return true;
564 }
565
566 //Returns vector of color update kernels (and caches them per device
get_pre_kernels(const bocl_device_sptr & device,const std::string & opts)567 std::vector<bocl_kernel*>& boxm2_ocl_compute_heightmap_pre_post::get_pre_kernels(const bocl_device_sptr& device, const std::string& opts)
568 {
569 // compile kernels if not already compiled
570 std::string identifier = device->device_identifier() + opts;
571 if (pre_kernels_.find(identifier) != pre_kernels_.end())
572 return pre_kernels_[identifier];
573
574 //otherwise compile the kernels
575 std::cout << "=== boxm2_ocl_update_process::compiling kernels on device " << identifier << "===" << std::endl;
576 std::vector<std::string> src_paths;
577 std::string source_dir = boxm2_ocl_util::ocl_src_root();
578 src_paths.push_back(source_dir + "scene_info.cl");
579 src_paths.push_back(source_dir + "pixel_conversion.cl");
580 src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
581 src_paths.push_back(source_dir + "backproject.cl");
582 src_paths.push_back(source_dir + "atomics_util.cl");
583 src_paths.push_back(source_dir + "statistics_library_functions.cl");
584 src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
585 src_paths.push_back(source_dir + "bit/update_bp_kernels.cl");
586 std::vector<std::string> non_ray_src = std::vector<std::string>(src_paths);
587 src_paths.push_back(source_dir + "update_functors.cl");
588 src_paths.push_back(source_dir + "bit/cast_ray_bit.cl");
589
590 //compilation options
591 std::string options = "-D ATOMIC_FLOAT ";
592 //populate vector of kernels
593 std::vector<bocl_kernel*> vec_kernels;
594
595 auto* pre = new bocl_kernel();
596 std::string pre_opts = options + " -D PRE_HMAP_CELL -D STEP_CELL=step_cell_pre_hmap(aux_args,data_ptr,llid,d)";
597 pre->create_kernel(&device->context(), device->device_id(), src_paths, "pre_hmap_main", pre_opts, "update::pre_hmap_main");
598 vec_kernels.push_back(pre);
599 auto* normalize_pre = new bocl_kernel();
600 std::string normalize_pre_opts = options + " -D PRE_HMAP_CELL ";
601 normalize_pre->create_kernel(&device->context(), device->device_id(), src_paths, "normalize_prehmap_main", pre_opts, "update::normalize_prehmap_main");
602 vec_kernels.push_back(normalize_pre);
603 //store and return
604 pre_kernels_[identifier] = vec_kernels;
605 return pre_kernels_[identifier];
606 }
607
get_post_kernels(const bocl_device_sptr & device,const std::string & opts)608 std::vector<bocl_kernel*>& boxm2_ocl_compute_heightmap_pre_post::get_post_kernels(const bocl_device_sptr& device, const std::string& opts)
609 {
610 // compile kernels if not already compiled
611 std::string identifier = device->device_identifier() + opts;
612 if (post_kernels_.find(identifier) != post_kernels_.end())
613 return post_kernels_[identifier];
614
615 //otherwise compile the kernels
616 std::cout << "=== boxm2_ocl_update_process::compiling kernels on device " << identifier << "===" << std::endl;
617 std::vector<std::string> src_paths;
618 std::string source_dir = boxm2_ocl_util::ocl_src_root();
619 src_paths.push_back(source_dir + "scene_info.cl");
620 src_paths.push_back(source_dir + "pixel_conversion.cl");
621 src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
622 src_paths.push_back(source_dir + "backproject.cl");
623 src_paths.push_back(source_dir + "atomics_util.cl");
624 src_paths.push_back(source_dir + "statistics_library_functions.cl");
625 src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
626 src_paths.push_back(source_dir + "bit/update_bp_kernels.cl");
627 std::vector<std::string> non_ray_src = std::vector<std::string>(src_paths);
628 src_paths.push_back(source_dir + "bit/cast_ray_bit.cl");
629
630 //compilation options
631 std::string options = "-D ATOMIC_FLOAT -D REVERSE";
632 //populate vector of kernels
633 std::vector<bocl_kernel*> vec_kernels;
634 //seg len pass
635
636 auto* post = new bocl_kernel();
637 std::string post_opts = options + " -D POST_HMAP_CELL -D STEP_CELL=step_cell_post_hmap(aux_args,data_ptr,llid,d)";
638 post->create_kernel(&device->context(), device->device_id(), src_paths, "post_hmap_main", post_opts, "update::post_Cell");
639 vec_kernels.push_back(post);
640
641 auto* normalize_post = new bocl_kernel();
642 std::string normalize_post_opts = options + " -D NORMALIZE_POST_CELL ";
643 normalize_post->create_kernel(&device->context(), device->device_id(), non_ray_src, "normalize_post_cell", normalize_post_opts, "update::normalize_post_cell");
644 vec_kernels.push_back(normalize_post);
645 //store and return
646 post_kernels_[identifier] = vec_kernels;
647 return post_kernels_[identifier];
648 }
649
get_update_heightmap_factor_kernels(const bocl_device_sptr & device,const std::string & opts)650 std::vector<bocl_kernel*>& boxm2_ocl_update_heightmap_factor::get_update_heightmap_factor_kernels(const bocl_device_sptr& device, const std::string& opts)
651 {
652 // compile kernels if not already compiled
653 std::string identifier = device->device_identifier() + opts;
654 if (update_heightmap_factor_kernels_.find(identifier) != update_heightmap_factor_kernels_.end())
655 return update_heightmap_factor_kernels_[identifier];
656
657 //otherwise compile the kernels
658 std::cout << "=== boxm2_ocl_update_process::compiling kernels on device " << identifier << "===" << std::endl;
659 std::vector<std::string> src_paths;
660 std::string source_dir = boxm2_ocl_util::ocl_src_root();
661 src_paths.push_back(source_dir + "scene_info.cl");
662 src_paths.push_back(source_dir + "pixel_conversion.cl");
663 src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
664 src_paths.push_back(source_dir + "backproject.cl");
665 src_paths.push_back(source_dir + "atomics_util.cl");
666 src_paths.push_back(source_dir + "statistics_library_functions.cl");
667 src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
668 src_paths.push_back(source_dir + "bit/update_bp_kernels.cl");
669
670 //compilation options
671 std::string options = "-D ATOMIC_FLOAT -D ADD_SUBTRACT_FACTOR";
672 //populate vector of kernels
673 std::vector<bocl_kernel*> vec_kernels;
674 auto* computez = new bocl_kernel();
675 const std::string& computez_opts = options;
676 computez->create_kernel(&device->context(), device->device_id(), src_paths, "add_subtract_factor_main", computez_opts, "update::add_subtract_factor_main");
677 vec_kernels.push_back(computez);
678
679 //store and return
680 update_heightmap_factor_kernels_[identifier] = vec_kernels;
681 return update_heightmap_factor_kernels_[identifier];
682 }
683
684
685 bool boxm2_ocl_smooth_heightmap_pdata::
compute_smooth_heightmap_pdata(boxm2_scene_sptr scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,const vil_image_view_base_sptr & hmap_mean,const vil_image_view_base_sptr & hmap_var,const vil_image_view_base_sptr & ximg,const vil_image_view_base_sptr & yimg,int smoothingradius=16,float,float)686 compute_smooth_heightmap_pdata(boxm2_scene_sptr scene,
687 const bocl_device_sptr& device,
688 const boxm2_opencl_cache_sptr& opencl_cache,
689 const vil_image_view_base_sptr& hmap_mean,
690 const vil_image_view_base_sptr& hmap_var,
691 const vil_image_view_base_sptr& ximg,
692 const vil_image_view_base_sptr& yimg,
693 int smoothingradius = 16,
694 float /*resnearfactor*/,
695 float /*resfarfactor*/)
696 {
697
698 float transfer_time = 0.0f;
699 float gpu_time = 0.0f;
700 std::size_t local_threads[2] = { 8, 8 };
701 std::size_t global_threads[2] = { 8, 8 };
702
703 //cache size sanity check
704 std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
705 std::cout << "Update MBs in cache: " << binCache / (1024.0*1024.0) << std::endl;
706
707 //make correct data types are here
708 std::string data_type, num_obs_type, options;
709
710 // create a command queue.
711 int status = 0;
712 cl_command_queue queue = clCreateCommandQueue(device->context(),
713 *(device->device_id()),
714 CL_QUEUE_PROFILING_ENABLE,
715 &status);
716 if (status != 0)
717 return false;
718
719 //grab input image, establish cl_ni, cl_nj (so global size is divisible by local size)
720
721 auto* hmap_mean_view = static_cast<vil_image_view<float>*>(hmap_mean.ptr());
722 auto* hmap_var_view = static_cast<vil_image_view<float>*>(hmap_var.ptr());
723 auto* ximg_view = static_cast<vil_image_view<float>*>(ximg.ptr());
724 auto* yimg_view = static_cast<vil_image_view<float>*>(yimg.ptr());
725
726 auto cl_ni = (unsigned)RoundUp(hmap_mean_view->ni(), (int)local_threads[0]);
727 auto cl_nj = (unsigned)RoundUp(hmap_mean_view->nj(), (int)local_threads[1]);
728 global_threads[0] = cl_ni;
729 global_threads[1] = cl_nj;
730 //set generic cam
731 auto* ray_origins = new cl_float[4 * cl_ni*cl_nj];
732 auto* ray_directions = new cl_float[4 * cl_ni*cl_nj];
733 vgl_box_3d<double> bbox = scene->bounding_box();
734 float z = bbox.max_z();
735 int count = 0;
736 for (unsigned int j = 0; j < cl_nj; ++j) {
737 for (unsigned int i = 0; i < cl_ni; ++i) {
738 if (i < hmap_mean_view->ni() && j < hmap_mean_view->nj())
739 {
740 ray_origins[count * 4 + 0] = (*ximg_view)(i, j);
741 ray_origins[count * 4 + 1] = (*yimg_view)(i, j);
742 ray_origins[count * 4 + 2] = z + 1.0f;
743 ray_origins[count * 4 + 3] = 0.0f;
744 ray_directions[count * 4 + 0] = 0.0;
745 ray_directions[count * 4 + 1] = 0.0;
746 ray_directions[count * 4 + 2] = -1.0;
747 ray_directions[count * 4 + 3] = 0.0f;
748 }
749 ++count;
750 }
751 }
752 bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_origins, "ray_origins buffer");
753 ray_o_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
754 bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_directions, "ray_directions buffer");
755 ray_d_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
756 float tnearfar[2] = { 0.0f, 1000000 };
757 bocl_mem_sptr tnearfar_mem_ptr = opencl_cache->alloc_mem(2 * sizeof(float), tnearfar, "tnearfar buffer");
758 tnearfar_mem_ptr->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
759 //Visibility, Preinf, Norm, and input image buffers
760 auto* hmean_buff = new float[cl_ni*cl_nj];
761 auto* hvar_buff = new float[cl_ni*cl_nj];
762
763 //copy input vals into image
764 count = 0;
765 for (unsigned int j = 0; j < cl_nj; ++j) {
766 for (unsigned int i = 0; i < cl_ni; ++i) {
767 hmean_buff[count] = 0.0f;
768 hvar_buff[count] = 0.0f;
769 if (i < hmap_mean_view->ni() && j < hmap_mean_view->nj())
770 {
771 hmean_buff[count] = (*hmap_mean_view)(i, j);
772 hvar_buff[count] = (*hmap_var_view)(i, j);
773 }
774 ++count;
775 }
776 }
777 bocl_mem_sptr hmean_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), hmean_buff, "input image buffer");
778 hmean_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
779 bocl_mem_sptr hvar_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), hvar_buff, "input image buffer");
780 hvar_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
781 // Image Dimensions
782 int img_dim_buff[4];
783 img_dim_buff[0] = 0;
784 img_dim_buff[1] = 0;
785 img_dim_buff[2] = hmap_mean_view->ni();
786 img_dim_buff[3] = hmap_mean_view->nj();
787
788 bocl_mem_sptr img_dim = new bocl_mem(device->context(), img_dim_buff, sizeof(int) * 4, "image dims");
789 img_dim->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
790
791 // Output Array
792 float output_arr[100];
793 for (float & i : output_arr) i = 0.0f;
794 bocl_mem_sptr cl_output = new bocl_mem(device->context(), output_arr, sizeof(float) * 100, "output buffer");
795 cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
796
797 // bit lookup buffer
798 cl_uchar lookup_arr[256];
799 boxm2_ocl_util::set_bit_lookup(lookup_arr);
800 bocl_mem_sptr lookup = new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar) * 256, "bit lookup buffer");
801 lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
802 //: avg the Pheight with the neighbors
803 vnl_random rand;
804 int numsamples = 16;
805 int rad = smoothingradius;
806 auto * weights = new float[numsamples];
807 int * pts = new int[2 * numsamples];
808 for (int i = 0; i < numsamples;)
809 {
810 int x = -rad + rand.lrand32(0, 2 * rad + 1);
811 int y = -rad + rand.lrand32(0, 2 * rad + 1);
812 if (x == 0 && y == 0) x = 1;
813 pts[2 * i] = x;
814 pts[2 * i + 1] = y;
815 weights[i] = 1 - std::sqrt(float(x*x + y*y)) / (float(rad));
816 i++;
817 }
818 typedef vnl_vector_fixed<unsigned char, 16> uchar16;
819 // compile the kernel if not already compiled
820 std::vector<bocl_kernel*>& kernels = get_smooth_heightmap_pdata_kernels(device, options);
821 // set arguments
822 std::vector<boxm2_block_id> vis_order;
823 vis_order = scene->get_block_ids();
824 std::vector<boxm2_block_id>::iterator id;
825 bocl_kernel* kern = kernels[0];
826 for (id = vis_order.begin(); id != vis_order.end(); ++id)
827 {
828 //choose correct render kernel
829 boxm2_block_metadata mdata = scene->get_block_metadata(*id);
830 //write the image values to the buffer
831 vul_timer transfer;
832 bocl_mem* blk = opencl_cache->get_block(scene, *id);
833 bocl_mem* blk_info = opencl_cache->loaded_block_info();
834 //grab an appropriately sized AUX data buffer
835 bocl_mem *aux0 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("length"), 0, false);
836 bocl_mem *aux1 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight"), 0, false);
837
838 transfer_time += (float)transfer.all();
839 aux0->zero_gpu_buffer(queue);
840 aux1->zero_gpu_buffer(queue);
841 kern->set_arg(blk_info);
842 kern->set_arg(blk);
843 kern->set_arg(aux0);
844 kern->set_arg(aux1);
845 kern->set_arg(lookup.ptr());
846 kern->set_arg(ray_o_buff.ptr());
847 kern->set_arg(ray_d_buff.ptr());
848 kern->set_arg(tnearfar_mem_ptr.ptr());
849 kern->set_arg(img_dim.ptr());
850 kern->set_arg(hmean_image.ptr());
851 kern->set_arg(hvar_image.ptr());
852 kern->set_arg(cl_output.ptr());
853 kern->set_local_arg(local_threads[0] * local_threads[1] * sizeof(cl_uchar16));
854 kern->set_local_arg(local_threads[0] * local_threads[1] * 10 * sizeof(cl_uchar));
855 //execute kernel
856 kern->execute(queue, 2, local_threads, global_threads);
857 int status = clFinish(queue);
858 if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
859 return false;
860 gpu_time += kern->exec_time();
861 //clear render kernel args so it can reset em on next execution
862 kern->clear_args();
863 aux0->read_to_buffer(queue);
864 aux1->read_to_buffer(queue);
865 // boxm2_block_metadata data = scene->get_block_metadata(*id);
866 boxm2_block * cblk = opencl_cache->get_cpu_cache()->get_block(scene, *id);
867 //get data from cache
868 boxm2_data_base * aux0_len_b = opencl_cache->get_cpu_cache()->get_data_base(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("length"));
869 boxm2_data_base * aux1_ph_b = opencl_cache->get_cpu_cache()->get_data_base(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight"));
870 boxm2_data_base * aux1_ph_smooth_b = opencl_cache->get_cpu_cache()->get_data_base(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), 0, false);
871 //3d array of trees
872 const boxm2_array_3d<uchar16>& trees = cblk->trees();
873 auto * aux0_len = (boxm2_data_traits<BOXM2_AUX0>::datatype*) aux0_len_b->data_buffer();
874 auto * aux1_ph = (boxm2_data_traits<BOXM2_AUX1>::datatype*) aux1_ph_b->data_buffer();
875 auto * aux1_ph_smooth = (boxm2_data_traits<BOXM2_AUX1>::datatype*) aux1_ph_smooth_b->data_buffer();
876
877 //iterate through each tree
878 for (unsigned int x = 0; x < trees.get_row1_count(); ++x) {
879 for (unsigned int y = 0; y < trees.get_row2_count(); ++y) {
880 for (unsigned int z = 0; z < trees.get_row3_count(); ++z) {
881 //load current block/tree
882 uchar16 tree = trees(x, y, z);
883 boct_bit_tree bit_tree((unsigned char*)tree.data_block(), mdata.max_level_);
884 //iterate through leaves of the tree
885 std::vector<int> leafBits = bit_tree.get_leaf_bits(0);
886 std::vector<int>::iterator iter;
887 for (iter = leafBits.begin(); iter != leafBits.end(); ++iter)
888 {
889 int currIdx = bit_tree.get_data_index((*iter)); //data index
890 vgl_point_3d<double> localCenter = bit_tree.cell_center((*iter));
891 float px = (localCenter.x() + x)*mdata.sub_block_dim_.x() + mdata.local_origin_.x();
892 float py = (localCenter.y() + y)*mdata.sub_block_dim_.y() + mdata.local_origin_.y();
893 float pz = (localCenter.z() + z)*mdata.sub_block_dim_.z() + mdata.local_origin_.z();
894 double sumweight = 1.0;
895 if (aux0_len[currIdx] > 1e-10)
896 aux1_ph_smooth[currIdx] = aux1_ph[currIdx] / aux0_len[currIdx];
897 else
898 aux1_ph_smooth[currIdx] = 0.0;
899 //: iterate over neighbors
900 for (size_t k = 0; k < numsamples; k++)
901 {
902 vgl_point_3d<double> neighbor_pt(px + pts[2 * k] * mdata.sub_block_dim_.x(),
903 py + pts[2 * k + 1] * mdata.sub_block_dim_.y(), pz);
904 unsigned int data_index = 0;
905 if (cblk->data_index(neighbor_pt, data_index))
906 {
907 if (aux0_len[data_index] > 1e-10)
908 {
909 aux1_ph_smooth[currIdx] += (aux1_ph[data_index] / aux0_len[data_index]);
910 sumweight += weights[k];
911 }
912 }
913 }
914 aux1_ph_smooth[currIdx] = aux1_ph_smooth[currIdx] / sumweight;
915 }
916 }
917 }
918 }
919 opencl_cache->get_cpu_cache()->remove_data_base(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), true);
920 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight"), false);
921 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("length"), false);
922 }
923 clFinish(queue);
924
925 delete[] hmean_buff;
926 delete[] hvar_buff;
927 delete[] ray_origins;
928 delete[] ray_directions;
929 opencl_cache->unref_mem(hmean_image.ptr());
930 opencl_cache->unref_mem(hvar_image.ptr());
931 opencl_cache->unref_mem(ray_o_buff.ptr());
932 opencl_cache->unref_mem(ray_d_buff.ptr());
933 opencl_cache->unref_mem(tnearfar_mem_ptr.ptr());
934 std::cout << "Gpu time " << gpu_time << " transfer time " << transfer_time << std::endl;
935 clReleaseCommandQueue(queue);
936
937 delete[] weights;
938 delete[] pts;
939 return true;
940 }
941 std::vector<bocl_kernel*>& boxm2_ocl_smooth_heightmap_pdata::
get_smooth_heightmap_pdata_kernels(const bocl_device_sptr & device,const std::string & opts)942 get_smooth_heightmap_pdata_kernels(const bocl_device_sptr& device, const std::string& opts)
943 {
944 // compile kernels if not already compiled
945 std::string identifier = device->device_identifier() + opts;
946 if (smooth_heightmap_pdata_kernels_.find(identifier) != smooth_heightmap_pdata_kernels_.end())
947 return smooth_heightmap_pdata_kernels_[identifier];
948
949 //otherwise compile the kernels
950 std::cout << "=== boxm2_ocl_update_process::compiling kernels on device " << identifier << "===" << std::endl;
951 std::vector<std::string> src_paths;
952 std::string source_dir = boxm2_ocl_util::ocl_src_root();
953 src_paths.push_back(source_dir + "scene_info.cl");
954 src_paths.push_back(source_dir + "pixel_conversion.cl");
955 src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
956 src_paths.push_back(source_dir + "backproject.cl");
957 src_paths.push_back(source_dir + "atomics_util.cl");
958 src_paths.push_back(source_dir + "statistics_library_functions.cl");
959 src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
960 src_paths.push_back(source_dir + "bit/update_bp_kernels.cl");
961 std::vector<std::string> non_ray_src = std::vector<std::string>(src_paths);
962 src_paths.push_back(source_dir + "update_functors.cl");
963 src_paths.push_back(source_dir + "bit/cast_ray_bit.cl");
964
965 //compilation options
966 std::string options = "-D ATOMIC_FLOAT ";
967 //populate vector of kernels
968 std::vector<bocl_kernel*> vec_kernels;
969 //seg len pass
970 auto* seg_len = new bocl_kernel();
971 std::string seg_opts = options + " -D HMAP_DENSITY_CELL -D STEP_CELL=step_cell_hmap_density(aux_args,data_ptr,llid,d,tblock)";
972 seg_len->create_kernel(&device->context(), device->device_id(), src_paths, "compute_hmap_density_main", seg_opts, "update::hmap_density_main");
973 vec_kernels.push_back(seg_len);
974
975 //store and return
976 smooth_heightmap_pdata_kernels_[identifier] = vec_kernels;
977 return smooth_heightmap_pdata_kernels_[identifier];
978 }
979