1 // This is brl/bseg/boxm2/ocl/algo/boxm2_ocl_update_heightmap_factor.cxx
2 #include <fstream>
3 #include <iostream>
4 #include <algorithm>
5 #include "boxm2_ocl_update_heightmap_factor.h"
6 //:
7 // \file
8 // \brief  OpenCL routines that compute height-map pre/post terms and update the height-map factor
9 //
10 // \author Vishal Jain
11 // \date Mar 25, 2011
12 
13 #ifdef _MSC_VER
14 #  include "vcl_msvc_warnings.h"
15 #endif
16 #include <boxm2/ocl/boxm2_opencl_cache.h>
17 #include <boxm2/boxm2_scene.h>
18 #include <boxm2/boxm2_block.h>
19 #include <boxm2/boxm2_data_base.h>
20 #include <boxm2/ocl/boxm2_ocl_util.h>
21 #include <boxm2/boxm2_util.h>
22 #include <boxm2/ocl/algo/boxm2_ocl_camera_converter.h>
23 #include "vil/vil_image_view.h"
24 
25 //directory utility
26 #include "vul/vul_timer.h"
27 #include <vcl_where_root_dir.h>
28 #include <bocl/bocl_device.h>
29 #include <bocl/bocl_kernel.h>
30 #include <boct/boct_bit_tree.h>
31 #include "vnl/vnl_random.h"
32 //: Map of kernels should persist between process executions
// Out-of-class definitions of the static kernel caches.
// get_pre_kernels() and get_post_kernels() look entries up in pre_kernels_ /
// post_kernels_ using the string (device identifier + option string) as key.
// The accessors for the other two maps are not part of this listing --
// presumably they are keyed the same way (TODO confirm).
33 std::map<std::string, std::vector<bocl_kernel*> > boxm2_ocl_compute_heightmap_pre_post::pre_kernels_;
34 std::map<std::string, std::vector<bocl_kernel*> > boxm2_ocl_compute_heightmap_pre_post::post_kernels_;
35 std::map<std::string, std::vector<bocl_kernel*> > boxm2_ocl_update_heightmap_factor::update_heightmap_factor_kernels_;
36 std::map<std::string, std::vector<bocl_kernel*> > boxm2_ocl_smooth_heightmap_pdata::smooth_heightmap_pdata_kernels_;
37 //Height-map "pre" pass: runs the pre and normalize-pre kernels on every block
update_pre(const boxm2_scene_sptr & scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,const vil_image_view_base_sptr & ximg,const vil_image_view_base_sptr & yimg,float,float)38 bool boxm2_ocl_compute_heightmap_pre_post::update_pre(const boxm2_scene_sptr&         scene,
39                                                       const bocl_device_sptr&         device,
40                                                       const boxm2_opencl_cache_sptr&  opencl_cache,
41                                                       const vil_image_view_base_sptr& ximg,
42                                                       const vil_image_view_base_sptr& yimg,
43                                                       float  /*resnearfactor*/,
44                                                       float  /*resfarfactor*/)
45 {
46     enum {
47         UPDATE_PRE = 0,
48         NORMALIZE_PRE = 1
49     };
50     float transfer_time = 0.0f;
51     float gpu_time = 0.0f;
52     std::size_t local_threads[2] = { 8, 8 };
53     std::size_t global_threads[2] = { 8, 8 };
54 
55     //cache size sanity check
56     std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
57     std::cout << "Update MBs in cache: " << binCache / (1024.0*1024.0) << std::endl;
58 
59     //make correct data types are here
60     std::string data_type, num_obs_type, options;
61     // create a command queue.
62     int status = 0;
63     cl_command_queue queue = clCreateCommandQueue(device->context(), *(device->device_id()), CL_QUEUE_PROFILING_ENABLE, &status);
64     if (status != 0)
65         return false;
66 
67     //grab input image, establish cl_ni, cl_nj (so global size is divisible by local size)
68     auto* ximg_view = static_cast<vil_image_view<float>*>(ximg.ptr());
69     auto* yimg_view = static_cast<vil_image_view<float>*>(yimg.ptr());
70 
71     auto cl_ni = (unsigned)RoundUp(ximg->ni(), (int)local_threads[0]);
72     auto cl_nj = (unsigned)RoundUp(ximg->nj(), (int)local_threads[1]);
73     global_threads[0] = cl_ni;
74     global_threads[1] = cl_nj;
75     //set generic cam
76     auto* ray_origins = new cl_float[4 * cl_ni*cl_nj];
77     auto* ray_directions = new cl_float[4 * cl_ni*cl_nj];
78     vgl_box_3d<double> bbox = scene->bounding_box();
79     float z = bbox.max_z();
80     int count = 0;
81     for (unsigned int j = 0; j < cl_nj; ++j) {
82         for (unsigned int i = 0; i < cl_ni; ++i) {
83             if (i < ximg->ni() && j < ximg->nj())
84             {
85                 ray_origins[count * 4 + 0] = (*ximg_view)(i, j);
86                 ray_origins[count * 4 + 1] = (*yimg_view)(i, j);
87                 ray_origins[count * 4 + 2] = z + 1.0f;
88                 ray_origins[count * 4 + 3] = 0.0f;
89                 ray_directions[count * 4 + 0] = 0.0;
90                 ray_directions[count * 4 + 1] = 0.0;
91                 ray_directions[count * 4 + 2] = -1.0;
92                 ray_directions[count * 4 + 3] = 0.0f;
93             }
94             ++count;
95         }
96     }
97     bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_origins, "ray_origins buffer");
98     ray_o_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
99     bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_directions, "ray_directions buffer");
100     ray_d_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
101 
102     float tnearfar[2] = { 0.0f, 1000000 };
103 
104     bocl_mem_sptr tnearfar_mem_ptr = opencl_cache->alloc_mem(2 * sizeof(float), tnearfar, "tnearfar  buffer");
105     tnearfar_mem_ptr->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
106     //Visibility, Preinf, Norm, and input image buffers
107     auto* vis_buff = new float[cl_ni*cl_nj];
108     auto* pre_buff = new float[cl_ni*cl_nj];
109 
110     for (unsigned i = 0; i < cl_ni*cl_nj; i++)
111     {
112         vis_buff[i] = 1.0f;
113         pre_buff[i] = 0.0f;
114     }
115 
116     bocl_mem_sptr vis_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), vis_buff, "vis image buffer");
117     vis_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
118 
119     //bocl_mem_sptr pre_image=new bocl_mem(device->context(),pre_buff,cl_ni*cl_nj*sizeof(float),"pre image buffer");
120     bocl_mem_sptr pre_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), pre_buff, "pre image buffer");
121     pre_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
122 
123     // Image Dimensions
124     int img_dim_buff[4];
125     img_dim_buff[0] = 0;
126     img_dim_buff[1] = 0;
127     img_dim_buff[2] = ximg->ni();
128     img_dim_buff[3] = ximg->nj();
129 
130     bocl_mem_sptr img_dim = new bocl_mem(device->context(), img_dim_buff, sizeof(int) * 4, "image dims");
131     img_dim->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
132 
133     // Output Array
134     float output_arr[100];
135     for (float & i : output_arr) i = 0.0f;
136     bocl_mem_sptr  cl_output = new bocl_mem(device->context(), output_arr, sizeof(float) * 100, "output buffer");
137     cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
138 
139     // bit lookup buffer
140     cl_uchar lookup_arr[256];
141     boxm2_ocl_util::set_bit_lookup(lookup_arr);
142     bocl_mem_sptr lookup = new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar) * 256, "bit lookup buffer");
143     lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
144 
145     // compile the kernel if not already compiled
146     std::vector<bocl_kernel*>& kernels = get_pre_kernels(device, options);
147     // set arguments
148     std::vector<boxm2_block_id> vis_order = scene->get_block_ids();
149     std::vector<boxm2_block_id>::iterator id;
150 
151     for (id = vis_order.begin(); id != vis_order.end(); ++id)
152     {
153         for (unsigned int i = 0; i < kernels.size(); ++i)
154         {
155             //choose correct render kernel
156             boxm2_block_metadata mdata = scene->get_block_metadata(*id);
157             bocl_kernel* kern = kernels[i];
158             //write the image values to the buffer
159             vul_timer transfer;
160             bocl_mem* blk = opencl_cache->get_block(scene, *id);
161             bocl_mem* blk_info = opencl_cache->loaded_block_info();
162             bocl_mem* alpha = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_ALPHA>::prefix());
163             auto* info_buffer = (boxm2_scene_info*)blk_info->cpu_buffer();
164             int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
165             info_buffer->data_buffer_length = (int)(alpha->num_bytes() / alphaTypeSize);
166             //grab an appropriately sized AUX data buffer
167             bocl_mem *aux0 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("seglen_h"), 0, false);
168             bocl_mem *aux1_ph_smooth = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), 0, true);
169             bocl_mem *aux2 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pre_h"), 0, false);
170             bocl_mem *aux3 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX2>::prefix("vis_h"), 0, false);
171             transfer_time += (float)transfer.all();
172             if (i == UPDATE_PRE)
173             {
174                 aux0->zero_gpu_buffer(queue);
175                 aux2->zero_gpu_buffer(queue);
176                 aux3->zero_gpu_buffer(queue);
177                 kern->set_arg(blk_info);
178                 kern->set_arg(blk);
179                 kern->set_arg(alpha);
180                 kern->set_arg(aux0);
181                 kern->set_arg(aux1_ph_smooth);
182                 kern->set_arg(aux2);
183                 kern->set_arg(aux3);
184                 kern->set_arg(lookup.ptr());
185                 kern->set_arg(ray_o_buff.ptr());
186                 kern->set_arg(ray_d_buff.ptr());
187                 kern->set_arg(tnearfar_mem_ptr.ptr());
188                 kern->set_arg(img_dim.ptr());
189                 kern->set_arg(vis_image.ptr());
190                 kern->set_arg(pre_image.ptr());
191                 kern->set_arg(cl_output.ptr());
192                 kern->set_local_arg(local_threads[0] * local_threads[1] * sizeof(cl_uchar16));//local tree,
193                 kern->set_local_arg(local_threads[0] * local_threads[1] * 10 * sizeof(cl_uchar)); //cumsum buffer, imindex buffer
194                 //execute kernel
195                 kern->execute(queue, 2, local_threads, global_threads);
196                 int status = clFinish(queue);
197                 if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
198                     return false;
199                 gpu_time += kern->exec_time();
200                 //clear render kernel args so it can reset em on next execution
201                 kern->clear_args();
202                 //write info to disk
203             }
204             else if (i == NORMALIZE_PRE)
205             {
206                 blk_info->write_to_buffer((queue));
207 
208                 std::size_t lt[1], gt[1];
209                 lt[0] = 64;
210                 gt[0] = RoundUp(info_buffer->data_buffer_length, lt[0]);
211 
212                 kern->set_arg(blk_info);
213                 kern->set_arg(aux0);
214                 kern->set_arg(aux2);
215                 kern->set_arg(aux3);
216                 kern->execute(queue, 1, lt, gt);
217                 int status = clFinish(queue);
218                 if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
219                     return false;
220                 gpu_time += kern->exec_time();
221                 aux2->read_to_buffer(queue);
222                 aux3->read_to_buffer(queue);
223                 //clear render kernel args so it can reset em on next execution
224                 kern->clear_args();
225                 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("seglen_h"), false);
226                 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), false);
227                 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pre_h"), true);
228                 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX2>::prefix("vis_h"), true);
229             }
230             //read image out to buffer (from gpu)
231             vis_image->read_to_buffer(queue);
232             pre_image->read_to_buffer(queue);
233             clFinish(queue);
234         }
235     }
236 
237     delete[] vis_buff;
238     delete[] pre_buff;
239     delete[] ray_origins;
240     delete[] ray_directions;
241     //opencl_cache->unref_mem(hmean_image.ptr());
242     //opencl_cache->unref_mem(hvar_image.ptr());
243     opencl_cache->unref_mem(vis_image.ptr());
244     opencl_cache->unref_mem(pre_image.ptr());
245     opencl_cache->unref_mem(ray_o_buff.ptr());
246     opencl_cache->unref_mem(ray_d_buff.ptr());
247     opencl_cache->unref_mem(tnearfar_mem_ptr.ptr());
248     std::cout << "Gpu time " << gpu_time << " transfer time " << transfer_time << std::endl;
249     clReleaseCommandQueue(queue);
250     return true;
251 }
252 
253 //Height-map "post" pass: runs the post and normalize-post kernels on every block
update_post(const boxm2_scene_sptr & scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,const vil_image_view_base_sptr & ximg,const vil_image_view_base_sptr & yimg,float,float)254 bool boxm2_ocl_compute_heightmap_pre_post::update_post(const boxm2_scene_sptr&         scene,
255     const bocl_device_sptr&         device,
256     const boxm2_opencl_cache_sptr&  opencl_cache,
257     const vil_image_view_base_sptr& ximg,
258     const vil_image_view_base_sptr& yimg,
259     float  /*resnearfactor*/,
260     float  /*resfarfactor*/)
261 {
262     enum {
263         UPDATE_POST = 0,
264         NORMALIZE_POST = 1
265     };
266     float transfer_time = 0.0f;
267     float gpu_time = 0.0f;
268     std::size_t local_threads[2] = { 8, 8 };
269     std::size_t global_threads[2] = { 8, 8 };
270 
271     //cache size sanity check
272     std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
273     std::cout << "Update MBs in cache: " << binCache / (1024.0*1024.0) << std::endl;
274 
275     //make correct data types are here
276     std::string data_type, num_obs_type, options;
277 
278     // create a command queue.
279     int status = 0;
280     cl_command_queue queue = clCreateCommandQueue(device->context(),
281         *(device->device_id()),
282         CL_QUEUE_PROFILING_ENABLE,
283         &status);
284     if (status != 0)
285         return false;
286 
287     auto* ximg_view = static_cast<vil_image_view<float>*>(ximg.ptr());
288     auto* yimg_view = static_cast<vil_image_view<float>*>(yimg.ptr());
289     auto cl_ni = (unsigned)RoundUp(ximg->ni(), (int)local_threads[0]);
290     auto cl_nj = (unsigned)RoundUp(ximg->nj(), (int)local_threads[1]);
291     global_threads[0] = cl_ni;
292     global_threads[1] = cl_nj;
293     //set generic cam
294     auto* ray_origins = new cl_float[4 * cl_ni*cl_nj];
295     auto* ray_directions = new cl_float[4 * cl_ni*cl_nj];
296     vgl_box_3d<double> bbox = scene->bounding_box();
297     float z = bbox.max_z();
298     int count = 0;
299     for (unsigned int j = 0; j < cl_nj; ++j) {
300         for (unsigned int i = 0; i < cl_ni; ++i) {
301             if (i < ximg->ni() && j < ximg->nj())
302             {
303                 ray_origins[count * 4 + 0] = (*ximg_view)(i, j);
304                 ray_origins[count * 4 + 1] = (*yimg_view)(i, j);
305                 ray_origins[count * 4 + 2] = z + 1.0f;
306                 ray_origins[count * 4 + 3] = 0.0f;
307                 ray_directions[count * 4 + 0] = 0.0;
308                 ray_directions[count * 4 + 1] = 0.0;
309                 ray_directions[count * 4 + 2] = -1.0;
310                 ray_directions[count * 4 + 3] = 0.0f;
311             }
312             ++count;
313         }
314     }
315     bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_origins, "ray_origins buffer");
316     ray_o_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
317     bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_directions, "ray_directions buffer");
318     ray_d_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
319     float tnearfar[2] = { 0.0f, 1000000 };
320     bocl_mem_sptr tnearfar_mem_ptr = opencl_cache->alloc_mem(2 * sizeof(float), tnearfar, "tnearfar  buffer");
321     tnearfar_mem_ptr->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
322     //Visibility, Preinf, Norm, and input image buffers
323     auto* vis_buff = new float[cl_ni*cl_nj];
324     auto* post_buff = new float[cl_ni*cl_nj];
325 
326     for (unsigned i = 0; i < cl_ni*cl_nj; i++)
327     {
328         vis_buff[i] = 1.0f;
329         post_buff[i] = 0.0f;
330     }
331 
332 
333     bocl_mem_sptr vis_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), vis_buff, "vis image buffer");
334     vis_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
335     bocl_mem_sptr post_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), post_buff, "pre image buffer");
336     post_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
337 
338     // Image Dimensions
339     int img_dim_buff[4];
340     img_dim_buff[0] = 0;
341     img_dim_buff[1] = 0;
342     img_dim_buff[2] = ximg->ni();
343     img_dim_buff[3] = ximg->nj();
344 
345     bocl_mem_sptr img_dim = new bocl_mem(device->context(), img_dim_buff, sizeof(int) * 4, "image dims");
346     img_dim->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
347 
348     // Output Array
349     float output_arr[100];
350     for (float & i : output_arr) i = 0.0f;
351     bocl_mem_sptr  cl_output = new bocl_mem(device->context(), output_arr, sizeof(float) * 100, "output buffer");
352     cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
353 
354     // bit lookup buffer
355     cl_uchar lookup_arr[256];
356     boxm2_ocl_util::set_bit_lookup(lookup_arr);
357     bocl_mem_sptr lookup = new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar) * 256, "bit lookup buffer");
358     lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
359 
360     // compile the kernel if not already compiled
361     std::vector<bocl_kernel*>& kernels = get_post_kernels(device, options);
362     // set arguments
363     std::vector<boxm2_block_id> vis_order;
364     vis_order = scene->get_block_ids();
365     std::vector<boxm2_block_id>::iterator id;
366     for (unsigned int i = 0; i < kernels.size(); ++i)
367     {
368         bocl_kernel* kern = kernels[i];
369         for (id = vis_order.begin(); id != vis_order.end(); ++id)
370         {
371             //choose correct render kernel
372             boxm2_block_metadata mdata = scene->get_block_metadata(*id);
373             //write the image values to the buffer
374             vul_timer transfer;
375             bocl_mem* blk = opencl_cache->get_block(scene, *id);
376             bocl_mem* blk_info = opencl_cache->loaded_block_info();
377             bocl_mem* alpha = opencl_cache->get_data<BOXM2_ALPHA>(scene, *id, 0, false);
378             auto* info_buffer = (boxm2_scene_info*)blk_info->cpu_buffer();
379             int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
380             info_buffer->data_buffer_length = (int)(alpha->num_bytes() / alphaTypeSize);
381             // data type string may contain an identifier so determine the buffer size
382             //grab an appropriately sized AUX data buffer
383             bocl_mem *aux0 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("post_seglen_h"), 0, false);
384             bocl_mem *aux1 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("post_h"), 0, false);
385             bocl_mem *aux1_ph_smooth = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), 0, false);
386             transfer_time += (float)transfer.all();
387 
388             if (i == UPDATE_POST)
389             {
390                 aux0->zero_gpu_buffer(queue);
391                 aux1->zero_gpu_buffer(queue);
392                 kern->set_arg(blk_info);
393                 kern->set_arg(blk);
394                 kern->set_arg(alpha);
395                 kern->set_arg(aux0);
396                 kern->set_arg(aux1_ph_smooth);
397                 kern->set_arg(aux1);
398                 kern->set_arg(lookup.ptr());
399                 kern->set_arg(ray_o_buff.ptr());
400                 kern->set_arg(ray_d_buff.ptr());
401                 kern->set_arg(tnearfar_mem_ptr.ptr());
402                 kern->set_arg(img_dim.ptr());
403                 kern->set_arg(vis_image.ptr());
404                 kern->set_arg(post_image.ptr());
405                 kern->set_arg(cl_output.ptr());
406                 kern->set_local_arg(local_threads[0] * local_threads[1] * sizeof(cl_uchar16));//local tree,
407                 kern->set_local_arg(local_threads[0] * local_threads[1] * 10 * sizeof(cl_uchar)); //cumsum buffer, imindex buffer
408                 //execute kernel
409                 kern->execute(queue, 2, local_threads, global_threads);
410                 int status = clFinish(queue);
411                 if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
412                     return false;
413                 gpu_time += kern->exec_time();
414                 clFinish(queue);
415                 //clear render kernel args so it can reset em on next execution
416                 kern->clear_args();
417                 //write info to disk
418 
419             }
420             else if (i == NORMALIZE_POST)
421             {
422                 blk_info->write_to_buffer((queue));
423 
424                 std::size_t lt[1], gt[1];
425                 lt[0] = 64;
426                 gt[0] = RoundUp(info_buffer->data_buffer_length, lt[0]);
427 
428                 kern->set_arg(blk_info);
429                 kern->set_arg(aux0);
430                 kern->set_arg(aux1);
431                 kern->execute(queue, 1, lt, gt);
432                 int status = clFinish(queue);
433                 if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
434                     return false;
435                 gpu_time += kern->exec_time();
436                 aux1->read_to_buffer(queue);
437 
438                 //clear render kernel args so it can reset em on next execution
439                 kern->clear_args();
440                 aux1->read_to_buffer(queue);
441                 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("post_seglen_h"), false);
442                 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("post_h"), true);
443                 opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), false);
444 
445             }
446             //read image out to buffer (from gpu)
447             vis_image->read_to_buffer(queue);
448             post_image->read_to_buffer(queue);
449             cl_output->read_to_buffer(queue);
450             clFinish(queue);
451         }
452     }
453     delete[] vis_buff;
454     delete[] post_buff;
455     delete[] ray_origins;
456     delete[] ray_directions;
457 
458     opencl_cache->unref_mem(vis_image.ptr());
459     opencl_cache->unref_mem(post_image.ptr());
460     opencl_cache->unref_mem(ray_o_buff.ptr());
461     opencl_cache->unref_mem(ray_d_buff.ptr());
462     opencl_cache->unref_mem(tnearfar_mem_ptr.ptr());
463     std::cout << "Gpu time " << gpu_time << " transfer time " << transfer_time << std::endl;
464     clReleaseCommandQueue(queue);
465 
466     return true;
467 }
468 
469 //Convenience entry point: smooths the height-map data, then runs the pre and post passes
compute_pre_post(const boxm2_scene_sptr & scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,const vil_image_view_base_sptr & hmap_mean,const vil_image_view_base_sptr & hmap_var,const vil_image_view_base_sptr & ximg,const vil_image_view_base_sptr & yimg,int smoothingradius,float resnearfactor,float resfarfactor)470 bool boxm2_ocl_compute_heightmap_pre_post::compute_pre_post(const boxm2_scene_sptr&         scene,
471     const bocl_device_sptr&         device,
472     const boxm2_opencl_cache_sptr&  opencl_cache,
473     const vil_image_view_base_sptr& hmap_mean,
474     const vil_image_view_base_sptr& hmap_var,
475     const vil_image_view_base_sptr& ximg,
476     const vil_image_view_base_sptr& yimg,
477     int smoothingradius,
478     float resnearfactor,
479     float resfarfactor)
480 {
481     boxm2_ocl_smooth_heightmap_pdata::compute_smooth_heightmap_pdata(scene, device, opencl_cache, hmap_mean, hmap_var, ximg, yimg, smoothingradius);
482     boxm2_ocl_compute_heightmap_pre_post::update_pre(scene, device, opencl_cache, ximg, yimg, resnearfactor, resfarfactor);
483     boxm2_ocl_compute_heightmap_pre_post::update_post(scene, device, opencl_cache, ximg, yimg, resnearfactor, resfarfactor);
484     return true;
485 }
update_heightmap_factor(const boxm2_scene_sptr & scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,bool add)486 bool boxm2_ocl_update_heightmap_factor::update_heightmap_factor(const boxm2_scene_sptr&         scene,
487                                                                 const bocl_device_sptr&         device,
488                                                                 const boxm2_opencl_cache_sptr&  opencl_cache,
489                                                                 bool add)
490 {
491     float transfer_time = 0.0f;
492     float gpu_time = 0.0f;
493     std::size_t local_threads[1] = { 64 };
494     std::size_t global_threads[1] = { 64 };
495     //cache size sanity check
496     std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
497     std::cout << "Update MBs in cache: " << binCache / (1024.0*1024.0) << std::endl;
498 
499     //make correct data types are here
500     std::string data_type, num_obs_type, options;
501     int does_add_buf = add ? 1 : 0;
502     bocl_mem_sptr does_add = new bocl_mem(device->context(), &does_add_buf, sizeof(int) * 1, "add (1) or subtract (0)");
503     does_add->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
504 
505     // create a command queue.
506     int status = 0;
507     cl_command_queue queue = clCreateCommandQueue(device->context(),*(device->device_id()),CL_QUEUE_PROFILING_ENABLE,&status);
508     if (status != 0)
509         return false;
510     // compile the kernel if not already compiled
511     bocl_kernel * kern = get_update_heightmap_factor_kernels(device, options)[0];
512     std::vector<boxm2_block_id> blks_order;
513     blks_order = scene->get_block_ids();
514     std::vector<boxm2_block_id>::iterator  id;
515 
516     for (id = blks_order.begin(); id != blks_order.end(); ++id)
517     {
518         //choose correct render kernel
519         boxm2_block_metadata mdata = scene->get_block_metadata(*id);
520         //write the image values to the buffer
521         vul_timer transfer;
522         bocl_mem* blk = opencl_cache->get_block(scene, *id);
523         bocl_mem* blk_info = opencl_cache->loaded_block_info();
524         bocl_mem* alpha = opencl_cache->get_data<BOXM2_ALPHA>(scene, *id, 0, false);
525         auto* info_buffer = (boxm2_scene_info*)blk_info->cpu_buffer();
526         int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
527         info_buffer->data_buffer_length = (int)(alpha->num_bytes() / alphaTypeSize);
528         blk_info->write_to_buffer((queue));
529         local_threads[0] = 64;
530         global_threads[0] = RoundUp(info_buffer->data_buffer_length, local_threads[0]);
531         //grab an appropriately sized AUX data buffer
532         bocl_mem *aux1_pre = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pre_h"), 0, true);
533         bocl_mem *aux2_pre = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX2>::prefix("vis_h"), 0, true);
534         bocl_mem *aux1 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("post_h"), 0, false);
535         bocl_mem *aux1_ph_smooth = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), 0, false);
536         bocl_mem *aux0_hf = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("hf"), 0, false);
537 
538         transfer_time += (float)transfer.all();
539         kern->set_arg(blk_info);
540         kern->set_arg(does_add.ptr());
541         kern->set_arg(aux1_pre);
542         kern->set_arg(aux2_pre);
543         kern->set_arg(aux1);
544         kern->set_arg(aux1_ph_smooth);
545         kern->set_arg(aux0_hf);
546         //execute kernel
547         kern->execute(queue, 1, local_threads, global_threads);
548         int status = clFinish(queue);
549         if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
550             return false;
551         gpu_time += kern->exec_time();
552         //clear render kernel args so it can reset em on next execution
553         aux0_hf->read_to_buffer(queue);
554         kern->clear_args();
555         opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pre_h"), false);
556         opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX2>::prefix("vis_h"), false);
557         opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX2>::prefix("post_h"), false);
558         opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX3>::prefix("post_h"), false);
559         opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("hf"), true);
560     }
561     clFinish(queue);
562     opencl_cache->unref_mem(does_add.ptr());
563     return true;
564 }
565 
566 //Returns the vector of height-map "pre" kernels (compiled once and cached per device identifier + options)
get_pre_kernels(const bocl_device_sptr & device,const std::string & opts)567 std::vector<bocl_kernel*>& boxm2_ocl_compute_heightmap_pre_post::get_pre_kernels(const bocl_device_sptr& device, const std::string& opts)
568 {
569     // compile kernels if not already compiled
570     std::string identifier = device->device_identifier() + opts;
571     if (pre_kernels_.find(identifier) != pre_kernels_.end())
572         return pre_kernels_[identifier];
573 
574     //otherwise compile the kernels
575     std::cout << "=== boxm2_ocl_update_process::compiling kernels on device " << identifier << "===" << std::endl;
576     std::vector<std::string> src_paths;
577     std::string source_dir = boxm2_ocl_util::ocl_src_root();
578     src_paths.push_back(source_dir + "scene_info.cl");
579     src_paths.push_back(source_dir + "pixel_conversion.cl");
580     src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
581     src_paths.push_back(source_dir + "backproject.cl");
582     src_paths.push_back(source_dir + "atomics_util.cl");
583     src_paths.push_back(source_dir + "statistics_library_functions.cl");
584     src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
585     src_paths.push_back(source_dir + "bit/update_bp_kernels.cl");
586     std::vector<std::string> non_ray_src = std::vector<std::string>(src_paths);
587     src_paths.push_back(source_dir + "update_functors.cl");
588     src_paths.push_back(source_dir + "bit/cast_ray_bit.cl");
589 
590     //compilation options
591     std::string options = "-D ATOMIC_FLOAT ";
592     //populate vector of kernels
593     std::vector<bocl_kernel*> vec_kernels;
594 
595     auto* pre = new bocl_kernel();
596     std::string pre_opts = options + " -D PRE_HMAP_CELL  -D STEP_CELL=step_cell_pre_hmap(aux_args,data_ptr,llid,d)";
597     pre->create_kernel(&device->context(), device->device_id(), src_paths, "pre_hmap_main", pre_opts, "update::pre_hmap_main");
598     vec_kernels.push_back(pre);
599     auto* normalize_pre = new bocl_kernel();
600     std::string normalize_pre_opts = options + " -D PRE_HMAP_CELL ";
601     normalize_pre->create_kernel(&device->context(), device->device_id(), src_paths, "normalize_prehmap_main", pre_opts, "update::normalize_prehmap_main");
602     vec_kernels.push_back(normalize_pre);
603     //store and return
604     pre_kernels_[identifier] = vec_kernels;
605     return pre_kernels_[identifier];
606 }
607 
get_post_kernels(const bocl_device_sptr & device,const std::string & opts)608 std::vector<bocl_kernel*>& boxm2_ocl_compute_heightmap_pre_post::get_post_kernels(const bocl_device_sptr& device, const std::string& opts)
609 {
610     // compile kernels if not already compiled
611     std::string identifier = device->device_identifier() + opts;
612     if (post_kernels_.find(identifier) != post_kernels_.end())
613         return post_kernels_[identifier];
614 
615     //otherwise compile the kernels
616     std::cout << "=== boxm2_ocl_update_process::compiling kernels on device " << identifier << "===" << std::endl;
617     std::vector<std::string> src_paths;
618     std::string source_dir = boxm2_ocl_util::ocl_src_root();
619     src_paths.push_back(source_dir + "scene_info.cl");
620     src_paths.push_back(source_dir + "pixel_conversion.cl");
621     src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
622     src_paths.push_back(source_dir + "backproject.cl");
623     src_paths.push_back(source_dir + "atomics_util.cl");
624     src_paths.push_back(source_dir + "statistics_library_functions.cl");
625     src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
626     src_paths.push_back(source_dir + "bit/update_bp_kernels.cl");
627     std::vector<std::string> non_ray_src = std::vector<std::string>(src_paths);
628     src_paths.push_back(source_dir + "bit/cast_ray_bit.cl");
629 
630     //compilation options
631     std::string options = "-D ATOMIC_FLOAT  -D REVERSE";
632     //populate vector of kernels
633     std::vector<bocl_kernel*> vec_kernels;
634     //seg len pass
635 
636     auto* post = new bocl_kernel();
637     std::string post_opts = options + " -D POST_HMAP_CELL  -D STEP_CELL=step_cell_post_hmap(aux_args,data_ptr,llid,d)";
638     post->create_kernel(&device->context(), device->device_id(), src_paths, "post_hmap_main", post_opts, "update::post_Cell");
639     vec_kernels.push_back(post);
640 
641     auto* normalize_post = new bocl_kernel();
642     std::string normalize_post_opts = options + " -D NORMALIZE_POST_CELL ";
643     normalize_post->create_kernel(&device->context(), device->device_id(), non_ray_src, "normalize_post_cell", normalize_post_opts, "update::normalize_post_cell");
644     vec_kernels.push_back(normalize_post);
645     //store and return
646     post_kernels_[identifier] = vec_kernels;
647     return post_kernels_[identifier];
648 }
649 
get_update_heightmap_factor_kernels(const bocl_device_sptr & device,const std::string & opts)650 std::vector<bocl_kernel*>& boxm2_ocl_update_heightmap_factor::get_update_heightmap_factor_kernels(const bocl_device_sptr& device, const std::string& opts)
651 {
652     // compile kernels if not already compiled
653     std::string identifier = device->device_identifier() + opts;
654     if (update_heightmap_factor_kernels_.find(identifier) != update_heightmap_factor_kernels_.end())
655         return update_heightmap_factor_kernels_[identifier];
656 
657     //otherwise compile the kernels
658     std::cout << "=== boxm2_ocl_update_process::compiling kernels on device " << identifier << "===" << std::endl;
659     std::vector<std::string> src_paths;
660     std::string source_dir = boxm2_ocl_util::ocl_src_root();
661     src_paths.push_back(source_dir + "scene_info.cl");
662     src_paths.push_back(source_dir + "pixel_conversion.cl");
663     src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
664     src_paths.push_back(source_dir + "backproject.cl");
665     src_paths.push_back(source_dir + "atomics_util.cl");
666     src_paths.push_back(source_dir + "statistics_library_functions.cl");
667     src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
668     src_paths.push_back(source_dir + "bit/update_bp_kernels.cl");
669 
670     //compilation options
671     std::string options = "-D ATOMIC_FLOAT -D ADD_SUBTRACT_FACTOR";
672     //populate vector of kernels
673     std::vector<bocl_kernel*> vec_kernels;
674     auto* computez = new bocl_kernel();
675     const std::string& computez_opts = options;
676     computez->create_kernel(&device->context(), device->device_id(), src_paths, "add_subtract_factor_main", computez_opts, "update::add_subtract_factor_main");
677     vec_kernels.push_back(computez);
678 
679     //store and return
680     update_heightmap_factor_kernels_[identifier] = vec_kernels;
681     return update_heightmap_factor_kernels_[identifier];
682 }
683 
684 
685 bool boxm2_ocl_smooth_heightmap_pdata::
compute_smooth_heightmap_pdata(boxm2_scene_sptr scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,const vil_image_view_base_sptr & hmap_mean,const vil_image_view_base_sptr & hmap_var,const vil_image_view_base_sptr & ximg,const vil_image_view_base_sptr & yimg,int smoothingradius=16,float,float)686 compute_smooth_heightmap_pdata(boxm2_scene_sptr         scene,
687                                 const bocl_device_sptr&         device,
688                                 const boxm2_opencl_cache_sptr&  opencl_cache,
689                                 const vil_image_view_base_sptr& hmap_mean,
690                                 const vil_image_view_base_sptr& hmap_var,
691                                 const vil_image_view_base_sptr& ximg,
692                                 const vil_image_view_base_sptr& yimg,
693                                 int smoothingradius = 16,
694                                 float  /*resnearfactor*/,
695                                 float  /*resfarfactor*/)
696 {
697 
698     float transfer_time = 0.0f;
699     float gpu_time = 0.0f;
700     std::size_t local_threads[2] = { 8, 8 };
701     std::size_t global_threads[2] = { 8, 8 };
702 
703     //cache size sanity check
704     std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
705     std::cout << "Update MBs in cache: " << binCache / (1024.0*1024.0) << std::endl;
706 
707     //make correct data types are here
708     std::string data_type, num_obs_type, options;
709 
710     // create a command queue.
711     int status = 0;
712     cl_command_queue queue = clCreateCommandQueue(device->context(),
713         *(device->device_id()),
714         CL_QUEUE_PROFILING_ENABLE,
715         &status);
716     if (status != 0)
717         return false;
718 
719     //grab input image, establish cl_ni, cl_nj (so global size is divisible by local size)
720 
721     auto* hmap_mean_view = static_cast<vil_image_view<float>*>(hmap_mean.ptr());
722     auto* hmap_var_view = static_cast<vil_image_view<float>*>(hmap_var.ptr());
723     auto* ximg_view = static_cast<vil_image_view<float>*>(ximg.ptr());
724     auto* yimg_view = static_cast<vil_image_view<float>*>(yimg.ptr());
725 
726     auto cl_ni = (unsigned)RoundUp(hmap_mean_view->ni(), (int)local_threads[0]);
727     auto cl_nj = (unsigned)RoundUp(hmap_mean_view->nj(), (int)local_threads[1]);
728     global_threads[0] = cl_ni;
729     global_threads[1] = cl_nj;
730     //set generic cam
731     auto* ray_origins = new cl_float[4 * cl_ni*cl_nj];
732     auto* ray_directions = new cl_float[4 * cl_ni*cl_nj];
733     vgl_box_3d<double> bbox = scene->bounding_box();
734     float z = bbox.max_z();
735     int count = 0;
736     for (unsigned int j = 0; j < cl_nj; ++j) {
737         for (unsigned int i = 0; i < cl_ni; ++i) {
738             if (i < hmap_mean_view->ni() && j < hmap_mean_view->nj())
739             {
740                 ray_origins[count * 4 + 0] = (*ximg_view)(i, j);
741                 ray_origins[count * 4 + 1] = (*yimg_view)(i, j);
742                 ray_origins[count * 4 + 2] = z + 1.0f;
743                 ray_origins[count * 4 + 3] = 0.0f;
744                 ray_directions[count * 4 + 0] = 0.0;
745                 ray_directions[count * 4 + 1] = 0.0;
746                 ray_directions[count * 4 + 2] = -1.0;
747                 ray_directions[count * 4 + 3] = 0.0f;
748             }
749             ++count;
750         }
751     }
752     bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_origins, "ray_origins buffer");
753     ray_o_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
754     bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_directions, "ray_directions buffer");
755     ray_d_buff->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
756     float tnearfar[2] = { 0.0f, 1000000 };
757     bocl_mem_sptr tnearfar_mem_ptr = opencl_cache->alloc_mem(2 * sizeof(float), tnearfar, "tnearfar  buffer");
758     tnearfar_mem_ptr->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
759     //Visibility, Preinf, Norm, and input image buffers
760     auto* hmean_buff = new float[cl_ni*cl_nj];
761     auto* hvar_buff = new float[cl_ni*cl_nj];
762 
763     //copy input vals into image
764     count = 0;
765     for (unsigned int j = 0; j < cl_nj; ++j) {
766         for (unsigned int i = 0; i < cl_ni; ++i) {
767             hmean_buff[count] = 0.0f;
768             hvar_buff[count] = 0.0f;
769             if (i < hmap_mean_view->ni() && j < hmap_mean_view->nj())
770             {
771                 hmean_buff[count] = (*hmap_mean_view)(i, j);
772                 hvar_buff[count] = (*hmap_var_view)(i, j);
773             }
774             ++count;
775         }
776     }
777     bocl_mem_sptr hmean_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), hmean_buff, "input image buffer");
778     hmean_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
779     bocl_mem_sptr hvar_image = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(float), hvar_buff, "input image buffer");
780     hvar_image->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
781     // Image Dimensions
782     int img_dim_buff[4];
783     img_dim_buff[0] = 0;
784     img_dim_buff[1] = 0;
785     img_dim_buff[2] = hmap_mean_view->ni();
786     img_dim_buff[3] = hmap_mean_view->nj();
787 
788     bocl_mem_sptr img_dim = new bocl_mem(device->context(), img_dim_buff, sizeof(int) * 4, "image dims");
789     img_dim->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
790 
791     // Output Array
792     float output_arr[100];
793     for (float & i : output_arr) i = 0.0f;
794     bocl_mem_sptr  cl_output = new bocl_mem(device->context(), output_arr, sizeof(float) * 100, "output buffer");
795     cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
796 
797     // bit lookup buffer
798     cl_uchar lookup_arr[256];
799     boxm2_ocl_util::set_bit_lookup(lookup_arr);
800     bocl_mem_sptr lookup = new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar) * 256, "bit lookup buffer");
801     lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
802     //: avg the Pheight with the neighbors
803     vnl_random rand;
804     int numsamples = 16;
805     int rad = smoothingradius;
806     auto * weights = new float[numsamples];
807     int * pts = new int[2 * numsamples];
808     for (int i = 0; i < numsamples;)
809     {
810         int x = -rad + rand.lrand32(0, 2 * rad + 1);
811         int y = -rad + rand.lrand32(0, 2 * rad + 1);
812         if (x == 0 && y == 0) x = 1;
813         pts[2 * i] = x;
814         pts[2 * i + 1] = y;
815         weights[i] = 1 - std::sqrt(float(x*x + y*y)) / (float(rad));
816         i++;
817     }
818     typedef vnl_vector_fixed<unsigned char, 16> uchar16;
819     // compile the kernel if not already compiled
820     std::vector<bocl_kernel*>& kernels = get_smooth_heightmap_pdata_kernels(device, options);
821     // set arguments
822     std::vector<boxm2_block_id> vis_order;
823     vis_order = scene->get_block_ids();
824     std::vector<boxm2_block_id>::iterator id;
825     bocl_kernel* kern = kernels[0];
826     for (id = vis_order.begin(); id != vis_order.end(); ++id)
827     {
828         //choose correct render kernel
829         boxm2_block_metadata mdata = scene->get_block_metadata(*id);
830         //write the image values to the buffer
831         vul_timer transfer;
832         bocl_mem* blk = opencl_cache->get_block(scene, *id);
833         bocl_mem* blk_info = opencl_cache->loaded_block_info();
834         //grab an appropriately sized AUX data buffer
835         bocl_mem *aux0 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("length"), 0, false);
836         bocl_mem *aux1 = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight"), 0, false);
837 
838         transfer_time += (float)transfer.all();
839         aux0->zero_gpu_buffer(queue);
840         aux1->zero_gpu_buffer(queue);
841         kern->set_arg(blk_info);
842         kern->set_arg(blk);
843         kern->set_arg(aux0);
844         kern->set_arg(aux1);
845         kern->set_arg(lookup.ptr());
846         kern->set_arg(ray_o_buff.ptr());
847         kern->set_arg(ray_d_buff.ptr());
848         kern->set_arg(tnearfar_mem_ptr.ptr());
849         kern->set_arg(img_dim.ptr());
850         kern->set_arg(hmean_image.ptr());
851         kern->set_arg(hvar_image.ptr());
852         kern->set_arg(cl_output.ptr());
853         kern->set_local_arg(local_threads[0] * local_threads[1] * sizeof(cl_uchar16));
854         kern->set_local_arg(local_threads[0] * local_threads[1] * 10 * sizeof(cl_uchar));
855         //execute kernel
856         kern->execute(queue, 2, local_threads, global_threads);
857         int status = clFinish(queue);
858         if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
859             return false;
860         gpu_time += kern->exec_time();
861         //clear render kernel args so it can reset em on next execution
862         kern->clear_args();
863         aux0->read_to_buffer(queue);
864         aux1->read_to_buffer(queue);
865        // boxm2_block_metadata data = scene->get_block_metadata(*id);
866         boxm2_block * cblk = opencl_cache->get_cpu_cache()->get_block(scene, *id);
867         //get data from cache
868         boxm2_data_base * aux0_len_b = opencl_cache->get_cpu_cache()->get_data_base(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("length"));
869         boxm2_data_base * aux1_ph_b = opencl_cache->get_cpu_cache()->get_data_base(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight"));
870         boxm2_data_base * aux1_ph_smooth_b = opencl_cache->get_cpu_cache()->get_data_base(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), 0, false);
871         //3d array of trees
872         const boxm2_array_3d<uchar16>& trees = cblk->trees();
873         auto * aux0_len = (boxm2_data_traits<BOXM2_AUX0>::datatype*) aux0_len_b->data_buffer();
874         auto * aux1_ph = (boxm2_data_traits<BOXM2_AUX1>::datatype*) aux1_ph_b->data_buffer();
875         auto * aux1_ph_smooth = (boxm2_data_traits<BOXM2_AUX1>::datatype*) aux1_ph_smooth_b->data_buffer();
876 
877         //iterate through each tree
878         for (unsigned int x = 0; x < trees.get_row1_count(); ++x) {
879             for (unsigned int y = 0; y < trees.get_row2_count(); ++y) {
880                 for (unsigned int z = 0; z < trees.get_row3_count(); ++z) {
881                     //load current block/tree
882                     uchar16 tree = trees(x, y, z);
883                     boct_bit_tree bit_tree((unsigned char*)tree.data_block(), mdata.max_level_);
884                     //iterate through leaves of the tree
885                     std::vector<int> leafBits = bit_tree.get_leaf_bits(0);
886                     std::vector<int>::iterator iter;
887                     for (iter = leafBits.begin(); iter != leafBits.end(); ++iter)
888                     {
889                         int currIdx = bit_tree.get_data_index((*iter)); //data index
890                         vgl_point_3d<double> localCenter = bit_tree.cell_center((*iter));
891                         float px = (localCenter.x() + x)*mdata.sub_block_dim_.x() + mdata.local_origin_.x();
892                         float py = (localCenter.y() + y)*mdata.sub_block_dim_.y() + mdata.local_origin_.y();
893                         float pz = (localCenter.z() + z)*mdata.sub_block_dim_.z() + mdata.local_origin_.z();
894                         double sumweight = 1.0;
895                         if (aux0_len[currIdx] > 1e-10)
896                             aux1_ph_smooth[currIdx] = aux1_ph[currIdx] / aux0_len[currIdx];
897                         else
898                             aux1_ph_smooth[currIdx] = 0.0;
899                         //: iterate over neighbors
900                         for (size_t k = 0; k < numsamples; k++)
901                         {
902                             vgl_point_3d<double> neighbor_pt(px + pts[2 * k] * mdata.sub_block_dim_.x(),
903                                                              py + pts[2 * k + 1] * mdata.sub_block_dim_.y(), pz);
904                             unsigned int data_index = 0;
905                             if (cblk->data_index(neighbor_pt, data_index))
906                             {
907                                 if (aux0_len[data_index] > 1e-10)
908                                 {
909                                     aux1_ph_smooth[currIdx] +=  (aux1_ph[data_index] / aux0_len[data_index]);
910                                     sumweight += weights[k];
911                                 }
912                             }
913                         }
914                         aux1_ph_smooth[currIdx] = aux1_ph_smooth[currIdx] / sumweight;
915                     }
916                 }
917             }
918         }
919         opencl_cache->get_cpu_cache()->remove_data_base(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight_smooth"), true);
920         opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX1>::prefix("pheight"), false);
921         opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("length"), false);
922     }
923     clFinish(queue);
924 
925     delete[] hmean_buff;
926     delete[] hvar_buff;
927     delete[] ray_origins;
928     delete[] ray_directions;
929     opencl_cache->unref_mem(hmean_image.ptr());
930     opencl_cache->unref_mem(hvar_image.ptr());
931     opencl_cache->unref_mem(ray_o_buff.ptr());
932     opencl_cache->unref_mem(ray_d_buff.ptr());
933     opencl_cache->unref_mem(tnearfar_mem_ptr.ptr());
934     std::cout << "Gpu time " << gpu_time << " transfer time " << transfer_time << std::endl;
935     clReleaseCommandQueue(queue);
936 
937     delete[] weights;
938     delete[] pts;
939     return true;
940 }
941 std::vector<bocl_kernel*>& boxm2_ocl_smooth_heightmap_pdata::
get_smooth_heightmap_pdata_kernels(const bocl_device_sptr & device,const std::string & opts)942 get_smooth_heightmap_pdata_kernels(const bocl_device_sptr& device, const std::string& opts)
943 {
944     // compile kernels if not already compiled
945     std::string identifier = device->device_identifier() + opts;
946     if (smooth_heightmap_pdata_kernels_.find(identifier) != smooth_heightmap_pdata_kernels_.end())
947         return smooth_heightmap_pdata_kernels_[identifier];
948 
949     //otherwise compile the kernels
950     std::cout << "=== boxm2_ocl_update_process::compiling kernels on device " << identifier << "===" << std::endl;
951     std::vector<std::string> src_paths;
952     std::string source_dir = boxm2_ocl_util::ocl_src_root();
953     src_paths.push_back(source_dir + "scene_info.cl");
954     src_paths.push_back(source_dir + "pixel_conversion.cl");
955     src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
956     src_paths.push_back(source_dir + "backproject.cl");
957     src_paths.push_back(source_dir + "atomics_util.cl");
958     src_paths.push_back(source_dir + "statistics_library_functions.cl");
959     src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
960     src_paths.push_back(source_dir + "bit/update_bp_kernels.cl");
961     std::vector<std::string> non_ray_src = std::vector<std::string>(src_paths);
962     src_paths.push_back(source_dir + "update_functors.cl");
963     src_paths.push_back(source_dir + "bit/cast_ray_bit.cl");
964 
965     //compilation options
966     std::string options = "-D ATOMIC_FLOAT ";
967     //populate vector of kernels
968     std::vector<bocl_kernel*> vec_kernels;
969     //seg len pass
970     auto* seg_len = new bocl_kernel();
971     std::string seg_opts = options + " -D HMAP_DENSITY_CELL  -D STEP_CELL=step_cell_hmap_density(aux_args,data_ptr,llid,d,tblock)";
972     seg_len->create_kernel(&device->context(), device->device_id(), src_paths, "compute_hmap_density_main", seg_opts, "update::hmap_density_main");
973     vec_kernels.push_back(seg_len);
974 
975     //store and return
976     smooth_heightmap_pdata_kernels_[identifier] = vec_kernels;
977     return smooth_heightmap_pdata_kernels_[identifier];
978 }
979