1 // This is brl/bseg/boxm2/ocl/algo/boxm2_ocl_fuse_factors.cxx
2 #include <fstream>
3 #include <iostream>
4 #include <algorithm>
5 #include "boxm2_ocl_fuse_factors.h"
6 //:
7 // \file
8 // \brief  A process for combining the factors
9 //
10 // \author Vishal Jain
11 // \date Nov 24, 2015
12 
13 #ifdef _MSC_VER
14 #  include "vcl_msvc_warnings.h"
15 #endif
16 #include <boxm2/ocl/boxm2_opencl_cache.h>
17 #include <boxm2/boxm2_scene.h>
18 #include <boxm2/boxm2_block.h>
19 #include <boxm2/boxm2_data_base.h>
20 #include <boxm2/ocl/boxm2_ocl_util.h>
21 #include <boxm2/boxm2_util.h>
22 
23 #include "vil/vil_image_view.h"
24 
25 //directory utility
26 #include "vul/vul_timer.h"
27 #include <vcl_where_root_dir.h>
28 #include <bocl/bocl_device.h>
29 #include <bocl/bocl_kernel.h>
30 
31 //: Map of kernels should persist between process executions
32 
// Definition of the static kernel cache. get_fuse_factors_kernels() keys it by
// device identifier + option string and stores the raw bocl_kernel pointers it
// allocates with new; nothing in this file erases entries or deletes kernels.
33 std::map<std::string, std::vector<bocl_kernel*> > boxm2_ocl_fuse_factors::fuse_factors_kernels_;
fuse_factors(const boxm2_scene_sptr & scene,const bocl_device_sptr & device,const boxm2_opencl_cache_sptr & opencl_cache,std::vector<std::string> factors_ident,std::vector<float> weights)34 bool boxm2_ocl_fuse_factors::fuse_factors(const boxm2_scene_sptr&         scene,
35                                           const bocl_device_sptr&         device,
36                                           const boxm2_opencl_cache_sptr&  opencl_cache,
37                                           std::vector<std::string>   factors_ident,
38                                           std::vector<float>   weights)
39 {
40     float transfer_time = 0.0f;
41     float gpu_time = 0.0f;
42     std::size_t local_threads[1] = { 64 };
43     std::size_t global_threads[1] = { 64 };
44     //cache size sanity check
45     std::size_t binCache = opencl_cache.ptr()->bytes_in_cache();
46     std::cout << "Update MBs in cache: " << binCache / (1024.0*1024.0) << std::endl;
47     // create a command queue.
48     int status = 0;
49     cl_command_queue queue = clCreateCommandQueue(device->context(),
50                                                     *(device->device_id()),
51                                                     CL_QUEUE_PROFILING_ENABLE,
52                                                     &status);
53     if (status != 0)
54         return false;
55     std::vector<boxm2_block_id> blks_order;
56     blks_order = scene->get_block_ids();
57     std::vector<boxm2_block_id>::iterator  id;
58     // compile the kernel if not already compiled
59     //: Initialize Cumulative factor
60     bocl_kernel * kern = get_fuse_factors_kernels(device)[0];
61     float weight_buf[] = { 0.0 };
62     bocl_mem * weight = new bocl_mem(device->context(), weight_buf, sizeof(float), "weight buffer");
63 
64     for (id = blks_order.begin(); id != blks_order.end(); ++id)
65     {
66         //choose correct render kernel
67         boxm2_block_metadata mdata = scene->get_block_metadata(*id);
68         //write the image values to the buffer
69         vul_timer transfer;
70         bocl_mem* blk = opencl_cache->get_block(scene, *id);
71         bocl_mem* blk_info = opencl_cache->loaded_block_info();
72         bocl_mem* alpha = opencl_cache->get_data<BOXM2_ALPHA>(scene, *id);
73         auto* info_buffer = (boxm2_scene_info*)blk_info->cpu_buffer();
74         int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
75         info_buffer->data_buffer_length = (int)(alpha->num_bytes() / alphaTypeSize);
76         blk_info->write_to_buffer((queue));
77         local_threads[0] = 64;
78         global_threads[0] = RoundUp(info_buffer->data_buffer_length, local_threads[0]);
79         //grab an appropriately sized AUX data buffer
80         bocl_mem *aux0_cum = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("cum"), 0, false);
81         bocl_mem* prob_init = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("prob_init"), 0, false);
82         transfer_time += (float)transfer.all();
83         kern->set_arg(blk_info);
84         kern->set_arg(prob_init);
85         kern->set_arg(aux0_cum);
86         //execute kernel
87         kern->execute(queue, 1, local_threads, global_threads);
88         int status = clFinish(queue);
89         if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
90             return false;
91         gpu_time += kern->exec_time();
92         //clear render kernel args so it can reset em on next execution
93         aux0_cum->read_to_buffer(queue);
94         kern->clear_args();
95         opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("prob_init"), true);
96     }
97     kern = get_fuse_factors_kernels(device)[1];
98     for (id = blks_order.begin(); id != blks_order.end(); ++id)
99     {
100         //choose correct render kernel
101         boxm2_block_metadata mdata = scene->get_block_metadata(*id);
102         //write the image values to the buffer
103         vul_timer transfer;
104         bocl_mem* blk = opencl_cache->get_block(scene, *id);
105         bocl_mem* blk_info = opencl_cache->loaded_block_info();
106         bocl_mem* alpha = opencl_cache->get_data<BOXM2_ALPHA>(scene, *id);
107         auto* info_buffer = (boxm2_scene_info*)blk_info->cpu_buffer();
108         int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
109         info_buffer->data_buffer_length = (int)(alpha->num_bytes() / alphaTypeSize);
110         blk_info->write_to_buffer((queue));
111         local_threads[0] = 64;
112         global_threads[0] = RoundUp(info_buffer->data_buffer_length, local_threads[0]);
113         //grab an appropriately sized AUX data buffer
114         bocl_mem *aux0_cum = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("cum"), 0, false);
115         weight->create_buffer(CL_MEM_READ_ONLY, queue);
116         for (unsigned int j = 0; j < factors_ident.size(); j++)
117         {
118 
119             weight_buf[0] = weights[j];
120             weight->write_to_buffer(queue);
121             bocl_mem *aux0_factor = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix(factors_ident[j]), 0, false);
122 
123             kern->set_arg(blk_info);
124             kern->set_arg(aux0_factor);
125             kern->set_arg(aux0_cum);
126             kern->set_arg(weight);
127             //execute kernel
128             kern->execute(queue, 1, local_threads, global_threads);
129             int status = clFinish(queue);
130             if (!check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status)))
131                 return false;
132             gpu_time += kern->exec_time();
133             aux0_cum->read_to_buffer(queue);
134             kern->clear_args();
135         }
136 
137     }
138     cl_uchar lookup_arr[256];
139     boxm2_ocl_util::set_bit_lookup(lookup_arr);
140     bocl_mem_sptr lookup = new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar) * 256, "bit lookup buffer");
141     lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
142     kern = get_fuse_factors_kernels(device)[2];
143     for (id = blks_order.begin(); id != blks_order.end(); ++id)
144     {
145         //choose correct render kernel
146         boxm2_block_metadata mdata = scene->get_block_metadata(*id);
147         //write the image values to the buffer
148         vul_timer transfer;
149         bocl_mem* blk = opencl_cache->get_block(scene, *id);
150         bocl_mem* blk_info = opencl_cache->loaded_block_info();
151         bocl_mem* alpha = opencl_cache->get_data<BOXM2_ALPHA>(scene, *id);
152         auto* info_buffer = (boxm2_scene_info*)blk_info->cpu_buffer();
153         int alphaTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
154         info_buffer->data_buffer_length = (int)(alpha->num_bytes() / alphaTypeSize);
155         blk_info->write_to_buffer((queue));
156         //grab an appropriately sized AUX data buffer
157         bocl_mem *aux0_cum = opencl_cache->get_data(scene, *id, boxm2_data_traits<BOXM2_AUX0>::prefix("cum"), 0, false);
158 
159         //set workspace
160         std::size_t ltr[] = { 4, 4, 4 };
161         std::size_t gtr[] = { RoundUp(mdata.sub_block_num_.x(), ltr[0]),
162                              RoundUp(mdata.sub_block_num_.y(), ltr[1]),
163                              RoundUp(mdata.sub_block_num_.z(), ltr[2]) };
164         kern->set_arg(blk_info);
165         kern->set_arg(blk);
166         kern->set_arg(alpha);
167         kern->set_arg(aux0_cum);
168         kern->set_arg(lookup.ptr());
169         kern->set_local_arg(ltr[0] * ltr[1] * ltr[2] * 10 * sizeof(cl_uchar));
170         kern->set_local_arg(ltr[0] * ltr[1] * ltr[2] * sizeof(cl_uchar16));
171 
172         //execute kernel
173         kern->execute(queue, 3, ltr, gtr);
174         int status = clFinish(queue);
175         check_val(status, MEM_FAILURE, "UPDATE EXECUTE FAILED: " + error_to_string(status));
176         gpu_time += kern->exec_time();
177 
178         alpha->read_to_buffer(queue);
179         kern->clear_args();
180         //opencl_cache->deep_remove_data(scene, *id, boxm2_data_traits<BOXM2_ALPHA>::prefix(), true);
181     }
182     clFinish(queue);
183 
184     return true;
185 }
186 
187 
188 
get_fuse_factors_kernels(const bocl_device_sptr & device,const std::string & opts)189 std::vector<bocl_kernel*>& boxm2_ocl_fuse_factors::get_fuse_factors_kernels(const bocl_device_sptr& device, const std::string& opts)
190 {
191     // compile kernels if not already compiled
192     std::string identifier = device->device_identifier() + opts;
193     if (fuse_factors_kernels_.find(identifier) != fuse_factors_kernels_.end())
194         return fuse_factors_kernels_[identifier];
195 
196     //otherwise compile the kernels
197     std::cout << "=== boxm2_ocl_update_process::compiling kernels on device " << identifier << "===" << std::endl;
198     std::vector<std::string> src_paths;
199     std::string source_dir = boxm2_ocl_util::ocl_src_root();
200     src_paths.push_back(source_dir + "scene_info.cl");
201     src_paths.push_back(source_dir + "pixel_conversion.cl");
202     src_paths.push_back(source_dir + "bit/bit_tree_library_functions.cl");
203     src_paths.push_back(source_dir + "backproject.cl");
204     src_paths.push_back(source_dir + "atomics_util.cl");
205     src_paths.push_back(source_dir + "statistics_library_functions.cl");
206     src_paths.push_back(source_dir + "ray_bundle_library_opt.cl");
207     src_paths.push_back(source_dir + "bit/update_kernels.cl");
208     src_paths.push_back(source_dir + "bit/update_bp_kernels.cl");
209     //populate vector of kernels
210     std::vector<bocl_kernel*> vec_kernels;
211 
212     //compilation options
213     std::string options = "-D INIT_CUM";
214     auto* init_cum = new bocl_kernel();
215     std::string init_cum_opts = options;
216     init_cum->create_kernel(&device->context(), device->device_id(), src_paths, "init_cum_main", init_cum_opts, "update::init_cum");
217     vec_kernels.push_back(init_cum);
218 
219     options = "-D FUSE_FACTORS";
220     auto* fusefactors = new bocl_kernel();
221     std::string fusefactors_opts = options;
222     fusefactors->create_kernel(&device->context(), device->device_id(), src_paths, "fuse_factors_main", fusefactors_opts, "update::fuse_factors");
223     vec_kernels.push_back(fusefactors);
224 
225     options = "-D EVALUATE_ALPHA";
226     auto* evaluate_alpha = new bocl_kernel();
227     std::string evaluate_alpha_opts = options;
228     evaluate_alpha->create_kernel(&device->context(), device->device_id(), src_paths, "evaluate_alpha_main", evaluate_alpha_opts, "update::evaluate_alpha");
229     vec_kernels.push_back(evaluate_alpha);
230     //store and return
231     fuse_factors_kernels_[identifier] = vec_kernels;
232     return fuse_factors_kernels_[identifier];
233 }
234