1 // Copyright (c) 2018, ETH Zurich and UNC Chapel Hill.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 //     * Redistributions of source code must retain the above copyright
8 //       notice, this list of conditions and the following disclaimer.
9 //
10 //     * Redistributions in binary form must reproduce the above copyright
11 //       notice, this list of conditions and the following disclaimer in the
12 //       documentation and/or other materials provided with the distribution.
13 //
14 //     * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
15 //       its contributors may be used to endorse or promote products derived
16 //       from this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
22 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 // POSSIBILITY OF SUCH DAMAGE.
29 //
30 // Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de)
31 
32 #define _USE_MATH_DEFINES
33 
34 #include "mvs/patch_match_cuda.h"
35 
36 #include <algorithm>
37 #include <cfloat>
38 #include <cmath>
39 #include <cstdint>
40 #include <sstream>
41 
42 #include "util/cuda.h"
43 #include "util/cudacc.h"
44 #include "util/logging.h"
45 
46 // The number of threads per Cuda thread. Warning: Do not change this value,
47 // since the templated window sizes rely on this value.
48 #define THREADS_PER_BLOCK 32
49 
50 // We must not include "util/math.h" to avoid any Eigen includes here,
51 // since Visual Studio cannot compile some of the Eigen/Boost expressions.
52 #ifndef DEG2RAD
53 #define DEG2RAD(deg) deg * 0.0174532925199432
54 #endif
55 
56 namespace colmap {
57 namespace mvs {
58 
59 texture<uint8_t, cudaTextureType2D, cudaReadModeNormalizedFloat>
60     ref_image_texture;
61 texture<uint8_t, cudaTextureType2DLayered, cudaReadModeNormalizedFloat>
62     src_images_texture;
63 texture<float, cudaTextureType2DLayered, cudaReadModeElementType>
64     src_depth_maps_texture;
65 texture<float, cudaTextureType2D, cudaReadModeElementType> poses_texture;
66 
67 // Calibration of reference image as {fx, cx, fy, cy}.
68 __constant__ float ref_K[4];
69 // Calibration of reference image as {1/fx, -cx/fx, 1/fy, -cy/fy}.
70 __constant__ float ref_inv_K[4];
71 
Mat33DotVec3(const float mat[9],const float vec[3],float result[3])72 __device__ inline void Mat33DotVec3(const float mat[9], const float vec[3],
73                                     float result[3]) {
74   result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] * vec[2];
75   result[1] = mat[3] * vec[0] + mat[4] * vec[1] + mat[5] * vec[2];
76   result[2] = mat[6] * vec[0] + mat[7] * vec[1] + mat[8] * vec[2];
77 }
78 
Mat33DotVec3Homogeneous(const float mat[9],const float vec[2],float result[2])79 __device__ inline void Mat33DotVec3Homogeneous(const float mat[9],
80                                                const float vec[2],
81                                                float result[2]) {
82   const float inv_z = 1.0f / (mat[6] * vec[0] + mat[7] * vec[1] + mat[8]);
83   result[0] = inv_z * (mat[0] * vec[0] + mat[1] * vec[1] + mat[2]);
84   result[1] = inv_z * (mat[3] * vec[0] + mat[4] * vec[1] + mat[5]);
85 }
86 
DotProduct3(const float vec1[3],const float vec2[3])87 __device__ inline float DotProduct3(const float vec1[3], const float vec2[3]) {
88   return vec1[0] * vec2[0] + vec1[1] * vec2[1] + vec1[2] * vec2[2];
89 }
90 
GenerateRandomDepth(const float depth_min,const float depth_max,curandState * rand_state)91 __device__ inline float GenerateRandomDepth(const float depth_min,
92                                             const float depth_max,
93                                             curandState* rand_state) {
94   return curand_uniform(rand_state) * (depth_max - depth_min) + depth_min;
95 }
96 
GenerateRandomNormal(const int row,const int col,curandState * rand_state,float normal[3])97 __device__ inline void GenerateRandomNormal(const int row, const int col,
98                                             curandState* rand_state,
99                                             float normal[3]) {
100   // Unbiased sampling of normal, according to George Marsaglia, "Choosing a
101   // Point from the Surface of a Sphere", 1972.
102   float v1 = 0.0f;
103   float v2 = 0.0f;
104   float s = 2.0f;
105   while (s >= 1.0f) {
106     v1 = 2.0f * curand_uniform(rand_state) - 1.0f;
107     v2 = 2.0f * curand_uniform(rand_state) - 1.0f;
108     s = v1 * v1 + v2 * v2;
109   }
110 
111   const float s_norm = sqrt(1.0f - s);
112   normal[0] = 2.0f * v1 * s_norm;
113   normal[1] = 2.0f * v2 * s_norm;
114   normal[2] = 1.0f - 2.0f * s;
115 
116   // Make sure normal is looking away from camera.
117   const float view_ray[3] = {ref_inv_K[0] * col + ref_inv_K[1],
118                              ref_inv_K[2] * row + ref_inv_K[3], 1.0f};
119   if (DotProduct3(normal, view_ray) > 0) {
120     normal[0] = -normal[0];
121     normal[1] = -normal[1];
122     normal[2] = -normal[2];
123   }
124 }
125 
PerturbDepth(const float perturbation,const float depth,curandState * rand_state)126 __device__ inline float PerturbDepth(const float perturbation,
127                                      const float depth,
128                                      curandState* rand_state) {
129   const float depth_min = (1.0f - perturbation) * depth;
130   const float depth_max = (1.0f + perturbation) * depth;
131   return GenerateRandomDepth(depth_min, depth_max, rand_state);
132 }
133 
PerturbNormal(const int row,const int col,const float perturbation,const float normal[3],curandState * rand_state,float perturbed_normal[3],const int num_trials=0)134 __device__ inline void PerturbNormal(const int row, const int col,
135                                      const float perturbation,
136                                      const float normal[3],
137                                      curandState* rand_state,
138                                      float perturbed_normal[3],
139                                      const int num_trials = 0) {
140   // Perturbation rotation angles.
141   const float a1 = (curand_uniform(rand_state) - 0.5f) * perturbation;
142   const float a2 = (curand_uniform(rand_state) - 0.5f) * perturbation;
143   const float a3 = (curand_uniform(rand_state) - 0.5f) * perturbation;
144 
145   const float sin_a1 = sin(a1);
146   const float sin_a2 = sin(a2);
147   const float sin_a3 = sin(a3);
148   const float cos_a1 = cos(a1);
149   const float cos_a2 = cos(a2);
150   const float cos_a3 = cos(a3);
151 
152   // R = Rx * Ry * Rz
153   float R[9];
154   R[0] = cos_a2 * cos_a3;
155   R[1] = -cos_a2 * sin_a3;
156   R[2] = sin_a2;
157   R[3] = cos_a1 * sin_a3 + cos_a3 * sin_a1 * sin_a2;
158   R[4] = cos_a1 * cos_a3 - sin_a1 * sin_a2 * sin_a3;
159   R[5] = -cos_a2 * sin_a1;
160   R[6] = sin_a1 * sin_a3 - cos_a1 * cos_a3 * sin_a2;
161   R[7] = cos_a3 * sin_a1 + cos_a1 * sin_a2 * sin_a3;
162   R[8] = cos_a1 * cos_a2;
163 
164   // Perturb the normal vector.
165   Mat33DotVec3(R, normal, perturbed_normal);
166 
167   // Make sure the perturbed normal is still looking in the same direction as
168   // the viewing direction, otherwise try again but with smaller perturbation.
169   const float view_ray[3] = {ref_inv_K[0] * col + ref_inv_K[1],
170                              ref_inv_K[2] * row + ref_inv_K[3], 1.0f};
171   if (DotProduct3(perturbed_normal, view_ray) >= 0.0f) {
172     const int kMaxNumTrials = 3;
173     if (num_trials < kMaxNumTrials) {
174       PerturbNormal(row, col, 0.5f * perturbation, normal, rand_state,
175                     perturbed_normal, num_trials + 1);
176       return;
177     } else {
178       perturbed_normal[0] = normal[0];
179       perturbed_normal[1] = normal[1];
180       perturbed_normal[2] = normal[2];
181       return;
182     }
183   }
184 
185   // Make sure normal has unit norm.
186   const float inv_norm = rsqrt(DotProduct3(perturbed_normal, perturbed_normal));
187   perturbed_normal[0] *= inv_norm;
188   perturbed_normal[1] *= inv_norm;
189   perturbed_normal[2] *= inv_norm;
190 }
191 
ComputePointAtDepth(const float row,const float col,const float depth,float point[3])192 __device__ inline void ComputePointAtDepth(const float row, const float col,
193                                            const float depth, float point[3]) {
194   point[0] = depth * (ref_inv_K[0] * col + ref_inv_K[1]);
195   point[1] = depth * (ref_inv_K[2] * row + ref_inv_K[3]);
196   point[2] = depth;
197 }
198 
199 // Transfer depth on plane from viewing ray at row1 to row2. The returned
200 // depth is the intersection of the viewing ray through row2 with the plane
201 // at row1 defined by the given depth and normal.
PropagateDepth(const float depth1,const float normal1[3],const float row1,const float row2)202 __device__ inline float PropagateDepth(const float depth1,
203                                        const float normal1[3], const float row1,
204                                        const float row2) {
205   // Point along first viewing ray.
206   const float x1 = depth1 * (ref_inv_K[2] * row1 + ref_inv_K[3]);
207   const float y1 = depth1;
208   // Point on plane defined by point along first viewing ray and plane normal1.
209   const float x2 = x1 + normal1[2];
210   const float y2 = y1 - normal1[1];
211 
212   // Origin of second viewing ray.
213   // const float x3 = 0.0f;
214   // const float y3 = 0.0f;
215   // Point on second viewing ray.
216   const float x4 = ref_inv_K[2] * row2 + ref_inv_K[3];
217   // const float y4 = 1.0f;
218 
219   // Intersection of the lines ((x1, y1), (x2, y2)) and ((x3, y3), (x4, y4)).
220   const float denom = x2 - x1 + x4 * (y1 - y2);
221   const float kEps = 1e-5f;
222   if (abs(denom) < kEps) {
223     return depth1;
224   }
225   const float nom = y1 * x2 - x1 * y2;
226   return nom / denom;
227 }
228 
229 // First, compute triangulation angle between reference and source image for 3D
230 // point. Second, compute incident angle between viewing direction of source
231 // image and normal direction of 3D point. Both angles are cosine distances.
ComputeViewingAngles(const float point[3],const float normal[3],const int image_idx,float * cos_triangulation_angle,float * cos_incident_angle)232 __device__ inline void ComputeViewingAngles(const float point[3],
233                                             const float normal[3],
234                                             const int image_idx,
235                                             float* cos_triangulation_angle,
236                                             float* cos_incident_angle) {
237   *cos_triangulation_angle = 0.0f;
238   *cos_incident_angle = 0.0f;
239 
240   // Projection center of source image.
241   float C[3];
242   for (int i = 0; i < 3; ++i) {
243     C[i] = tex2D(poses_texture, i + 16, image_idx);
244   }
245 
246   // Ray from point to camera.
247   const float SX[3] = {C[0] - point[0], C[1] - point[1], C[2] - point[2]};
248 
249   // Length of ray from reference image to point.
250   const float RX_inv_norm = rsqrt(DotProduct3(point, point));
251 
252   // Length of ray from source image to point.
253   const float SX_inv_norm = rsqrt(DotProduct3(SX, SX));
254 
255   *cos_incident_angle = DotProduct3(SX, normal) * SX_inv_norm;
256   *cos_triangulation_angle = DotProduct3(SX, point) * RX_inv_norm * SX_inv_norm;
257 }
258 
ComposeHomography(const int image_idx,const int row,const int col,const float depth,const float normal[3],float H[9])259 __device__ inline void ComposeHomography(const int image_idx, const int row,
260                                          const int col, const float depth,
261                                          const float normal[3], float H[9]) {
262   // Calibration of source image.
263   float K[4];
264   for (int i = 0; i < 4; ++i) {
265     K[i] = tex2D(poses_texture, i, image_idx);
266   }
267 
268   // Relative rotation between reference and source image.
269   float R[9];
270   for (int i = 0; i < 9; ++i) {
271     R[i] = tex2D(poses_texture, i + 4, image_idx);
272   }
273 
274   // Relative translation between reference and source image.
275   float T[3];
276   for (int i = 0; i < 3; ++i) {
277     T[i] = tex2D(poses_texture, i + 13, image_idx);
278   }
279 
280   // Distance to the plane.
281   const float dist =
282       depth * (normal[0] * (ref_inv_K[0] * col + ref_inv_K[1]) +
283                normal[1] * (ref_inv_K[2] * row + ref_inv_K[3]) + normal[2]);
284   const float inv_dist = 1.0f / dist;
285 
286   const float inv_dist_N0 = inv_dist * normal[0];
287   const float inv_dist_N1 = inv_dist * normal[1];
288   const float inv_dist_N2 = inv_dist * normal[2];
289 
290   // Homography as H = K * (R - T * n' / d) * Kref^-1.
291   H[0] = ref_inv_K[0] * (K[0] * (R[0] + inv_dist_N0 * T[0]) +
292                          K[1] * (R[6] + inv_dist_N0 * T[2]));
293   H[1] = ref_inv_K[2] * (K[0] * (R[1] + inv_dist_N1 * T[0]) +
294                          K[1] * (R[7] + inv_dist_N1 * T[2]));
295   H[2] = K[0] * (R[2] + inv_dist_N2 * T[0]) +
296          K[1] * (R[8] + inv_dist_N2 * T[2]) +
297          ref_inv_K[1] * (K[0] * (R[0] + inv_dist_N0 * T[0]) +
298                          K[1] * (R[6] + inv_dist_N0 * T[2])) +
299          ref_inv_K[3] * (K[0] * (R[1] + inv_dist_N1 * T[0]) +
300                          K[1] * (R[7] + inv_dist_N1 * T[2]));
301   H[3] = ref_inv_K[0] * (K[2] * (R[3] + inv_dist_N0 * T[1]) +
302                          K[3] * (R[6] + inv_dist_N0 * T[2]));
303   H[4] = ref_inv_K[2] * (K[2] * (R[4] + inv_dist_N1 * T[1]) +
304                          K[3] * (R[7] + inv_dist_N1 * T[2]));
305   H[5] = K[2] * (R[5] + inv_dist_N2 * T[1]) +
306          K[3] * (R[8] + inv_dist_N2 * T[2]) +
307          ref_inv_K[1] * (K[2] * (R[3] + inv_dist_N0 * T[1]) +
308                          K[3] * (R[6] + inv_dist_N0 * T[2])) +
309          ref_inv_K[3] * (K[2] * (R[4] + inv_dist_N1 * T[1]) +
310                          K[3] * (R[7] + inv_dist_N1 * T[2]));
311   H[6] = ref_inv_K[0] * (R[6] + inv_dist_N0 * T[2]);
312   H[7] = ref_inv_K[2] * (R[7] + inv_dist_N1 * T[2]);
313   H[8] = R[8] + ref_inv_K[1] * (R[6] + inv_dist_N0 * T[2]) +
314          ref_inv_K[3] * (R[7] + inv_dist_N1 * T[2]) + inv_dist_N2 * T[2];
315 }
316 
317 // The return values is 1 - NCC, so the range is [0, 2], the smaller the
318 // value, the better the color consistency.
319 template <int kWindowSize, int kWindowStep>
320 struct PhotoConsistencyCostComputer {
321   const int kWindowRadius = kWindowSize / 2;
322 
PhotoConsistencyCostComputercolmap::mvs::PhotoConsistencyCostComputer323   __device__ PhotoConsistencyCostComputer(const float sigma_spatial,
324                                           const float sigma_color)
325       : bilateral_weight_computer_(sigma_spatial, sigma_color) {}
326 
327   // Maximum photo consistency cost as 1 - min(NCC).
328   const float kMaxCost = 2.0f;
329 
330   // Image data in local window around patch.
331   const float* local_ref_image = nullptr;
332 
333   // Precomputed sum of raw and squared image intensities.
334   float local_ref_sum = 0.0f;
335   float local_ref_squared_sum = 0.0f;
336 
337   // Index of source image.
338   int src_image_idx = -1;
339 
340   // Center position of patch in reference image.
341   int row = -1;
342   int col = -1;
343 
344   // Depth and normal for which to warp patch.
345   float depth = 0.0f;
346   const float* normal = nullptr;
347 
Computecolmap::mvs::PhotoConsistencyCostComputer348   __device__ inline float Compute() const {
349     float tform[9];
350     ComposeHomography(src_image_idx, row, col, depth, normal, tform);
351 
352     float tform_step[9];
353     for (int i = 0; i < 9; ++i) {
354       tform_step[i] = kWindowStep * tform[i];
355     }
356 
357     const int thread_id = threadIdx.x;
358     const int row_start = row - kWindowRadius;
359     const int col_start = col - kWindowRadius;
360 
361     float col_src = tform[0] * col_start + tform[1] * row_start + tform[2];
362     float row_src = tform[3] * col_start + tform[4] * row_start + tform[5];
363     float z = tform[6] * col_start + tform[7] * row_start + tform[8];
364     float base_col_src = col_src;
365     float base_row_src = row_src;
366     float base_z = z;
367 
368     int ref_image_idx = THREADS_PER_BLOCK - kWindowRadius + thread_id;
369     int ref_image_base_idx = ref_image_idx;
370 
371     const float ref_center_color =
372         local_ref_image[ref_image_idx + kWindowRadius * 3 * THREADS_PER_BLOCK +
373                         kWindowRadius];
374     const float ref_color_sum = local_ref_sum;
375     const float ref_color_squared_sum = local_ref_squared_sum;
376     float src_color_sum = 0.0f;
377     float src_color_squared_sum = 0.0f;
378     float src_ref_color_sum = 0.0f;
379     float bilateral_weight_sum = 0.0f;
380 
381     for (int row = -kWindowRadius; row <= kWindowRadius; row += kWindowStep) {
382       for (int col = -kWindowRadius; col <= kWindowRadius; col += kWindowStep) {
383         const float inv_z = 1.0f / z;
384         const float norm_col_src = inv_z * col_src + 0.5f;
385         const float norm_row_src = inv_z * row_src + 0.5f;
386         const float ref_color = local_ref_image[ref_image_idx];
387         const float src_color = tex2DLayered(src_images_texture, norm_col_src,
388                                              norm_row_src, src_image_idx);
389 
390         const float bilateral_weight = bilateral_weight_computer_.Compute(
391             row, col, ref_center_color, ref_color);
392 
393         const float bilateral_weight_src = bilateral_weight * src_color;
394 
395         src_color_sum += bilateral_weight_src;
396         src_color_squared_sum += bilateral_weight_src * src_color;
397         src_ref_color_sum += bilateral_weight_src * ref_color;
398         bilateral_weight_sum += bilateral_weight;
399 
400         ref_image_idx += kWindowStep;
401 
402         // Accumulate warped source coordinates per row to reduce numerical
403         // errors. Note that this is necessary since coordinates usually are in
404         // the order of 1000s as opposed to the color values which are
405         // normalized to the range [0, 1].
406         col_src += tform_step[0];
407         row_src += tform_step[3];
408         z += tform_step[6];
409       }
410 
411       ref_image_base_idx += kWindowStep * 3 * THREADS_PER_BLOCK;
412       ref_image_idx = ref_image_base_idx;
413 
414       base_col_src += tform_step[1];
415       base_row_src += tform_step[4];
416       base_z += tform_step[7];
417 
418       col_src = base_col_src;
419       row_src = base_row_src;
420       z = base_z;
421     }
422 
423     const float inv_bilateral_weight_sum = 1.0f / bilateral_weight_sum;
424     src_color_sum *= inv_bilateral_weight_sum;
425     src_color_squared_sum *= inv_bilateral_weight_sum;
426     src_ref_color_sum *= inv_bilateral_weight_sum;
427 
428     const float ref_color_var =
429         ref_color_squared_sum - ref_color_sum * ref_color_sum;
430     const float src_color_var =
431         src_color_squared_sum - src_color_sum * src_color_sum;
432 
433     // Based on Jensen's Inequality for convex functions, the variance
434     // should always be larger than 0. Do not make this threshold smaller.
435     const float kMinVar = 1e-5f;
436     if (ref_color_var < kMinVar || src_color_var < kMinVar) {
437       return kMaxCost;
438     } else {
439       const float src_ref_color_covar =
440           src_ref_color_sum - ref_color_sum * src_color_sum;
441       const float src_ref_color_var = sqrt(ref_color_var * src_color_var);
442       return max(0.0f,
443                  min(kMaxCost, 1.0f - src_ref_color_covar / src_ref_color_var));
444     }
445   }
446 
447  private:
448   const BilateralWeightComputer bilateral_weight_computer_;
449 };
450 
ComputeGeomConsistencyCost(const float row,const float col,const float depth,const int image_idx,const float max_cost)451 __device__ inline float ComputeGeomConsistencyCost(const float row,
452                                                    const float col,
453                                                    const float depth,
454                                                    const int image_idx,
455                                                    const float max_cost) {
456   // Extract projection matrices for source image.
457   float P[12];
458   for (int i = 0; i < 12; ++i) {
459     P[i] = tex2D(poses_texture, i + 19, image_idx);
460   }
461   float inv_P[12];
462   for (int i = 0; i < 12; ++i) {
463     inv_P[i] = tex2D(poses_texture, i + 31, image_idx);
464   }
465 
466   // Project point in reference image to world.
467   float forward_point[3];
468   ComputePointAtDepth(row, col, depth, forward_point);
469 
470   // Project world point to source image.
471   const float inv_forward_z =
472       1.0f / (P[8] * forward_point[0] + P[9] * forward_point[1] +
473               P[10] * forward_point[2] + P[11]);
474   float src_col =
475       inv_forward_z * (P[0] * forward_point[0] + P[1] * forward_point[1] +
476                        P[2] * forward_point[2] + P[3]);
477   float src_row =
478       inv_forward_z * (P[4] * forward_point[0] + P[5] * forward_point[1] +
479                        P[6] * forward_point[2] + P[7]);
480 
481   // Extract depth in source image.
482   const float src_depth = tex2DLayered(src_depth_maps_texture, src_col + 0.5f,
483                                        src_row + 0.5f, image_idx);
484 
485   // Projection outside of source image.
486   if (src_depth == 0.0f) {
487     return max_cost;
488   }
489 
490   // Project point in source image to world.
491   src_col *= src_depth;
492   src_row *= src_depth;
493   const float backward_point_x =
494       inv_P[0] * src_col + inv_P[1] * src_row + inv_P[2] * src_depth + inv_P[3];
495   const float backward_point_y =
496       inv_P[4] * src_col + inv_P[5] * src_row + inv_P[6] * src_depth + inv_P[7];
497   const float backward_point_z = inv_P[8] * src_col + inv_P[9] * src_row +
498                                  inv_P[10] * src_depth + inv_P[11];
499   const float inv_backward_point_z = 1.0f / backward_point_z;
500 
501   // Project world point back to reference image.
502   const float backward_col =
503       inv_backward_point_z *
504       (ref_K[0] * backward_point_x + ref_K[1] * backward_point_z);
505   const float backward_row =
506       inv_backward_point_z *
507       (ref_K[2] * backward_point_y + ref_K[3] * backward_point_z);
508 
509   // Return truncated reprojection error between original observation and
510   // the forward-backward projected observation.
511   const float diff_col = col - backward_col;
512   const float diff_row = row - backward_row;
513   return min(max_cost, sqrt(diff_col * diff_col + diff_row * diff_row));
514 }
515 
516 // Find index of minimum in given values.
517 template <int kNumCosts>
FindMinCost(const float costs[kNumCosts])518 __device__ inline int FindMinCost(const float costs[kNumCosts]) {
519   float min_cost = costs[0];
520   int min_cost_idx = 0;
521   for (int idx = 1; idx < kNumCosts; ++idx) {
522     if (costs[idx] <= min_cost) {
523       min_cost = costs[idx];
524       min_cost_idx = idx;
525     }
526   }
527   return min_cost_idx;
528 }
529 
530 template <int kWindowSize>
ReadRefImageIntoSharedMemory(float * local_image,const int row,const int col,const int thread_id)531 __device__ inline void ReadRefImageIntoSharedMemory(float* local_image,
532                                                     const int row,
533                                                     const int col,
534                                                     const int thread_id) {
535   // For the first row, read the entire block into shared memory. For all
536   // consecutive rows, it is only necessary to shift the rows in shared memory
537   // up by one element and then read in a new row at the bottom of the shared
538   // memory. Note that this assumes that the calling loop starts with the first
539   // row and then consecutively reads in a new row.
540 
541   if (row == 0) {
542     int r = row - kWindowSize / 2;
543     for (int i = 0; i < kWindowSize; ++i) {
544       int c = col - THREADS_PER_BLOCK;
545 #pragma unroll
546       for (int j = 0; j < 3; ++j) {
547         local_image[thread_id + i * 3 * THREADS_PER_BLOCK +
548                     j * THREADS_PER_BLOCK] = tex2D(ref_image_texture, c, r);
549         c += THREADS_PER_BLOCK;
550       }
551       r += 1;
552     }
553   } else {
554     // Move rows in shared memory up by one row.
555     for (int i = 1; i < kWindowSize; ++i) {
556 #pragma unroll
557       for (int j = 0; j < 3; ++j) {
558         local_image[thread_id + (i - 1) * 3 * THREADS_PER_BLOCK +
559                     j * THREADS_PER_BLOCK] =
560             local_image[thread_id + i * 3 * THREADS_PER_BLOCK +
561                         j * THREADS_PER_BLOCK];
562       }
563     }
564 
565     // Read next row into the last row of shared memory.
566     const int r = row + kWindowSize / 2;
567     int c = col - THREADS_PER_BLOCK;
568     const int i = kWindowSize - 1;
569 #pragma unroll
570     for (int j = 0; j < 3; ++j) {
571       local_image[thread_id + i * 3 * THREADS_PER_BLOCK +
572                   j * THREADS_PER_BLOCK] = tex2D(ref_image_texture, c, r);
573       c += THREADS_PER_BLOCK;
574     }
575   }
576 
577   __syncthreads();
578 }
579 
TransformPDFToCDF(float * probs,const int num_probs)580 __device__ inline void TransformPDFToCDF(float* probs, const int num_probs) {
581   float prob_sum = 0.0f;
582   for (int i = 0; i < num_probs; ++i) {
583     prob_sum += probs[i];
584   }
585   const float inv_prob_sum = 1.0f / prob_sum;
586 
587   float cum_prob = 0.0f;
588   for (int i = 0; i < num_probs; ++i) {
589     const float prob = probs[i] * inv_prob_sum;
590     cum_prob += prob;
591     probs[i] = cum_prob;
592   }
593 }
594 
595 class LikelihoodComputer {
596  public:
LikelihoodComputer(const float ncc_sigma,const float min_triangulation_angle,const float incident_angle_sigma)597   __device__ LikelihoodComputer(const float ncc_sigma,
598                                 const float min_triangulation_angle,
599                                 const float incident_angle_sigma)
600       : cos_min_triangulation_angle_(cos(min_triangulation_angle)),
601         inv_incident_angle_sigma_square_(
602             -0.5f / (incident_angle_sigma * incident_angle_sigma)),
603         inv_ncc_sigma_square_(-0.5f / (ncc_sigma * ncc_sigma)),
604         ncc_norm_factor_(ComputeNCCCostNormFactor(ncc_sigma)) {}
605 
606   // Compute forward message from current cost and forward message of
607   // previous / neighboring pixel.
ComputeForwardMessage(const float cost,const float prev) const608   __device__ float ComputeForwardMessage(const float cost,
609                                          const float prev) const {
610     return ComputeMessage<true>(cost, prev);
611   }
612 
613   // Compute backward message from current cost and backward message of
614   // previous / neighboring pixel.
ComputeBackwardMessage(const float cost,const float prev) const615   __device__ float ComputeBackwardMessage(const float cost,
616                                           const float prev) const {
617     return ComputeMessage<false>(cost, prev);
618   }
619 
620   // Compute the selection probability from the forward and backward message.
ComputeSelProb(const float alpha,const float beta,const float prev,const float prev_weight) const621   __device__ inline float ComputeSelProb(const float alpha, const float beta,
622                                          const float prev,
623                                          const float prev_weight) const {
624     const float zn0 = (1.0f - alpha) * (1.0f - beta);
625     const float zn1 = alpha * beta;
626     const float curr = zn1 / (zn0 + zn1);
627     return prev_weight * prev + (1.0f - prev_weight) * curr;
628   }
629 
630   // Compute NCC probability. Note that cost = 1 - NCC.
ComputeNCCProb(const float cost) const631   __device__ inline float ComputeNCCProb(const float cost) const {
632     return exp(cost * cost * inv_ncc_sigma_square_) * ncc_norm_factor_;
633   }
634 
635   // Compute the triangulation angle probability.
ComputeTriProb(const float cos_triangulation_angle) const636   __device__ inline float ComputeTriProb(
637       const float cos_triangulation_angle) const {
638     const float abs_cos_triangulation_angle = abs(cos_triangulation_angle);
639     if (abs_cos_triangulation_angle > cos_min_triangulation_angle_) {
640       const float scaled = 1.0f - (1.0f - abs_cos_triangulation_angle) /
641                                       (1.0f - cos_min_triangulation_angle_);
642       const float likelihood = 1.0f - scaled * scaled;
643       return min(1.0f, max(0.0f, likelihood));
644     } else {
645       return 1.0f;
646     }
647   }
648 
649   // Compute the incident angle probability.
ComputeIncProb(const float cos_incident_angle) const650   __device__ inline float ComputeIncProb(const float cos_incident_angle) const {
651     const float x = 1.0f - max(0.0f, cos_incident_angle);
652     return exp(x * x * inv_incident_angle_sigma_square_);
653   }
654 
655   // Compute the warping/resolution prior probability.
656   template <int kWindowSize>
ComputeResolutionProb(const float H[9],const float row,const float col) const657   __device__ inline float ComputeResolutionProb(const float H[9],
658                                                 const float row,
659                                                 const float col) const {
660     const int kWindowRadius = kWindowSize / 2;
661 
662     // Warp corners of patch in reference image to source image.
663     float src1[2];
664     const float ref1[2] = {col - kWindowRadius, row - kWindowRadius};
665     Mat33DotVec3Homogeneous(H, ref1, src1);
666     float src2[2];
667     const float ref2[2] = {col - kWindowRadius, row + kWindowRadius};
668     Mat33DotVec3Homogeneous(H, ref2, src2);
669     float src3[2];
670     const float ref3[2] = {col + kWindowRadius, row + kWindowRadius};
671     Mat33DotVec3Homogeneous(H, ref3, src3);
672     float src4[2];
673     const float ref4[2] = {col + kWindowRadius, row - kWindowRadius};
674     Mat33DotVec3Homogeneous(H, ref4, src4);
675 
676     // Compute area of patches in reference and source image.
677     const float ref_area = kWindowSize * kWindowSize;
678     const float src_area =
679         abs(0.5f * (src1[0] * src2[1] - src2[0] * src1[1] - src1[0] * src4[1] +
680                     src2[0] * src3[1] - src3[0] * src2[1] + src4[0] * src1[1] +
681                     src3[0] * src4[1] - src4[0] * src3[1]));
682 
683     if (ref_area > src_area) {
684       return src_area / ref_area;
685     } else {
686       return ref_area / src_area;
687     }
688   }
689 
690  private:
691   // The normalization for the likelihood function, i.e. the normalization for
692   // the prior on the matching cost.
ComputeNCCCostNormFactor(const float ncc_sigma)693   __device__ static inline float ComputeNCCCostNormFactor(
694       const float ncc_sigma) {
695     // A = sqrt(2pi)*sigma/2*erf(sqrt(2)/sigma)
696     // erf(x) = 2/sqrt(pi) * integral from 0 to x of exp(-t^2) dt
697     return 2.0f / (sqrt(2.0f * M_PI) * ncc_sigma *
698                    erff(2.0f / (ncc_sigma * 1.414213562f)));
699   }
700 
701   // Compute the forward or backward message.
702   template <bool kForward>
ComputeMessage(const float cost,const float prev) const703   __device__ inline float ComputeMessage(const float cost,
704                                          const float prev) const {
705     const float kUniformProb = 0.5f;
706     const float kNoChangeProb = 0.99999f;
707     const float kChangeProb = 1.0f - kNoChangeProb;
708     const float emission = ComputeNCCProb(cost);
709 
710     float zn0;  // Message for selection probability = 0.
711     float zn1;  // Message for selection probability = 1.
712     if (kForward) {
713       zn0 = (prev * kChangeProb + (1.0f - prev) * kNoChangeProb) * kUniformProb;
714       zn1 = (prev * kNoChangeProb + (1.0f - prev) * kChangeProb) * emission;
715     } else {
716       zn0 = prev * emission * kChangeProb +
717             (1.0f - prev) * kUniformProb * kNoChangeProb;
718       zn1 = prev * emission * kNoChangeProb +
719             (1.0f - prev) * kUniformProb * kChangeProb;
720     }
721 
722     return zn1 / (zn0 + zn1);
723   }
724 
725   float cos_min_triangulation_angle_;
726   float inv_incident_angle_sigma_square_;
727   float inv_ncc_sigma_square_;
728   float ncc_norm_factor_;
729 };
730 
731 // Rotate normals by 90deg around z-axis in counter-clockwise direction.
InitNormalMap(GpuMat<float> normal_map,GpuMat<curandState> rand_state_map)732 __global__ void InitNormalMap(GpuMat<float> normal_map,
733                               GpuMat<curandState> rand_state_map) {
734   const int row = blockDim.y * blockIdx.y + threadIdx.y;
735   const int col = blockDim.x * blockIdx.x + threadIdx.x;
736   if (col < normal_map.GetWidth() && row < normal_map.GetHeight()) {
737     curandState rand_state = rand_state_map.Get(row, col);
738     float normal[3];
739     GenerateRandomNormal(row, col, &rand_state, normal);
740     normal_map.SetSlice(row, col, normal);
741     rand_state_map.Set(row, col, rand_state);
742   }
743 }
744 
745 // Rotate normals by 90deg around z-axis in counter-clockwise direction.
RotateNormalMap(GpuMat<float> normal_map)746 __global__ void RotateNormalMap(GpuMat<float> normal_map) {
747   const int row = blockDim.y * blockIdx.y + threadIdx.y;
748   const int col = blockDim.x * blockIdx.x + threadIdx.x;
749   if (col < normal_map.GetWidth() && row < normal_map.GetHeight()) {
750     float normal[3];
751     normal_map.GetSlice(row, col, normal);
752     float rotated_normal[3];
753     rotated_normal[0] = normal[1];
754     rotated_normal[1] = -normal[0];
755     rotated_normal[2] = normal[2];
756     normal_map.SetSlice(row, col, rotated_normal);
757   }
758 }
759 
760 template <int kWindowSize, int kWindowStep>
ComputeInitialCost(GpuMat<float> cost_map,const GpuMat<float> depth_map,const GpuMat<float> normal_map,const GpuMat<float> ref_sum_image,const GpuMat<float> ref_squared_sum_image,const float sigma_spatial,const float sigma_color)761 __global__ void ComputeInitialCost(GpuMat<float> cost_map,
762                                    const GpuMat<float> depth_map,
763                                    const GpuMat<float> normal_map,
764                                    const GpuMat<float> ref_sum_image,
765                                    const GpuMat<float> ref_squared_sum_image,
766                                    const float sigma_spatial,
767                                    const float sigma_color) {
768   const int thread_id = threadIdx.x;
769   const int col = blockDim.x * blockIdx.x + threadIdx.x;
770 
771   __shared__ float local_ref_image[THREADS_PER_BLOCK * 3 * kWindowSize];
772 
773   PhotoConsistencyCostComputer<kWindowSize, kWindowStep> pcc_computer(
774       sigma_spatial, sigma_color);
775   pcc_computer.local_ref_image = local_ref_image;
776   pcc_computer.row = 0;
777   pcc_computer.col = col;
778 
779   float normal[3];
780   pcc_computer.normal = normal;
781 
782   for (int row = 0; row < cost_map.GetHeight(); ++row) {
783     // Note that this must be executed even for pixels outside the borders,
784     // since pixels are used in the local neighborhood of the current pixel.
785     ReadRefImageIntoSharedMemory<kWindowSize>(local_ref_image, row, col,
786                                               thread_id);
787 
788     if (col < cost_map.GetWidth()) {
789       pcc_computer.depth = depth_map.Get(row, col);
790       normal_map.GetSlice(row, col, normal);
791 
792       pcc_computer.local_ref_sum = ref_sum_image.Get(row, col);
793       pcc_computer.local_ref_squared_sum = ref_squared_sum_image.Get(row, col);
794 
795       for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
796         pcc_computer.src_image_idx = image_idx;
797         cost_map.Set(row, col, image_idx, pcc_computer.Compute());
798       }
799 
800       pcc_computer.row += 1;
801     }
802   }
803 }
804 
805 struct SweepOptions {
806   float perturbation = 1.0f;
807   float depth_min = 0.0f;
808   float depth_max = 1.0f;
809   int num_samples = 15;
810   float sigma_spatial = 3.0f;
811   float sigma_color = 0.3f;
812   float ncc_sigma = 0.6f;
813   float min_triangulation_angle = 0.5f;
814   float incident_angle_sigma = 0.9f;
815   float prev_sel_prob_weight = 0.0f;
816   float geom_consistency_regularizer = 0.1f;
817   float geom_consistency_max_cost = 5.0f;
818   float filter_min_ncc = 0.1f;
819   float filter_min_triangulation_angle = 3.0f;
820   int filter_min_num_consistent = 2;
821   float filter_geom_consistency_max_cost = 1.0f;
822 };
823 
824 template <int kWindowSize, int kWindowStep, bool kGeomConsistencyTerm = false,
825           bool kFilterPhotoConsistency = false,
826           bool kFilterGeomConsistency = false>
SweepFromTopToBottom(GpuMat<float> global_workspace,GpuMat<curandState> rand_state_map,GpuMat<float> cost_map,GpuMat<float> depth_map,GpuMat<float> normal_map,GpuMat<uint8_t> consistency_mask,GpuMat<float> sel_prob_map,const GpuMat<float> prev_sel_prob_map,const GpuMat<float> ref_sum_image,const GpuMat<float> ref_squared_sum_image,const SweepOptions options)827 __global__ void SweepFromTopToBottom(
828     GpuMat<float> global_workspace, GpuMat<curandState> rand_state_map,
829     GpuMat<float> cost_map, GpuMat<float> depth_map, GpuMat<float> normal_map,
830     GpuMat<uint8_t> consistency_mask, GpuMat<float> sel_prob_map,
831     const GpuMat<float> prev_sel_prob_map, const GpuMat<float> ref_sum_image,
832     const GpuMat<float> ref_squared_sum_image, const SweepOptions options) {
833   const int thread_id = threadIdx.x;
834   const int col = blockDim.x * blockIdx.x + threadIdx.x;
835 
836   // Probability for boundary pixels.
837   const float kUniformProb = 0.5f;
838 
839   LikelihoodComputer likelihood_computer(options.ncc_sigma,
840                                          options.min_triangulation_angle,
841                                          options.incident_angle_sigma);
842 
843   float* forward_message =
844       &global_workspace.GetPtr()[col * global_workspace.GetHeight()];
845   float* sampling_probs =
846       &global_workspace.GetPtr()[global_workspace.GetWidth() *
847                                      global_workspace.GetHeight() +
848                                  col * global_workspace.GetHeight()];
849 
850   //////////////////////////////////////////////////////////////////////////////
851   // Compute backward message for all rows. Note that the backward messages are
852   // temporarily stored in the sel_prob_map and replaced row by row as the
853   // updated forward messages are computed further below.
854   //////////////////////////////////////////////////////////////////////////////
855 
856   if (col < cost_map.GetWidth()) {
857     for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
858       // Compute backward message.
859       float beta = kUniformProb;
860       for (int row = cost_map.GetHeight() - 1; row >= 0; --row) {
861         const float cost = cost_map.Get(row, col, image_idx);
862         beta = likelihood_computer.ComputeBackwardMessage(cost, beta);
863         sel_prob_map.Set(row, col, image_idx, beta);
864       }
865 
866       // Initialize forward message.
867       forward_message[image_idx] = kUniformProb;
868     }
869   }
870 
871   //////////////////////////////////////////////////////////////////////////////
872   // Estimate parameters for remaining rows and compute selection probabilities.
873   //////////////////////////////////////////////////////////////////////////////
874 
875   // Shared memory holding local patch around current position for one warp.
876   // Contains 3 vertical stripes of height kWindowSize, that are reused within
877   // one warp for NCC computation. Note that this limits the maximum window
878   // size to 2 * THREADS_PER_BLOCK + 1.
879   __shared__ float local_ref_image[THREADS_PER_BLOCK * 3 * kWindowSize];
880 
881   PhotoConsistencyCostComputer<kWindowSize, kWindowStep> pcc_computer(
882       options.sigma_spatial, options.sigma_color);
883   pcc_computer.local_ref_image = local_ref_image;
884   pcc_computer.col = col;
885 
886   struct ParamState {
887     float depth = 0.0f;
888     float normal[3];
889   };
890 
891   // Parameters of previous pixel in column.
892   ParamState prev_param_state;
893   // Parameters of current pixel in column.
894   ParamState curr_param_state;
895   // Randomly sampled parameters.
896   ParamState rand_param_state;
897   // Cuda PRNG state for random sampling.
898   curandState rand_state;
899 
900   if (col < cost_map.GetWidth()) {
901     // Read random state for current column.
902     rand_state = rand_state_map.Get(0, col);
903     // Parameters for first row in column.
904     prev_param_state.depth = depth_map.Get(0, col);
905     normal_map.GetSlice(0, col, prev_param_state.normal);
906   }
907 
908   for (int row = 0; row < cost_map.GetHeight(); ++row) {
909     // Note that this must be executed even for pixels outside the borders,
910     // since pixels are used in the local neighborhood of the current pixel.
911     ReadRefImageIntoSharedMemory<kWindowSize>(local_ref_image, row, col,
912                                               thread_id);
913 
914     if (col >= cost_map.GetWidth()) {
915       continue;
916     }
917 
918     pcc_computer.row = row;
919     pcc_computer.local_ref_sum = ref_sum_image.Get(row, col);
920     pcc_computer.local_ref_squared_sum = ref_squared_sum_image.Get(row, col);
921 
922     // Propagate the depth at which the current ray intersects with the plane
923     // of the normal of the previous ray. This helps to better estimate
924     // the depth of very oblique structures, i.e. pixels whose normal direction
925     // is significantly different from their viewing direction.
926     prev_param_state.depth = PropagateDepth(
927         prev_param_state.depth, prev_param_state.normal, row - 1, row);
928 
929     // Read parameters for current pixel from previous sweep.
930     curr_param_state.depth = depth_map.Get(row, col);
931     normal_map.GetSlice(row, col, curr_param_state.normal);
932 
933     // Generate random parameters.
934     rand_param_state.depth =
935         PerturbDepth(options.perturbation, curr_param_state.depth, &rand_state);
936     PerturbNormal(row, col, options.perturbation * M_PI,
937                   curr_param_state.normal, &rand_state,
938                   rand_param_state.normal);
939 
940     // Read in the backward message, compute selection probabilities and
941     // modulate selection probabilities with priors.
942 
943     float point[3];
944     ComputePointAtDepth(row, col, curr_param_state.depth, point);
945 
946     for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
947       const float cost = cost_map.Get(row, col, image_idx);
948       const float alpha = likelihood_computer.ComputeForwardMessage(
949           cost, forward_message[image_idx]);
950       const float beta = sel_prob_map.Get(row, col, image_idx);
951       const float prev_prob = prev_sel_prob_map.Get(row, col, image_idx);
952       const float sel_prob = likelihood_computer.ComputeSelProb(
953           alpha, beta, prev_prob, options.prev_sel_prob_weight);
954 
955       float cos_triangulation_angle;
956       float cos_incident_angle;
957       ComputeViewingAngles(point, curr_param_state.normal, image_idx,
958                            &cos_triangulation_angle, &cos_incident_angle);
959       const float tri_prob =
960           likelihood_computer.ComputeTriProb(cos_triangulation_angle);
961       const float inc_prob =
962           likelihood_computer.ComputeIncProb(cos_incident_angle);
963 
964       float H[9];
965       ComposeHomography(image_idx, row, col, curr_param_state.depth,
966                         curr_param_state.normal, H);
967       const float res_prob =
968           likelihood_computer.ComputeResolutionProb<kWindowSize>(H, row, col);
969 
970       sampling_probs[image_idx] = sel_prob * tri_prob * inc_prob * res_prob;
971     }
972 
973     TransformPDFToCDF(sampling_probs, cost_map.GetDepth());
974 
975     // Compute matching cost using Monte Carlo sampling of source images. Images
976     // with higher selection probability are more likely to be sampled. Hence,
977     // if only very few source images see the reference image pixel, the same
978     // source image is likely to be sampled many times. Instead of taking
979     // the best K probabilities, this sampling scheme has the advantage of
980     // being adaptive to any distribution of selection probabilities.
981 
982     const int kNumCosts = 5;
983     float costs[kNumCosts];
984     const float depths[kNumCosts] = {
985         curr_param_state.depth, prev_param_state.depth, rand_param_state.depth,
986         curr_param_state.depth, rand_param_state.depth};
987     const float* normals[kNumCosts] = {
988         curr_param_state.normal, prev_param_state.normal,
989         rand_param_state.normal, rand_param_state.normal,
990         curr_param_state.normal};
991 
992     for (int i = 0; i < kNumCosts; ++i) {
993       costs[i] = 0.0f;
994     }
995 
996     for (int sample = 0; sample < options.num_samples; ++sample) {
997       const float rand_prob = curand_uniform(&rand_state) - FLT_EPSILON;
998 
999       pcc_computer.src_image_idx = -1;
1000       for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
1001         const float prob = sampling_probs[image_idx];
1002         if (prob > rand_prob) {
1003           pcc_computer.src_image_idx = image_idx;
1004           break;
1005         }
1006       }
1007 
1008       if (pcc_computer.src_image_idx == -1) {
1009         continue;
1010       }
1011 
1012       costs[0] += cost_map.Get(row, col, pcc_computer.src_image_idx);
1013       if (kGeomConsistencyTerm) {
1014         costs[0] += options.geom_consistency_regularizer *
1015                     ComputeGeomConsistencyCost(
1016                         row, col, depths[0], pcc_computer.src_image_idx,
1017                         options.geom_consistency_max_cost);
1018       }
1019 
1020       for (int i = 1; i < kNumCosts; ++i) {
1021         pcc_computer.depth = depths[i];
1022         pcc_computer.normal = normals[i];
1023         costs[i] += pcc_computer.Compute();
1024         if (kGeomConsistencyTerm) {
1025           costs[i] += options.geom_consistency_regularizer *
1026                       ComputeGeomConsistencyCost(
1027                           row, col, depths[i], pcc_computer.src_image_idx,
1028                           options.geom_consistency_max_cost);
1029         }
1030       }
1031     }
1032 
1033     // Find the parameters of the minimum cost.
1034     const int min_cost_idx = FindMinCost<kNumCosts>(costs);
1035     const float best_depth = depths[min_cost_idx];
1036     const float* best_normal = normals[min_cost_idx];
1037 
1038     // Save best new parameters.
1039     depth_map.Set(row, col, best_depth);
1040     normal_map.SetSlice(row, col, best_normal);
1041 
1042     // Use the new cost to recompute the updated forward message and
1043     // the selection probability.
1044     pcc_computer.depth = best_depth;
1045     pcc_computer.normal = best_normal;
1046     for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
1047       // Determine the cost for best depth.
1048       float cost;
1049       if (min_cost_idx == 0) {
1050         cost = cost_map.Get(row, col, image_idx);
1051       } else {
1052         pcc_computer.src_image_idx = image_idx;
1053         cost = pcc_computer.Compute();
1054         cost_map.Set(row, col, image_idx, cost);
1055       }
1056 
1057       const float alpha = likelihood_computer.ComputeForwardMessage(
1058           cost, forward_message[image_idx]);
1059       const float beta = sel_prob_map.Get(row, col, image_idx);
1060       const float prev_prob = prev_sel_prob_map.Get(row, col, image_idx);
1061       const float prob = likelihood_computer.ComputeSelProb(
1062           alpha, beta, prev_prob, options.prev_sel_prob_weight);
1063       forward_message[image_idx] = alpha;
1064       sel_prob_map.Set(row, col, image_idx, prob);
1065     }
1066 
1067     if (kFilterPhotoConsistency || kFilterGeomConsistency) {
1068       int num_consistent = 0;
1069 
1070       float best_point[3];
1071       ComputePointAtDepth(row, col, best_depth, best_point);
1072 
1073       const float min_ncc_prob =
1074           likelihood_computer.ComputeNCCProb(1.0f - options.filter_min_ncc);
1075       const float cos_min_triangulation_angle =
1076           cos(options.filter_min_triangulation_angle);
1077 
1078       for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
1079         float cos_triangulation_angle;
1080         float cos_incident_angle;
1081         ComputeViewingAngles(best_point, best_normal, image_idx,
1082                              &cos_triangulation_angle, &cos_incident_angle);
1083         if (cos_triangulation_angle > cos_min_triangulation_angle ||
1084             cos_incident_angle <= 0.0f) {
1085           continue;
1086         }
1087 
1088         if (!kFilterGeomConsistency) {
1089           if (sel_prob_map.Get(row, col, image_idx) >= min_ncc_prob) {
1090             consistency_mask.Set(row, col, image_idx, 1);
1091             num_consistent += 1;
1092           }
1093         } else if (!kFilterPhotoConsistency) {
1094           if (ComputeGeomConsistencyCost(row, col, best_depth, image_idx,
1095                                          options.geom_consistency_max_cost) <=
1096               options.filter_geom_consistency_max_cost) {
1097             consistency_mask.Set(row, col, image_idx, 1);
1098             num_consistent += 1;
1099           }
1100         } else {
1101           if (sel_prob_map.Get(row, col, image_idx) >= min_ncc_prob &&
1102               ComputeGeomConsistencyCost(row, col, best_depth, image_idx,
1103                                          options.geom_consistency_max_cost) <=
1104                   options.filter_geom_consistency_max_cost) {
1105             consistency_mask.Set(row, col, image_idx, 1);
1106             num_consistent += 1;
1107           }
1108         }
1109       }
1110 
1111       if (num_consistent < options.filter_min_num_consistent) {
1112         const float kFilterValue = 0.0f;
1113         depth_map.Set(row, col, kFilterValue);
1114         normal_map.Set(row, col, 0, kFilterValue);
1115         normal_map.Set(row, col, 1, kFilterValue);
1116         normal_map.Set(row, col, 2, kFilterValue);
1117         for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
1118           consistency_mask.Set(row, col, image_idx, 0);
1119         }
1120       }
1121     }
1122 
1123     // Update previous depth for next row.
1124     prev_param_state.depth = best_depth;
1125     for (int i = 0; i < 3; ++i) {
1126       prev_param_state.normal[i] = best_normal[i];
1127     }
1128   }
1129 
1130   if (col < cost_map.GetWidth()) {
1131     rand_state_map.Set(0, col, rand_state);
1132   }
1133 }
1134 
PatchMatchCuda(const PatchMatchOptions & options,const PatchMatch::Problem & problem)1135 PatchMatchCuda::PatchMatchCuda(const PatchMatchOptions& options,
1136                                const PatchMatch::Problem& problem)
1137     : options_(options),
1138       problem_(problem),
1139       ref_width_(0),
1140       ref_height_(0),
1141       rotation_in_half_pi_(0) {
1142   SetBestCudaDevice(std::stoi(options_.gpu_index));
1143   InitRefImage();
1144   InitSourceImages();
1145   InitTransforms();
1146   InitWorkspaceMemory();
1147 }
1148 
~PatchMatchCuda()1149 PatchMatchCuda::~PatchMatchCuda() {
1150   for (size_t i = 0; i < 4; ++i) {
1151     poses_device_[i].reset();
1152   }
1153 }
1154 
Run()1155 void PatchMatchCuda::Run() {
1156 #define CASE_WINDOW_RADIUS(window_radius, window_step)              \
1157   case window_radius:                                               \
1158     RunWithWindowSizeAndStep<2 * window_radius + 1, window_step>(); \
1159     break;
1160 
1161 #define CASE_WINDOW_STEP(window_step)                                 \
1162   case window_step:                                                   \
1163     switch (options_.window_radius) {                                 \
1164       CASE_WINDOW_RADIUS(1, window_step)                              \
1165       CASE_WINDOW_RADIUS(2, window_step)                              \
1166       CASE_WINDOW_RADIUS(3, window_step)                              \
1167       CASE_WINDOW_RADIUS(4, window_step)                              \
1168       CASE_WINDOW_RADIUS(5, window_step)                              \
1169       CASE_WINDOW_RADIUS(6, window_step)                              \
1170       CASE_WINDOW_RADIUS(7, window_step)                              \
1171       CASE_WINDOW_RADIUS(8, window_step)                              \
1172       CASE_WINDOW_RADIUS(9, window_step)                              \
1173       CASE_WINDOW_RADIUS(10, window_step)                             \
1174       CASE_WINDOW_RADIUS(11, window_step)                             \
1175       CASE_WINDOW_RADIUS(12, window_step)                             \
1176       CASE_WINDOW_RADIUS(13, window_step)                             \
1177       CASE_WINDOW_RADIUS(14, window_step)                             \
1178       CASE_WINDOW_RADIUS(15, window_step)                             \
1179       CASE_WINDOW_RADIUS(16, window_step)                             \
1180       CASE_WINDOW_RADIUS(17, window_step)                             \
1181       CASE_WINDOW_RADIUS(18, window_step)                             \
1182       CASE_WINDOW_RADIUS(19, window_step)                             \
1183       CASE_WINDOW_RADIUS(20, window_step)                             \
1184       default: {                                                      \
1185         std::cerr << "Error: Window size not supported" << std::endl; \
1186         break;                                                        \
1187       }                                                               \
1188     }                                                                 \
1189     break;
1190 
1191   switch (options_.window_step) {
1192     CASE_WINDOW_STEP(1)
1193     CASE_WINDOW_STEP(2)
1194     default: {
1195       std::cerr << "Error: Window step not supported" << std::endl;
1196       break;
1197     }
1198   }
1199 
1200 #undef SWITCH_WINDOW_RADIUS
1201 #undef CALL_RUN_FUNC
1202 }
1203 
GetDepthMap() const1204 DepthMap PatchMatchCuda::GetDepthMap() const {
1205   return DepthMap(depth_map_->CopyToMat(), options_.depth_min,
1206                   options_.depth_max);
1207 }
1208 
GetNormalMap() const1209 NormalMap PatchMatchCuda::GetNormalMap() const {
1210   return NormalMap(normal_map_->CopyToMat());
1211 }
1212 
GetSelProbMap() const1213 Mat<float> PatchMatchCuda::GetSelProbMap() const {
1214   return prev_sel_prob_map_->CopyToMat();
1215 }
1216 
GetConsistentImageIdxs() const1217 std::vector<int> PatchMatchCuda::GetConsistentImageIdxs() const {
1218   const Mat<uint8_t> mask = consistency_mask_->CopyToMat();
1219   std::vector<int> consistent_image_idxs;
1220   std::vector<int> pixel_consistent_image_idxs;
1221   pixel_consistent_image_idxs.reserve(mask.GetDepth());
1222   for (size_t r = 0; r < mask.GetHeight(); ++r) {
1223     for (size_t c = 0; c < mask.GetWidth(); ++c) {
1224       pixel_consistent_image_idxs.clear();
1225       for (size_t d = 0; d < mask.GetDepth(); ++d) {
1226         if (mask.Get(r, c, d)) {
1227           pixel_consistent_image_idxs.push_back(problem_.src_image_idxs[d]);
1228         }
1229       }
1230       if (pixel_consistent_image_idxs.size() > 0) {
1231         consistent_image_idxs.push_back(c);
1232         consistent_image_idxs.push_back(r);
1233         consistent_image_idxs.push_back(pixel_consistent_image_idxs.size());
1234         consistent_image_idxs.insert(consistent_image_idxs.end(),
1235                                      pixel_consistent_image_idxs.begin(),
1236                                      pixel_consistent_image_idxs.end());
1237       }
1238     }
1239   }
1240   return consistent_image_idxs;
1241 }
1242 
1243 template <int kWindowSize, int kWindowStep>
RunWithWindowSizeAndStep()1244 void PatchMatchCuda::RunWithWindowSizeAndStep() {
1245   // Wait for all initializations to finish.
1246   CUDA_SYNC_AND_CHECK();
1247 
1248   CudaTimer total_timer;
1249   CudaTimer init_timer;
1250 
1251   ComputeCudaConfig();
1252   ComputeInitialCost<kWindowSize, kWindowStep>
1253       <<<sweep_grid_size_, sweep_block_size_>>>(
1254           *cost_map_, *depth_map_, *normal_map_, *ref_image_->sum_image,
1255           *ref_image_->squared_sum_image, options_.sigma_spatial,
1256           options_.sigma_color);
1257   CUDA_SYNC_AND_CHECK();
1258 
1259   init_timer.Print("Initialization");
1260 
1261   const float total_num_steps = options_.num_iterations * 4;
1262 
1263   SweepOptions sweep_options;
1264   sweep_options.depth_min = options_.depth_min;
1265   sweep_options.depth_max = options_.depth_max;
1266   sweep_options.sigma_spatial = options_.sigma_spatial;
1267   sweep_options.sigma_color = options_.sigma_color;
1268   sweep_options.num_samples = options_.num_samples;
1269   sweep_options.ncc_sigma = options_.ncc_sigma;
1270   sweep_options.min_triangulation_angle =
1271       DEG2RAD(options_.min_triangulation_angle);
1272   sweep_options.incident_angle_sigma = options_.incident_angle_sigma;
1273   sweep_options.geom_consistency_regularizer =
1274       options_.geom_consistency_regularizer;
1275   sweep_options.geom_consistency_max_cost = options_.geom_consistency_max_cost;
1276   sweep_options.filter_min_ncc = options_.filter_min_ncc;
1277   sweep_options.filter_min_triangulation_angle =
1278       DEG2RAD(options_.filter_min_triangulation_angle);
1279   sweep_options.filter_min_num_consistent = options_.filter_min_num_consistent;
1280   sweep_options.filter_geom_consistency_max_cost =
1281       options_.filter_geom_consistency_max_cost;
1282 
1283   for (int iter = 0; iter < options_.num_iterations; ++iter) {
1284     CudaTimer iter_timer;
1285 
1286     for (int sweep = 0; sweep < 4; ++sweep) {
1287       CudaTimer sweep_timer;
1288 
1289       // Expenentially reduce amount of perturbation during the optimization.
1290       sweep_options.perturbation = 1.0f / std::pow(2.0f, iter + sweep / 4.0f);
1291 
1292       // Linearly increase the influence of previous selection probabilities.
1293       sweep_options.prev_sel_prob_weight =
1294           static_cast<float>(iter * 4 + sweep) / total_num_steps;
1295 
1296       const bool last_sweep = iter == options_.num_iterations - 1 && sweep == 3;
1297 
1298 #define CALL_SWEEP_FUNC                                                  \
1299   SweepFromTopToBottom<kWindowSize, kWindowStep, kGeomConsistencyTerm,   \
1300                        kFilterPhotoConsistency, kFilterGeomConsistency>  \
1301       <<<sweep_grid_size_, sweep_block_size_>>>(                         \
1302           *global_workspace_, *rand_state_map_, *cost_map_, *depth_map_, \
1303           *normal_map_, *consistency_mask_, *sel_prob_map_,              \
1304           *prev_sel_prob_map_, *ref_image_->sum_image,                   \
1305           *ref_image_->squared_sum_image, sweep_options);
1306 
1307       if (last_sweep) {
1308         if (options_.filter) {
1309           consistency_mask_.reset(new GpuMat<uint8_t>(cost_map_->GetWidth(),
1310                                                       cost_map_->GetHeight(),
1311                                                       cost_map_->GetDepth()));
1312           consistency_mask_->FillWithScalar(0);
1313         }
1314         if (options_.geom_consistency) {
1315           const bool kGeomConsistencyTerm = true;
1316           if (options_.filter) {
1317             const bool kFilterPhotoConsistency = true;
1318             const bool kFilterGeomConsistency = true;
1319             CALL_SWEEP_FUNC
1320           } else {
1321             const bool kFilterPhotoConsistency = false;
1322             const bool kFilterGeomConsistency = false;
1323             CALL_SWEEP_FUNC
1324           }
1325         } else {
1326           const bool kGeomConsistencyTerm = false;
1327           if (options_.filter) {
1328             const bool kFilterPhotoConsistency = true;
1329             const bool kFilterGeomConsistency = false;
1330             CALL_SWEEP_FUNC
1331           } else {
1332             const bool kFilterPhotoConsistency = false;
1333             const bool kFilterGeomConsistency = false;
1334             CALL_SWEEP_FUNC
1335           }
1336         }
1337       } else {
1338         const bool kFilterPhotoConsistency = false;
1339         const bool kFilterGeomConsistency = false;
1340         if (options_.geom_consistency) {
1341           const bool kGeomConsistencyTerm = true;
1342           CALL_SWEEP_FUNC
1343         } else {
1344           const bool kGeomConsistencyTerm = false;
1345           CALL_SWEEP_FUNC
1346         }
1347       }
1348 
1349 #undef CALL_SWEEP_FUNC
1350 
1351       CUDA_SYNC_AND_CHECK();
1352 
1353       Rotate();
1354 
1355       // Rotate selected image map.
1356       if (last_sweep && options_.filter) {
1357         std::unique_ptr<GpuMat<uint8_t>> rot_consistency_mask_(
1358             new GpuMat<uint8_t>(cost_map_->GetWidth(), cost_map_->GetHeight(),
1359                                 cost_map_->GetDepth()));
1360         consistency_mask_->Rotate(rot_consistency_mask_.get());
1361         consistency_mask_.swap(rot_consistency_mask_);
1362       }
1363 
1364       sweep_timer.Print(" Sweep " + std::to_string(sweep + 1));
1365     }
1366 
1367     iter_timer.Print("Iteration " + std::to_string(iter + 1));
1368   }
1369 
1370   total_timer.Print("Total");
1371 }
1372 
ComputeCudaConfig()1373 void PatchMatchCuda::ComputeCudaConfig() {
1374   sweep_block_size_.x = THREADS_PER_BLOCK;
1375   sweep_block_size_.y = 1;
1376   sweep_block_size_.z = 1;
1377   sweep_grid_size_.x = (depth_map_->GetWidth() - 1) / THREADS_PER_BLOCK + 1;
1378   sweep_grid_size_.y = 1;
1379   sweep_grid_size_.z = 1;
1380 
1381   elem_wise_block_size_.x = THREADS_PER_BLOCK;
1382   elem_wise_block_size_.y = THREADS_PER_BLOCK;
1383   elem_wise_block_size_.z = 1;
1384   elem_wise_grid_size_.x = (depth_map_->GetWidth() - 1) / THREADS_PER_BLOCK + 1;
1385   elem_wise_grid_size_.y =
1386       (depth_map_->GetHeight() - 1) / THREADS_PER_BLOCK + 1;
1387   elem_wise_grid_size_.z = 1;
1388 }
1389 
InitRefImage()1390 void PatchMatchCuda::InitRefImage() {
1391   const Image& ref_image = problem_.images->at(problem_.ref_image_idx);
1392 
1393   ref_width_ = ref_image.GetWidth();
1394   ref_height_ = ref_image.GetHeight();
1395 
1396   // Upload to device.
1397   ref_image_.reset(new GpuMatRefImage(ref_width_, ref_height_));
1398   const std::vector<uint8_t> ref_image_array =
1399       ref_image.GetBitmap().ConvertToRowMajorArray();
1400   ref_image_->Filter(ref_image_array.data(), options_.window_radius,
1401                      options_.window_step, options_.sigma_spatial,
1402                      options_.sigma_color);
1403 
1404   ref_image_device_.reset(
1405       new CudaArrayWrapper<uint8_t>(ref_width_, ref_height_, 1));
1406   ref_image_device_->CopyFromGpuMat(*ref_image_->image);
1407 
1408   // Create texture.
1409   ref_image_texture.addressMode[0] = cudaAddressModeBorder;
1410   ref_image_texture.addressMode[1] = cudaAddressModeBorder;
1411   ref_image_texture.addressMode[2] = cudaAddressModeBorder;
1412   ref_image_texture.filterMode = cudaFilterModePoint;
1413   ref_image_texture.normalized = false;
1414   CUDA_SAFE_CALL(
1415       cudaBindTextureToArray(ref_image_texture, ref_image_device_->GetPtr()));
1416 }
1417 
InitSourceImages()1418 void PatchMatchCuda::InitSourceImages() {
1419   // Determine maximum image size.
1420   size_t max_width = 0;
1421   size_t max_height = 0;
1422   for (const auto image_idx : problem_.src_image_idxs) {
1423     const Image& image = problem_.images->at(image_idx);
1424     if (image.GetWidth() > max_width) {
1425       max_width = image.GetWidth();
1426     }
1427     if (image.GetHeight() > max_height) {
1428       max_height = image.GetHeight();
1429     }
1430   }
1431 
1432   // Upload source images to device.
1433   {
1434     // Copy source images to contiguous memory block.
1435     const uint8_t kDefaultValue = 0;
1436     std::vector<uint8_t> src_images_host_data(
1437         static_cast<size_t>(max_width * max_height *
1438                             problem_.src_image_idxs.size()),
1439         kDefaultValue);
1440     for (size_t i = 0; i < problem_.src_image_idxs.size(); ++i) {
1441       const Image& image = problem_.images->at(problem_.src_image_idxs[i]);
1442       const Bitmap& bitmap = image.GetBitmap();
1443       uint8_t* dest = src_images_host_data.data() + max_width * max_height * i;
1444       for (size_t r = 0; r < image.GetHeight(); ++r) {
1445         memcpy(dest, bitmap.GetScanline(r), image.GetWidth() * sizeof(uint8_t));
1446         dest += max_width;
1447       }
1448     }
1449 
1450     // Upload to device.
1451     src_images_device_.reset(new CudaArrayWrapper<uint8_t>(
1452         max_width, max_height, problem_.src_image_idxs.size()));
1453     src_images_device_->CopyToDevice(src_images_host_data.data());
1454 
1455     // Create source images texture.
1456     src_images_texture.addressMode[0] = cudaAddressModeBorder;
1457     src_images_texture.addressMode[1] = cudaAddressModeBorder;
1458     src_images_texture.addressMode[2] = cudaAddressModeBorder;
1459     src_images_texture.filterMode = cudaFilterModeLinear;
1460     src_images_texture.normalized = false;
1461     CUDA_SAFE_CALL(cudaBindTextureToArray(src_images_texture,
1462                                           src_images_device_->GetPtr()));
1463   }
1464 
1465   // Upload source depth maps to device.
1466   if (options_.geom_consistency) {
1467     const float kDefaultValue = 0.0f;
1468     std::vector<float> src_depth_maps_host_data(
1469         static_cast<size_t>(max_width * max_height *
1470                             problem_.src_image_idxs.size()),
1471         kDefaultValue);
1472     for (size_t i = 0; i < problem_.src_image_idxs.size(); ++i) {
1473       const DepthMap& depth_map =
1474           problem_.depth_maps->at(problem_.src_image_idxs[i]);
1475       float* dest =
1476           src_depth_maps_host_data.data() + max_width * max_height * i;
1477       for (size_t r = 0; r < depth_map.GetHeight(); ++r) {
1478         memcpy(dest, depth_map.GetPtr() + r * depth_map.GetWidth(),
1479                depth_map.GetWidth() * sizeof(float));
1480         dest += max_width;
1481       }
1482     }
1483 
1484     src_depth_maps_device_.reset(new CudaArrayWrapper<float>(
1485         max_width, max_height, problem_.src_image_idxs.size()));
1486     src_depth_maps_device_->CopyToDevice(src_depth_maps_host_data.data());
1487 
1488     // Create source depth maps texture.
1489     src_depth_maps_texture.addressMode[0] = cudaAddressModeBorder;
1490     src_depth_maps_texture.addressMode[1] = cudaAddressModeBorder;
1491     src_depth_maps_texture.addressMode[2] = cudaAddressModeBorder;
1492     // TODO: Check if linear interpolation improves results or not.
1493     src_depth_maps_texture.filterMode = cudaFilterModePoint;
1494     src_depth_maps_texture.normalized = false;
1495     CUDA_SAFE_CALL(cudaBindTextureToArray(src_depth_maps_texture,
1496                                           src_depth_maps_device_->GetPtr()));
1497   }
1498 }
1499 
InitTransforms()1500 void PatchMatchCuda::InitTransforms() {
1501   const Image& ref_image = problem_.images->at(problem_.ref_image_idx);
1502 
1503   //////////////////////////////////////////////////////////////////////////////
1504   // Generate rotated versions (counter-clockwise) of calibration matrix.
1505   //////////////////////////////////////////////////////////////////////////////
1506 
1507   for (size_t i = 0; i < 4; ++i) {
1508     ref_K_host_[i][0] = ref_image.GetK()[0];
1509     ref_K_host_[i][1] = ref_image.GetK()[2];
1510     ref_K_host_[i][2] = ref_image.GetK()[4];
1511     ref_K_host_[i][3] = ref_image.GetK()[5];
1512   }
1513 
1514   // Rotated by 90 degrees.
1515   std::swap(ref_K_host_[1][0], ref_K_host_[1][2]);
1516   std::swap(ref_K_host_[1][1], ref_K_host_[1][3]);
1517   ref_K_host_[1][3] = ref_width_ - 1 - ref_K_host_[1][3];
1518 
1519   // Rotated by 180 degrees.
1520   ref_K_host_[2][1] = ref_width_ - 1 - ref_K_host_[2][1];
1521   ref_K_host_[2][3] = ref_height_ - 1 - ref_K_host_[2][3];
1522 
1523   // Rotated by 270 degrees.
1524   std::swap(ref_K_host_[3][0], ref_K_host_[3][2]);
1525   std::swap(ref_K_host_[3][1], ref_K_host_[3][3]);
1526   ref_K_host_[3][1] = ref_height_ - 1 - ref_K_host_[3][1];
1527 
1528   // Extract 1/fx, -cx/fx, fy, -cy/fy.
1529   for (size_t i = 0; i < 4; ++i) {
1530     ref_inv_K_host_[i][0] = 1.0f / ref_K_host_[i][0];
1531     ref_inv_K_host_[i][1] = -ref_K_host_[i][1] / ref_K_host_[i][0];
1532     ref_inv_K_host_[i][2] = 1.0f / ref_K_host_[i][2];
1533     ref_inv_K_host_[i][3] = -ref_K_host_[i][3] / ref_K_host_[i][2];
1534   }
1535 
1536   // Bind 0 degrees version to constant global memory.
1537   CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_K, ref_K_host_[0], sizeof(float) * 4, 0,
1538                                     cudaMemcpyHostToDevice));
1539   CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_inv_K, ref_inv_K_host_[0],
1540                                     sizeof(float) * 4, 0,
1541                                     cudaMemcpyHostToDevice));
1542 
1543   //////////////////////////////////////////////////////////////////////////////
1544   // Generate rotated versions of camera poses.
1545   //////////////////////////////////////////////////////////////////////////////
1546 
1547   float rotated_R[9];
1548   memcpy(rotated_R, ref_image.GetR(), 9 * sizeof(float));
1549 
1550   float rotated_T[3];
1551   memcpy(rotated_T, ref_image.GetT(), 3 * sizeof(float));
1552 
1553   // Matrix for 90deg rotation around Z-axis in counter-clockwise direction.
1554   const float R_z90[9] = {0, 1, 0, -1, 0, 0, 0, 0, 1};
1555 
1556   for (size_t i = 0; i < 4; ++i) {
1557     const size_t kNumTformParams = 4 + 9 + 3 + 3 + 12 + 12;
1558     std::vector<float> poses_host_data(kNumTformParams *
1559                                        problem_.src_image_idxs.size());
1560     int offset = 0;
1561     for (const auto image_idx : problem_.src_image_idxs) {
1562       const Image& image = problem_.images->at(image_idx);
1563 
1564       const float K[4] = {image.GetK()[0], image.GetK()[2], image.GetK()[4],
1565                           image.GetK()[5]};
1566       memcpy(poses_host_data.data() + offset, K, 4 * sizeof(float));
1567       offset += 4;
1568 
1569       float rel_R[9];
1570       float rel_T[3];
1571       ComputeRelativePose(rotated_R, rotated_T, image.GetR(), image.GetT(),
1572                           rel_R, rel_T);
1573       memcpy(poses_host_data.data() + offset, rel_R, 9 * sizeof(float));
1574       offset += 9;
1575       memcpy(poses_host_data.data() + offset, rel_T, 3 * sizeof(float));
1576       offset += 3;
1577 
1578       float C[3];
1579       ComputeProjectionCenter(rel_R, rel_T, C);
1580       memcpy(poses_host_data.data() + offset, C, 3 * sizeof(float));
1581       offset += 3;
1582 
1583       float P[12];
1584       ComposeProjectionMatrix(image.GetK(), rel_R, rel_T, P);
1585       memcpy(poses_host_data.data() + offset, P, 12 * sizeof(float));
1586       offset += 12;
1587 
1588       float inv_P[12];
1589       ComposeInverseProjectionMatrix(image.GetK(), rel_R, rel_T, inv_P);
1590       memcpy(poses_host_data.data() + offset, inv_P, 12 * sizeof(float));
1591       offset += 12;
1592     }
1593 
1594     poses_device_[i].reset(new CudaArrayWrapper<float>(
1595         kNumTformParams, problem_.src_image_idxs.size(), 1));
1596     poses_device_[i]->CopyToDevice(poses_host_data.data());
1597 
1598     RotatePose(R_z90, rotated_R, rotated_T);
1599   }
1600 
1601   poses_texture.addressMode[0] = cudaAddressModeBorder;
1602   poses_texture.addressMode[1] = cudaAddressModeBorder;
1603   poses_texture.addressMode[2] = cudaAddressModeBorder;
1604   poses_texture.filterMode = cudaFilterModePoint;
1605   poses_texture.normalized = false;
1606   CUDA_SAFE_CALL(
1607       cudaBindTextureToArray(poses_texture, poses_device_[0]->GetPtr()));
1608 }
1609 
InitWorkspaceMemory()1610 void PatchMatchCuda::InitWorkspaceMemory() {
1611   rand_state_map_.reset(new GpuMatPRNG(ref_width_, ref_height_));
1612 
1613   depth_map_.reset(new GpuMat<float>(ref_width_, ref_height_));
1614   if (options_.geom_consistency) {
1615     const DepthMap& init_depth_map =
1616         problem_.depth_maps->at(problem_.ref_image_idx);
1617     depth_map_->CopyToDevice(init_depth_map.GetPtr(),
1618                              init_depth_map.GetWidth() * sizeof(float));
1619   } else {
1620     depth_map_->FillWithRandomNumbers(options_.depth_min, options_.depth_max,
1621                                       *rand_state_map_);
1622   }
1623 
1624   normal_map_.reset(new GpuMat<float>(ref_width_, ref_height_, 3));
1625 
1626   // Note that it is not necessary to keep the selection probability map in
1627   // memory for all pixels. Theoretically, it is possible to incorporate
1628   // the temporary selection probabilities in the global_workspace_.
1629   // However, it is useful to keep the probabilities for the entire image
1630   // in memory, so that it can be exported.
1631   sel_prob_map_.reset(new GpuMat<float>(ref_width_, ref_height_,
1632                                         problem_.src_image_idxs.size()));
1633   prev_sel_prob_map_.reset(new GpuMat<float>(ref_width_, ref_height_,
1634                                              problem_.src_image_idxs.size()));
1635   prev_sel_prob_map_->FillWithScalar(0.5f);
1636 
1637   cost_map_.reset(new GpuMat<float>(ref_width_, ref_height_,
1638                                     problem_.src_image_idxs.size()));
1639 
1640   const int ref_max_dim = std::max(ref_width_, ref_height_);
1641   global_workspace_.reset(
1642       new GpuMat<float>(ref_max_dim, problem_.src_image_idxs.size(), 2));
1643 
1644   consistency_mask_.reset(new GpuMat<uint8_t>(0, 0, 0));
1645 
1646   ComputeCudaConfig();
1647 
1648   if (options_.geom_consistency) {
1649     const NormalMap& init_normal_map =
1650         problem_.normal_maps->at(problem_.ref_image_idx);
1651     normal_map_->CopyToDevice(init_normal_map.GetPtr(),
1652                               init_normal_map.GetWidth() * sizeof(float));
1653   } else {
1654     InitNormalMap<<<elem_wise_grid_size_, elem_wise_block_size_>>>(
1655         *normal_map_, *rand_state_map_);
1656   }
1657 }
1658 
Rotate()1659 void PatchMatchCuda::Rotate() {
1660   rotation_in_half_pi_ = (rotation_in_half_pi_ + 1) % 4;
1661 
1662   size_t width;
1663   size_t height;
1664   if (rotation_in_half_pi_ % 2 == 0) {
1665     width = ref_width_;
1666     height = ref_height_;
1667   } else {
1668     width = ref_height_;
1669     height = ref_width_;
1670   }
1671 
1672   // Rotate random map.
1673   {
1674     std::unique_ptr<GpuMatPRNG> rotated_rand_state_map(
1675         new GpuMatPRNG(width, height));
1676     rand_state_map_->Rotate(rotated_rand_state_map.get());
1677     rand_state_map_.swap(rotated_rand_state_map);
1678   }
1679 
1680   // Rotate depth map.
1681   {
1682     std::unique_ptr<GpuMat<float>> rotated_depth_map(
1683         new GpuMat<float>(width, height));
1684     depth_map_->Rotate(rotated_depth_map.get());
1685     depth_map_.swap(rotated_depth_map);
1686   }
1687 
1688   // Rotate normal map.
1689   {
1690     RotateNormalMap<<<elem_wise_grid_size_, elem_wise_block_size_>>>(
1691         *normal_map_);
1692     std::unique_ptr<GpuMat<float>> rotated_normal_map(
1693         new GpuMat<float>(width, height, 3));
1694     normal_map_->Rotate(rotated_normal_map.get());
1695     normal_map_.swap(rotated_normal_map);
1696   }
1697 
1698   // Rotate reference image.
1699   {
1700     std::unique_ptr<GpuMatRefImage> rotated_ref_image(
1701         new GpuMatRefImage(width, height));
1702     ref_image_->image->Rotate(rotated_ref_image->image.get());
1703     ref_image_->sum_image->Rotate(rotated_ref_image->sum_image.get());
1704     ref_image_->squared_sum_image->Rotate(
1705         rotated_ref_image->squared_sum_image.get());
1706     ref_image_.swap(rotated_ref_image);
1707   }
1708 
1709   // Bind rotated reference image to texture.
1710   ref_image_device_.reset(new CudaArrayWrapper<uint8_t>(width, height, 1));
1711   ref_image_device_->CopyFromGpuMat(*ref_image_->image);
1712   CUDA_SAFE_CALL(cudaUnbindTexture(ref_image_texture));
1713   CUDA_SAFE_CALL(
1714       cudaBindTextureToArray(ref_image_texture, ref_image_device_->GetPtr()));
1715 
1716   // Rotate selection probability map.
1717   prev_sel_prob_map_.reset(
1718       new GpuMat<float>(width, height, problem_.src_image_idxs.size()));
1719   sel_prob_map_->Rotate(prev_sel_prob_map_.get());
1720   sel_prob_map_.reset(
1721       new GpuMat<float>(width, height, problem_.src_image_idxs.size()));
1722 
1723   // Rotate cost map.
1724   {
1725     std::unique_ptr<GpuMat<float>> rotated_cost_map(
1726         new GpuMat<float>(width, height, problem_.src_image_idxs.size()));
1727     cost_map_->Rotate(rotated_cost_map.get());
1728     cost_map_.swap(rotated_cost_map);
1729   }
1730 
1731   // Rotate transformations.
1732   CUDA_SAFE_CALL(cudaUnbindTexture(poses_texture));
1733   CUDA_SAFE_CALL(cudaBindTextureToArray(
1734       poses_texture, poses_device_[rotation_in_half_pi_]->GetPtr()));
1735 
1736   // Rotate calibration.
1737   CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_K, ref_K_host_[rotation_in_half_pi_],
1738                                     sizeof(float) * 4, 0,
1739                                     cudaMemcpyHostToDevice));
1740   CUDA_SAFE_CALL(
1741       cudaMemcpyToSymbol(ref_inv_K, ref_inv_K_host_[rotation_in_half_pi_],
1742                          sizeof(float) * 4, 0, cudaMemcpyHostToDevice));
1743 
1744   // Recompute Cuda configuration for rotated reference image.
1745   ComputeCudaConfig();
1746 }
1747 
1748 }  // namespace mvs
1749 }  // namespace colmap
1750