1 /*
2 * Software License Agreement (BSD License)
3 *
4 * Point Cloud Library (PCL) - www.pointclouds.org
5 * Copyright (c) 2011, Willow Garage, Inc.
6 *
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * * Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * * Redistributions in binary form must reproduce the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer in the documentation and/or other materials provided
18 * with the distribution.
19 * * Neither the name of Willow Garage, Inc. nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
35 *
36 */
37
38 #include "device.hpp"
39
40 using namespace pcl::device;
41
namespace pcl
{
  namespace device
  {
    /** \brief Resets every voxel of the TSDF volume to the packed value (tsdf = 0, weight = 0).
      * Expected launch: 2D grid covering VOLUME_X x VOLUME_Y; each thread walks one z column.
      * \param[out] volume voxel storage laid out as VOLUME_Y rows per z-slice
      */
    template<typename T>
    __global__ void
    initializeVolume (PtrStep<T> volume)
    {
      const int x = blockIdx.x * blockDim.x + threadIdx.x;
      const int y = blockIdx.y * blockDim.y + threadIdx.y;

      if (x >= VOLUME_X || y >= VOLUME_Y)
        return;

      T* voxel = volume.ptr (y) + x;
      // distance (in elements) between consecutive z-slices of the same (x, y) column;
      // volume.step is in bytes
      const int slice_step = VOLUME_Y * volume.step / sizeof(*voxel);

#pragma unroll
      for (int z = 0; z < VOLUME_Z; ++z, voxel += slice_step)
        pack_tsdf (0.f, 0, *voxel);
    }
  }
}
66
/** \brief Host entry point: clears the whole TSDF volume on the device and waits
  * for completion.
  * \param[out] volume packed (tsdf, weight) voxel storage to reset
  */
void
pcl::device::initVolume (PtrStep<short2> volume)
{
  // one thread per (x, y) column; each thread initializes the whole z extent
  const dim3 block (16, 16);
  const dim3 grid (divUp (VOLUME_X, block.x), divUp (VOLUME_Y, block.y));

  initializeVolume<<<grid, block>>>(volume);
  cudaSafeCall (cudaGetLastError ());
  cudaSafeCall (cudaDeviceSynchronize ());
}
79
80 namespace pcl
81 {
82 namespace device
83 {
    /** \brief Functor that fuses one raw depth map (millimeters) into the TSDF volume.
      * Launched via integrateTsdfKernel with a CTA_SIZE_X x CTA_SIZE_Y block and a
      * grid covering VOLUME_X x VOLUME_Y; each thread integrates a full z column.
      */
    struct Tsdf
    {
      static constexpr int CTA_SIZE_X = 32;      // kernel block width
      static constexpr int CTA_SIZE_Y = 8;       // kernel block height
      static constexpr int MAX_WEIGHT = 1 << 7;  // cap on the running-average weight

      mutable PtrStep<short2> volume;  // packed (tsdf, weight) voxels; mutable so operator() can be const
      float3 cell_size;                // metric size of one voxel

      Intr intr;                       // camera intrinsics (fx, fy, cx, cy)

      Mat33 Rcurr_inv;                 // inverse rotation of the current camera pose
      float3 tcurr;                    // translation of the current camera pose

      PtrStepSz<ushort> depth_raw; //depth in mm

      float tranc_dist_mm;             // TSDF truncation distance in millimeters

      /** \brief Returns the global-frame coordinate of the center of voxel (x, y, z). */
      __device__ __forceinline__ float3
      getVoxelGCoo (int x, int y, int z) const
      {
        float3 coo = make_float3 (x, y, z);
        coo += 0.5f;         //shift to cell center;

        coo.x *= cell_size.x;
        coo.y *= cell_size.y;
        coo.z *= cell_size.z;

        return coo;
      }

      /** \brief Integrates the depth map into this thread's (x, y) voxel column. */
      __device__ __forceinline__ void
      operator () () const
      {
        int x = threadIdx.x + blockIdx.x * CTA_SIZE_X;
        int y = threadIdx.y + blockIdx.y * CTA_SIZE_Y;

        if (x >= VOLUME_X || y >= VOLUME_Y)
          return;

        short2 *pos = volume.ptr (y) + x;
        // elements between consecutive z-slices at fixed (x, y); volume.step is in bytes
        int elem_step = volume.step * VOLUME_Y / sizeof(*pos);

        for (int z = 0; z < VOLUME_Z; ++z, pos += elem_step)
        {
          float3 v_g = getVoxelGCoo (x, y, z);            //3 // p

          //transform to curr cam coo space
          float3 v = Rcurr_inv * (v_g - tcurr);           //4

          int2 coo;           //project to current cam
          coo.x = __float2int_rn (v.x * intr.fx / v.z + intr.cx);
          coo.y = __float2int_rn (v.y * intr.fy / v.z + intr.cy);

          // only voxels in front of the camera that project inside the depth image
          if (v.z > 0 && coo.x >= 0 && coo.y >= 0 && coo.x < depth_raw.cols && coo.y < depth_raw.rows)           //6
          {
            int Dp = depth_raw.ptr (coo.y)[coo.x];

            if (Dp != 0)
            {
              // lambda = |(xl, yl, 1)| converts depth into distance along the pixel ray
              float xl = (coo.x - intr.cx) / intr.fx;
              float yl = (coo.y - intr.cy) / intr.fy;
              float lambda_inv = rsqrtf (xl * xl + yl * yl + 1);

              float sdf = 1000 * norm (tcurr - v_g) * lambda_inv - Dp; //mm

              sdf *= (-1);   // sign flip: positive in front of the surface

              if (sdf >= -tranc_dist_mm)
              {
                float tsdf = fmin (1.f, sdf / tranc_dist_mm);

                int weight_prev;
                float tsdf_prev;

                //read and unpack
                unpack_tsdf (*pos, tsdf_prev, weight_prev);

                const int Wrk = 1;   // weight of the current measurement

                // running weighted average of the TSDF value, weight capped at MAX_WEIGHT
                float tsdf_new = (tsdf_prev * weight_prev + Wrk * tsdf) / (weight_prev + Wrk);
                int weight_new = min (weight_prev + Wrk, MAX_WEIGHT);

                pack_tsdf (tsdf_new, weight_new, *pos);
              }
            }
          }
        }
      }
    };
174
175 __global__ void
integrateTsdfKernel(const Tsdf tsdf)176 integrateTsdfKernel (const Tsdf tsdf) {
177 tsdf ();
178 }
179
    /** \brief Alternative TSDF integration kernel working directly on the raw depth
      * map (millimeters). Precomputes the camera-frame position of the z = 0 voxel
      * and derives each slice incrementally.
      * NOTE(review): its launch in integrateTsdfVolume is commented out, so this
      * kernel appears to be dead code — confirm before relying on or removing it.
      */
    __global__ void
    tsdf2 (PtrStep<short2> volume, const float tranc_dist_mm, const Mat33 Rcurr_inv, float3 tcurr,
           const Intr intr, const PtrStepSz<ushort> depth_raw, const float3 cell_size)
    {
      int x = threadIdx.x + blockIdx.x * blockDim.x;
      int y = threadIdx.y + blockIdx.y * blockDim.y;

      if (x >= VOLUME_X || y >= VOLUME_Y)
        return;

      short2 *pos = volume.ptr (y) + x;
      // elements between consecutive z-slices at fixed (x, y); volume.step is in bytes
      int elem_step = volume.step * VOLUME_Y / sizeof(short2);

      // global coordinate of this column's first voxel center, relative to the camera origin
      float v_g_x = (x + 0.5f) * cell_size.x - tcurr.x;
      float v_g_y = (y + 0.5f) * cell_size.y - tcurr.y;
      float v_g_z = (0 + 0.5f) * cell_size.z - tcurr.z;

      // rotate the z = 0 voxel into the current camera frame
      float v_x = Rcurr_inv.data[0].x * v_g_x + Rcurr_inv.data[0].y * v_g_y + Rcurr_inv.data[0].z * v_g_z;
      float v_y = Rcurr_inv.data[1].x * v_g_x + Rcurr_inv.data[1].y * v_g_y + Rcurr_inv.data[1].z * v_g_z;
      float v_z = Rcurr_inv.data[2].x * v_g_x + Rcurr_inv.data[2].y * v_g_y + Rcurr_inv.data[2].z * v_g_z;

//#pragma unroll
      for (int z = 0; z < VOLUME_Z; ++z)
      {
        // camera-relative voxel position in global orientation (used for the ray length below)
        float3 vr;
        vr.x = v_g_x;
        vr.y = v_g_y;
        vr.z = (v_g_z + z * cell_size.z);

        // camera-frame voxel position; only the z-dependent term changes per slice
        float3 v;
        v.x = v_x + Rcurr_inv.data[0].z * z * cell_size.z;
        v.y = v_y + Rcurr_inv.data[1].z * z * cell_size.z;
        v.z = v_z + Rcurr_inv.data[2].z * z * cell_size.z;

        int2 coo;           //project to current cam
        coo.x = __float2int_rn (v.x * intr.fx / v.z + intr.cx);
        coo.y = __float2int_rn (v.y * intr.fy / v.z + intr.cy);


        // only voxels in front of the camera that project inside the depth image
        if (v.z > 0 && coo.x >= 0 && coo.y >= 0 && coo.x < depth_raw.cols && coo.y < depth_raw.rows)         //6
        {
          int Dp = depth_raw.ptr (coo.y)[coo.x]; //mm

          if (Dp != 0)
          {
            // lambda = |(xl, yl, 1)| converts depth into distance along the pixel ray
            float xl = (coo.x - intr.cx) / intr.fx;
            float yl = (coo.y - intr.cy) / intr.fy;
            float lambda_inv = rsqrtf (xl * xl + yl * yl + 1);

            float sdf = Dp - norm (vr) * lambda_inv * 1000; //mm


            if (sdf >= -tranc_dist_mm)
            {
              float tsdf = fmin (1.f, sdf / tranc_dist_mm);

              int weight_prev;
              float tsdf_prev;

              //read and unpack
              unpack_tsdf (*pos, tsdf_prev, weight_prev);

              const int Wrk = 1;   // weight of the current measurement

              // running weighted average of the TSDF value, weight capped at MAX_WEIGHT
              float tsdf_new = (tsdf_prev * weight_prev + Wrk * tsdf) / (weight_prev + Wrk);
              int weight_new = min (weight_prev + Wrk, Tsdf::MAX_WEIGHT);

              pack_tsdf (tsdf_new, weight_new, *pos);
            }
          }
        }
        pos += elem_step;
      }       /* for(int z = 0; z < VOLUME_Z; ++z) */
    }         /* __global__ */
254 }
255 }
256
257
258 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/** \brief Host entry point: integrates a raw depth map (mm) into the TSDF volume
  * via the Tsdf functor.
  * NOTE(review): the kernel launch below is compiled out by `#if 0`, so this
  * overload currently performs NO integration — only the setup, error check and
  * device synchronization execute. Confirm whether this path is deliberately
  * disabled in favor of the depthScaled overload before calling it.
  */
void
pcl::device::integrateTsdfVolume (const PtrStepSz<ushort>& depth_raw, const Intr& intr, const float3& volume_size,
                                  const Mat33& Rcurr_inv, const float3& tcurr, float tranc_dist,
                                  PtrStep<short2> volume)
{
  Tsdf tsdf;

  tsdf.volume = volume;
  // metric size of one voxel
  tsdf.cell_size.x = volume_size.x / VOLUME_X;
  tsdf.cell_size.y = volume_size.y / VOLUME_Y;
  tsdf.cell_size.z = volume_size.z / VOLUME_Z;

  tsdf.intr = intr;

  tsdf.Rcurr_inv = Rcurr_inv;
  tsdf.tcurr = tcurr;
  tsdf.depth_raw = depth_raw;

  tsdf.tranc_dist_mm = tranc_dist*1000; //mm

  dim3 block (Tsdf::CTA_SIZE_X, Tsdf::CTA_SIZE_Y);
  dim3 grid (divUp (VOLUME_X, block.x), divUp (VOLUME_Y, block.y));

#if 0
  //tsdf2<<<grid, block>>>(volume, tranc_dist, Rcurr_inv, tcurr, intr, depth_raw, tsdf.cell_size);
  integrateTsdfKernel<<<grid, block>>>(tsdf);
#endif
  cudaSafeCall ( cudaGetLastError () );
  cudaSafeCall (cudaDeviceSynchronize ());
}
289
290
291 namespace pcl
292 {
293 namespace device
294 {
295 __global__ void
scaleDepth(const PtrStepSz<ushort> depth,PtrStep<float> scaled,const Intr intr)296 scaleDepth (const PtrStepSz<ushort> depth, PtrStep<float> scaled, const Intr intr)
297 {
298 int x = threadIdx.x + blockIdx.x * blockDim.x;
299 int y = threadIdx.y + blockIdx.y * blockDim.y;
300
301 if (x >= depth.cols || y >= depth.rows)
302 return;
303
304 int Dp = depth.ptr (y)[x];
305
306 float xl = (x - intr.cx) / intr.fx;
307 float yl = (y - intr.cy) / intr.fy;
308 float lambda = sqrtf (xl * xl + yl * yl + 1);
309
310 scaled.ptr (y)[x] = Dp * lambda/1000.f; //meters
311 }
312
    /** \brief Main TSDF integration kernel: fuses a ray-scaled depth map (meters,
      * produced by scaleDepth) into the volume.
      * Expected launch: 2D grid covering VOLUME_X x VOLUME_Y; each thread walks its
      * (x, y) voxel column along z, updating camera-frame quantities incrementally
      * instead of redoing the full rotation per voxel.
      */
    __global__ void
    tsdf23 (const PtrStepSz<float> depthScaled, PtrStep<short2> volume,
            const float tranc_dist, const Mat33 Rcurr_inv, const float3 tcurr, const Intr intr, const float3 cell_size)
    {
      int x = threadIdx.x + blockIdx.x * blockDim.x;
      int y = threadIdx.y + blockIdx.y * blockDim.y;

      if (x >= VOLUME_X || y >= VOLUME_Y)
        return;

      // global coordinate of this column's first voxel center, relative to the camera origin
      float v_g_x = (x + 0.5f) * cell_size.x - tcurr.x;
      float v_g_y = (y + 0.5f) * cell_size.y - tcurr.y;
      float v_g_z = (0 + 0.5f) * cell_size.z - tcurr.z;

      // z-independent part of the squared camera-to-voxel distance
      float v_g_part_norm = v_g_x * v_g_x + v_g_y * v_g_y;

      // camera-frame coordinates pre-multiplied by focal lengths, so projection
      // below reduces to v_x * inv_z + cx
      float v_x = (Rcurr_inv.data[0].x * v_g_x + Rcurr_inv.data[0].y * v_g_y + Rcurr_inv.data[0].z * v_g_z) * intr.fx;
      float v_y = (Rcurr_inv.data[1].x * v_g_x + Rcurr_inv.data[1].y * v_g_y + Rcurr_inv.data[1].z * v_g_z) * intr.fy;
      float v_z = (Rcurr_inv.data[2].x * v_g_x + Rcurr_inv.data[2].y * v_g_y + Rcurr_inv.data[2].z * v_g_z);

      float z_scaled = 0;   // metric z offset of the current slice from z = 0

      // per-slice increments of v_x / v_y as z grows by one voxel
      float Rcurr_inv_0_z_scaled = Rcurr_inv.data[0].z * cell_size.z * intr.fx;
      float Rcurr_inv_1_z_scaled = Rcurr_inv.data[1].z * cell_size.z * intr.fy;

      float tranc_dist_inv = 1.0f / tranc_dist;

      short2* pos = volume.ptr (y) + x;
      // elements between consecutive z-slices at fixed (x, y); volume.step is in bytes
      int elem_step = volume.step * VOLUME_Y / sizeof(short2);

//#pragma unroll
      for (int z = 0; z < VOLUME_Z;
           ++z,
           v_g_z += cell_size.z,
           z_scaled += cell_size.z,
           v_x += Rcurr_inv_0_z_scaled,
           v_y += Rcurr_inv_1_z_scaled,
           pos += elem_step)
      {
        // skip voxels behind the camera (negative camera-space z)
        float inv_z = 1.0f / (v_z + Rcurr_inv.data[2].z * z_scaled);
        if (inv_z < 0)
          continue;

        // project to current cam
        int2 coo =
        {
          __float2int_rn (v_x * inv_z + intr.cx),
          __float2int_rn (v_y * inv_z + intr.cy)
        };

        if (coo.x >= 0 && coo.y >= 0 && coo.x < depthScaled.cols && coo.y < depthScaled.rows)         //6
        {
          float Dp_scaled = depthScaled.ptr (coo.y)[coo.x]; //meters

          // signed distance along the ray: measured distance minus voxel distance
          float sdf = Dp_scaled - sqrtf (v_g_z * v_g_z + v_g_part_norm);

          if (Dp_scaled != 0 && sdf >= -tranc_dist) //meters
          {
            float tsdf = fmin (1.0f, sdf * tranc_dist_inv);

            //read and unpack
            float tsdf_prev;
            int weight_prev;
            unpack_tsdf (*pos, tsdf_prev, weight_prev);

            const int Wrk = 1;   // weight of the current measurement

            // running weighted average of the TSDF value, weight capped at MAX_WEIGHT
            float tsdf_new = (tsdf_prev * weight_prev + Wrk * tsdf) / (weight_prev + Wrk);
            int weight_new = min (weight_prev + Wrk, Tsdf::MAX_WEIGHT);

            pack_tsdf (tsdf_new, weight_new, *pos);
          }
        }
      }       // for(int z = 0; z < VOLUME_Z; ++z)
    }         // __global__
388
    /** \brief Experimental variant of tsdf23 ("normal hack", currently not launched):
      * before fusing a measurement, estimates the local surface normal from central
      * differences of neighboring TSDF values and skips integration when the angle
      * between the normal and the viewing direction is too steep (cosine < 0.5).
      */
    __global__ void
    tsdf23normal_hack (const PtrStepSz<float> depthScaled, PtrStep<short2> volume,
                       const float tranc_dist, const Mat33 Rcurr_inv, const float3 tcurr, const Intr intr, const float3 cell_size)
    {
      int x = threadIdx.x + blockIdx.x * blockDim.x;
      int y = threadIdx.y + blockIdx.y * blockDim.y;

      if (x >= VOLUME_X || y >= VOLUME_Y)
        return;

      // global coordinate of this column's first voxel center, relative to the camera origin
      const float v_g_x = (x + 0.5f) * cell_size.x - tcurr.x;
      const float v_g_y = (y + 0.5f) * cell_size.y - tcurr.y;
      float v_g_z = (0 + 0.5f) * cell_size.z - tcurr.z;

      // z-independent part of the squared camera-to-voxel distance
      float v_g_part_norm = v_g_x * v_g_x + v_g_y * v_g_y;

      // camera-frame coordinates pre-multiplied by focal lengths (see tsdf23)
      float v_x = (Rcurr_inv.data[0].x * v_g_x + Rcurr_inv.data[0].y * v_g_y + Rcurr_inv.data[0].z * v_g_z) * intr.fx;
      float v_y = (Rcurr_inv.data[1].x * v_g_x + Rcurr_inv.data[1].y * v_g_y + Rcurr_inv.data[1].z * v_g_z) * intr.fy;
      float v_z = (Rcurr_inv.data[2].x * v_g_x + Rcurr_inv.data[2].y * v_g_y + Rcurr_inv.data[2].z * v_g_z);

      float z_scaled = 0;   // metric z offset of the current slice from z = 0

      // per-slice increments of v_x / v_y as z grows by one voxel
      float Rcurr_inv_0_z_scaled = Rcurr_inv.data[0].z * cell_size.z * intr.fx;
      float Rcurr_inv_1_z_scaled = Rcurr_inv.data[1].z * cell_size.z * intr.fy;

      float tranc_dist_inv = 1.0f / tranc_dist;

      short2* pos = volume.ptr (y) + x;
      // elements between consecutive z-slices at fixed (x, y); volume.step is in bytes
      int elem_step = volume.step * VOLUME_Y / sizeof(short2);

//#pragma unroll
      for (int z = 0; z < VOLUME_Z;
           ++z,
           v_g_z += cell_size.z,
           z_scaled += cell_size.z,
           v_x += Rcurr_inv_0_z_scaled,
           v_y += Rcurr_inv_1_z_scaled,
           pos += elem_step)
      {
        // skip voxels behind the camera (negative camera-space z)
        float inv_z = 1.0f / (v_z + Rcurr_inv.data[2].z * z_scaled);
        if (inv_z < 0)
          continue;

        // project to current cam
        int2 coo =
        {
          __float2int_rn (v_x * inv_z + intr.cx),
          __float2int_rn (v_y * inv_z + intr.cy)
        };

        if (coo.x >= 0 && coo.y >= 0 && coo.x < depthScaled.cols && coo.y < depthScaled.rows)         //6
        {
          float Dp_scaled = depthScaled.ptr (coo.y)[coo.x]; //meters

          // signed distance along the ray: measured distance minus voxel distance
          float sdf = Dp_scaled - sqrtf (v_g_z * v_g_z + v_g_part_norm);

          if (Dp_scaled != 0 && sdf >= -tranc_dist) //meters
          {
            float tsdf = fmin (1.0f, sdf * tranc_dist_inv);

            bool integrate = true;
            // normal check only for interior voxels with all 6 neighbors available
            if ((x > 0 &&  x < VOLUME_X-2) && (y > 0 && y < VOLUME_Y-2) && (z > 0 && z < VOLUME_Z-2))
            {
              const float qnan = std::numeric_limits<float>::quiet_NaN();
              float3 normal = make_float3(qnan, qnan, qnan);

              // central differences of the TSDF along z, y, x; a component stays NaN
              // unless both neighbors are well-observed (weight > 16)
              float Fn, Fp;
              int Wn = 0, Wp = 0;
              unpack_tsdf (*(pos + elem_step), Fn, Wn);
              unpack_tsdf (*(pos - elem_step), Fp, Wp);

              if (Wn > 16 && Wp > 16)
                normal.z = (Fn - Fp)/cell_size.z;

              unpack_tsdf (*(pos + volume.step/sizeof(short2) ), Fn, Wn);
              unpack_tsdf (*(pos - volume.step/sizeof(short2) ), Fp, Wp);

              if (Wn > 16 && Wp > 16)
                normal.y = (Fn - Fp)/cell_size.y;

              unpack_tsdf (*(pos + 1), Fn, Wn);
              unpack_tsdf (*(pos - 1), Fp, Wp);

              if (Wn > 16 && Wp > 16)
                normal.x = (Fn - Fp)/cell_size.x;

              // NOTE(review): `!= qnan` is always true under IEEE-754 (NaN compares
              // unequal to everything, including NaN), so this guard never filters.
              // NaN components are only caught below because `norm2 >= 1e-10` is
              // false for NaN. Consider isnan() for the intended check.
              if (normal.x != qnan && normal.y != qnan && normal.z != qnan)
              {
                float norm2 = dot(normal, normal);
                if (norm2 >= 1e-10)
                {
                  normal *= rsqrt(norm2);

                  // cosine of the angle between the (camera -> voxel) ray and the normal
                  float nt = v_g_x * normal.x + v_g_y * normal.y + v_g_z * normal.z;
                  float cosine = nt * rsqrt(v_g_x * v_g_x + v_g_y * v_g_y + v_g_z * v_g_z);

                  if (cosine < 0.5)
                    integrate = false;
                }
              }
            }

            if (integrate)
            {
              //read and unpack
              float tsdf_prev;
              int weight_prev;
              unpack_tsdf (*pos, tsdf_prev, weight_prev);

              const int Wrk = 1;   // weight of the current measurement

              // running weighted average of the TSDF value, weight capped at MAX_WEIGHT
              float tsdf_new = (tsdf_prev * weight_prev + Wrk * tsdf) / (weight_prev + Wrk);
              int weight_new = min (weight_prev + Wrk, Tsdf::MAX_WEIGHT);

              pack_tsdf (tsdf_new, weight_new, *pos);
            }
          }
        }
      }       // for(int z = 0; z < VOLUME_Z; ++z)
    }         // __global__
509 }
510 }
511
512 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/** \brief Host entry point: scales the raw depth map along its viewing rays
  * (mm -> meters) with scaleDepth, then fuses it into the TSDF volume with the
  * tsdf23 kernel and waits for completion.
  * \param[in]  depth        raw depth map in millimeters
  * \param[in]  intr         camera intrinsics
  * \param[in]  volume_size  metric size of the whole volume
  * \param[in]  Rcurr_inv    inverse rotation of the current camera pose
  * \param[in]  tcurr        translation of the current camera pose
  * \param[in]  tranc_dist   TSDF truncation distance in meters
  * \param[out] volume       packed (tsdf, weight) voxel storage
  * \param[out] depthScaled  scratch buffer for the ray-scaled depth (resized as needed)
  */
void
pcl::device::integrateTsdfVolume (const PtrStepSz<ushort>& depth, const Intr& intr,
                                  const float3& volume_size, const Mat33& Rcurr_inv, const float3& tcurr,
                                  float tranc_dist,
                                  PtrStep<short2> volume, DeviceArray2D<float>& depthScaled)
{
  depthScaled.create (depth.rows, depth.cols);

  //scales depth along ray and converts mm -> meters.
  const dim3 scale_block (32, 8);
  const dim3 scale_grid (divUp (depth.cols, scale_block.x), divUp (depth.rows, scale_block.y));
  scaleDepth<<<scale_grid, scale_block>>>(depth, depthScaled, intr);
  cudaSafeCall (cudaGetLastError ());

  // metric size of one voxel
  float3 cell_size;
  cell_size.x = volume_size.x / VOLUME_X;
  cell_size.y = volume_size.y / VOLUME_Y;
  cell_size.z = volume_size.z / VOLUME_Z;

  // one thread per (x, y) voxel column
  //dim3 block(Tsdf::CTA_SIZE_X, Tsdf::CTA_SIZE_Y);
  const dim3 block (16, 16);
  const dim3 grid (divUp (VOLUME_X, block.x), divUp (VOLUME_Y, block.y));

  tsdf23<<<grid, block>>>(depthScaled, volume, tranc_dist, Rcurr_inv, tcurr, intr, cell_size);
  //tsdf23normal_hack<<<grid, block>>>(depthScaled, volume, tranc_dist, Rcurr_inv, tcurr, intr, cell_size);

  cudaSafeCall (cudaGetLastError ());
  cudaSafeCall (cudaDeviceSynchronize ());
}
543