1 // This sample demonstrates working on one piece of data using two GPUs.
2 // It splits input into two parts and processes them separately on different GPUs.
3
4 #ifdef _WIN32
5 #define NOMINMAX
6 #include <windows.h>
7 #else
8 #include <pthread.h>
9 #include <unistd.h>
10 #endif
11
#include <iomanip>
#include <iostream>
#include <stdexcept>
14
15 #include "opencv2/core.hpp"
16 #include "opencv2/highgui.hpp"
17 #include "opencv2/imgproc.hpp"
18 #include "opencv2/cudastereo.hpp"
19
20 using namespace std;
21 using namespace cv;
22 using namespace cv::cuda;
23
24 ///////////////////////////////////////////////////////////
25 // Thread
26 // OS-specific wrappers for multi-threading
27
28 #ifdef _WIN32
29 class Thread
30 {
31 struct UserData
32 {
33 void (*func)(void* userData);
34 void* param;
35 };
36
WinThreadFunction(LPVOID lpParam)37 static DWORD WINAPI WinThreadFunction(LPVOID lpParam)
38 {
39 UserData* userData = static_cast<UserData*>(lpParam);
40
41 userData->func(userData->param);
42
43 return 0;
44 }
45
46 UserData userData_;
47 HANDLE thread_;
48 DWORD threadId_;
49
50 public:
Thread(void (* func)(void * userData),void * userData)51 Thread(void (*func)(void* userData), void* userData)
52 {
53 userData_.func = func;
54 userData_.param = userData;
55
56 thread_ = CreateThread(
57 NULL, // default security attributes
58 0, // use default stack size
59 WinThreadFunction, // thread function name
60 &userData_, // argument to thread function
61 0, // use default creation flags
62 &threadId_); // returns the thread identifier
63 }
64
~Thread()65 ~Thread()
66 {
67 CloseHandle(thread_);
68 }
69
wait()70 void wait()
71 {
72 WaitForSingleObject(thread_, INFINITE);
73 }
74 };
75 #else
// Minimal wrapper around a POSIX thread that runs func(userData) once.
//
// The thread is started by the constructor and is joined exactly once, either
// by the first call to wait() or, if wait() was never called, by the
// destructor. The destructor joins rather than detaches because the running
// thread reads userData_, which is stored inside this object.
class Thread
{
    // Callback and its argument, handed to the thread entry point.
    struct UserData
    {
        void (*func)(void* userData);
        void* param;
    };

    // Entry point given to pthread_create; forwards to the user callback.
    static void* PThreadFunction(void* lpParam)
    {
        UserData* userData = static_cast<UserData*>(lpParam);

        userData->func(userData->param);

        return 0;
    }

    pthread_t thread_;
    UserData userData_;
    bool joined_; // true once pthread_join has been called on thread_

    // Not copyable: the thread was given the address of this object's
    // userData_, and a copy would try to join the same thread again.
    Thread(const Thread&);
    Thread& operator=(const Thread&);

public:
    // Starts a thread that calls func(userData).
    // Throws std::runtime_error if pthread_create reports an error.
    Thread(void (*func)(void* userData), void* userData) : joined_(false)
    {
        userData_.func = func;
        userData_.param = userData;

        if (pthread_create(&thread_, NULL, PThreadFunction, &userData_) != 0)
            throw std::runtime_error("Thread: pthread_create failed");
    }

    // Joins the thread unless wait() has already done so.
    ~Thread()
    {
        wait();
    }

    // Blocks until the thread function has returned.
    // Calls after the first one return immediately.
    void wait()
    {
        if (!joined_)
        {
            pthread_join(thread_, NULL);
            joined_ = true;
        }
    }
};
115 #endif
116
117 ///////////////////////////////////////////////////////////
118 // StereoSingleGpu
119 // Run Stereo algorithm on single GPU
120
121 class StereoSingleGpu
122 {
123 public:
124 explicit StereoSingleGpu(int deviceId = 0);
125 ~StereoSingleGpu();
126
127 void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
128
129 private:
130 int deviceId_;
131 GpuMat d_leftFrame;
132 GpuMat d_rightFrame;
133 GpuMat d_disparity;
134 Ptr<cuda::StereoBM> d_alg;
135 };
136
StereoSingleGpu(int deviceId)137 StereoSingleGpu::StereoSingleGpu(int deviceId) : deviceId_(deviceId)
138 {
139 cuda::setDevice(deviceId_);
140 d_alg = cuda::createStereoBM(256);
141 }
142
~StereoSingleGpu()143 StereoSingleGpu::~StereoSingleGpu()
144 {
145 cuda::setDevice(deviceId_);
146 d_leftFrame.release();
147 d_rightFrame.release();
148 d_disparity.release();
149 d_alg.release();
150 }
151
compute(const Mat & leftFrame,const Mat & rightFrame,Mat & disparity)152 void StereoSingleGpu::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
153 {
154 cuda::setDevice(deviceId_);
155 d_leftFrame.upload(leftFrame);
156 d_rightFrame.upload(rightFrame);
157 d_alg->compute(d_leftFrame, d_rightFrame, d_disparity);
158 d_disparity.download(disparity);
159 }
160
161 ///////////////////////////////////////////////////////////
162 // StereoMultiGpuThread
163 // Run Stereo algorithm on two GPUs using different host threads
164
165 class StereoMultiGpuThread
166 {
167 public:
168 StereoMultiGpuThread();
169 ~StereoMultiGpuThread();
170
171 void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
172
173 private:
174 GpuMat d_leftFrames[2];
175 GpuMat d_rightFrames[2];
176 GpuMat d_disparities[2];
177 Ptr<cuda::StereoBM> d_algs[2];
178
179 struct StereoLaunchData
180 {
181 int deviceId;
182 Mat leftFrame;
183 Mat rightFrame;
184 Mat disparity;
185 GpuMat* d_leftFrame;
186 GpuMat* d_rightFrame;
187 GpuMat* d_disparity;
188 Ptr<cuda::StereoBM> d_alg;
189 };
190
191 static void launchGpuStereoAlg(void* userData);
192 };
193
StereoMultiGpuThread()194 StereoMultiGpuThread::StereoMultiGpuThread()
195 {
196 cuda::setDevice(0);
197 d_algs[0] = cuda::createStereoBM(256);
198
199 cuda::setDevice(1);
200 d_algs[1] = cuda::createStereoBM(256);
201 }
202
~StereoMultiGpuThread()203 StereoMultiGpuThread::~StereoMultiGpuThread()
204 {
205 cuda::setDevice(0);
206 d_leftFrames[0].release();
207 d_rightFrames[0].release();
208 d_disparities[0].release();
209 d_algs[0].release();
210
211 cuda::setDevice(1);
212 d_leftFrames[1].release();
213 d_rightFrames[1].release();
214 d_disparities[1].release();
215 d_algs[1].release();
216 }
217
compute(const Mat & leftFrame,const Mat & rightFrame,Mat & disparity)218 void StereoMultiGpuThread::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
219 {
220 disparity.create(leftFrame.size(), CV_8UC1);
221
222 // Split input data onto two parts for each GPUs.
223 // We add small border for each part,
224 // because original algorithm doesn't calculate disparity on image borders.
225 // With such padding we will get output in the middle of final result.
226
227 StereoLaunchData launchDatas[2];
228
229 launchDatas[0].deviceId = 0;
230 launchDatas[0].leftFrame = leftFrame.rowRange(0, leftFrame.rows / 2 + 32);
231 launchDatas[0].rightFrame = rightFrame.rowRange(0, rightFrame.rows / 2 + 32);
232 launchDatas[0].disparity = disparity.rowRange(0, leftFrame.rows / 2);
233 launchDatas[0].d_leftFrame = &d_leftFrames[0];
234 launchDatas[0].d_rightFrame = &d_rightFrames[0];
235 launchDatas[0].d_disparity = &d_disparities[0];
236 launchDatas[0].d_alg = d_algs[0];
237
238 launchDatas[1].deviceId = 1;
239 launchDatas[1].leftFrame = leftFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
240 launchDatas[1].rightFrame = rightFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
241 launchDatas[1].disparity = disparity.rowRange(leftFrame.rows / 2, leftFrame.rows);
242 launchDatas[1].d_leftFrame = &d_leftFrames[1];
243 launchDatas[1].d_rightFrame = &d_rightFrames[1];
244 launchDatas[1].d_disparity = &d_disparities[1];
245 launchDatas[1].d_alg = d_algs[1];
246
247 Thread thread0(launchGpuStereoAlg, &launchDatas[0]);
248 Thread thread1(launchGpuStereoAlg, &launchDatas[1]);
249
250 thread0.wait();
251 thread1.wait();
252 }
253
launchGpuStereoAlg(void * userData)254 void StereoMultiGpuThread::launchGpuStereoAlg(void* userData)
255 {
256 StereoLaunchData* data = static_cast<StereoLaunchData*>(userData);
257
258 cuda::setDevice(data->deviceId);
259 data->d_leftFrame->upload(data->leftFrame);
260 data->d_rightFrame->upload(data->rightFrame);
261 data->d_alg->compute(*data->d_leftFrame, *data->d_rightFrame, *data->d_disparity);
262
263 if (data->deviceId == 0)
264 data->d_disparity->rowRange(0, data->d_disparity->rows - 32).download(data->disparity);
265 else
266 data->d_disparity->rowRange(32, data->d_disparity->rows).download(data->disparity);
267 }
268
269 ///////////////////////////////////////////////////////////
270 // StereoMultiGpuStream
271 // Run Stereo algorithm on two GPUs from single host thread using async API
272
273 class StereoMultiGpuStream
274 {
275 public:
276 StereoMultiGpuStream();
277 ~StereoMultiGpuStream();
278
279 void compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity);
280
281 private:
282 GpuMat d_leftFrames[2];
283 GpuMat d_rightFrames[2];
284 GpuMat d_disparities[2];
285 Ptr<cuda::StereoBM> d_algs[2];
286 Ptr<Stream> streams[2];
287 };
288
StereoMultiGpuStream()289 StereoMultiGpuStream::StereoMultiGpuStream()
290 {
291 cuda::setDevice(0);
292 d_algs[0] = cuda::createStereoBM(256);
293 streams[0] = makePtr<Stream>();
294
295 cuda::setDevice(1);
296 d_algs[1] = cuda::createStereoBM(256);
297 streams[1] = makePtr<Stream>();
298 }
299
~StereoMultiGpuStream()300 StereoMultiGpuStream::~StereoMultiGpuStream()
301 {
302 cuda::setDevice(0);
303 d_leftFrames[0].release();
304 d_rightFrames[0].release();
305 d_disparities[0].release();
306 d_algs[0].release();
307 streams[0].release();
308
309 cuda::setDevice(1);
310 d_leftFrames[1].release();
311 d_rightFrames[1].release();
312 d_disparities[1].release();
313 d_algs[1].release();
314 streams[1].release();
315 }
316
compute(const HostMem & leftFrame,const HostMem & rightFrame,HostMem & disparity)317 void StereoMultiGpuStream::compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity)
318 {
319 disparity.create(leftFrame.size(), CV_8UC1);
320
321 // Split input data onto two parts for each GPUs.
322 // We add small border for each part,
323 // because original algorithm doesn't calculate disparity on image borders.
324 // With such padding we will get output in the middle of final result.
325
326 Mat leftFrameHdr = leftFrame.createMatHeader();
327 Mat rightFrameHdr = rightFrame.createMatHeader();
328 Mat disparityHdr = disparity.createMatHeader();
329 Mat disparityPart0 = disparityHdr.rowRange(0, leftFrame.rows / 2);
330 Mat disparityPart1 = disparityHdr.rowRange(leftFrame.rows / 2, leftFrame.rows);
331
332 cuda::setDevice(0);
333 d_leftFrames[0].upload(leftFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
334 d_rightFrames[0].upload(rightFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
335 d_algs[0]->compute(d_leftFrames[0], d_rightFrames[0], d_disparities[0], *streams[0]);
336 d_disparities[0].rowRange(0, leftFrame.rows / 2).download(disparityPart0, *streams[0]);
337
338 cuda::setDevice(1);
339 d_leftFrames[1].upload(leftFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
340 d_rightFrames[1].upload(rightFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
341 d_algs[1]->compute(d_leftFrames[1], d_rightFrames[1], d_disparities[1], *streams[1]);
342 d_disparities[1].rowRange(32, d_disparities[1].rows).download(disparityPart1, *streams[1]);
343
344 cuda::setDevice(0);
345 streams[0]->waitForCompletion();
346
347 cuda::setDevice(1);
348 streams[1]->waitForCompletion();
349 }
350
351 ///////////////////////////////////////////////////////////
352 // main
353
main(int argc,char ** argv)354 int main(int argc, char** argv)
355 {
356 if (argc != 3)
357 {
358 cerr << "Usage: stereo_multi <left_video> <right_video>" << endl;
359 return -1;
360 }
361
362 const int numDevices = getCudaEnabledDeviceCount();
363 if (numDevices != 2)
364 {
365 cerr << "Two GPUs are required" << endl;
366 return -1;
367 }
368
369 for (int i = 0; i < numDevices; ++i)
370 {
371 DeviceInfo devInfo(i);
372 if (!devInfo.isCompatible())
373 {
374 cerr << "CUDA module wasn't built for GPU #" << i << " ("
375 << devInfo.name() << ", CC " << devInfo.majorVersion()
376 << devInfo.minorVersion() << endl;
377 return -1;
378 }
379
380 printShortCudaDeviceInfo(i);
381 }
382
383 VideoCapture leftVideo(argv[1]);
384 VideoCapture rightVideo(argv[2]);
385
386 if (!leftVideo.isOpened())
387 {
388 cerr << "Can't open " << argv[1] << " video file" << endl;
389 return -1;
390 }
391
392 if (!rightVideo.isOpened())
393 {
394 cerr << "Can't open " << argv[2] << " video file" << endl;
395 return -1;
396 }
397
398 cout << endl;
399 cout << "This sample demonstrates working on one piece of data using two GPUs." << endl;
400 cout << "It splits input into two parts and processes them separately on different GPUs." << endl;
401 cout << endl;
402
403 Mat leftFrame, rightFrame;
404 HostMem leftGrayFrame, rightGrayFrame;
405
406 StereoSingleGpu gpu0Alg(0);
407 StereoSingleGpu gpu1Alg(1);
408 StereoMultiGpuThread multiThreadAlg;
409 StereoMultiGpuStream multiStreamAlg;
410
411 Mat disparityGpu0;
412 Mat disparityGpu1;
413 Mat disparityMultiThread;
414 HostMem disparityMultiStream;
415
416 Mat disparityGpu0Show;
417 Mat disparityGpu1Show;
418 Mat disparityMultiThreadShow;
419 Mat disparityMultiStreamShow;
420
421 TickMeter tm;
422
423 cout << "-------------------------------------------------------------------" << endl;
424 cout << "| Frame | GPU 0 ms | GPU 1 ms | Multi Thread ms | Multi Stream ms |" << endl;
425 cout << "-------------------------------------------------------------------" << endl;
426
427 for (int i = 0;; ++i)
428 {
429 leftVideo >> leftFrame;
430 rightVideo >> rightFrame;
431
432 if (leftFrame.empty() || rightFrame.empty())
433 break;
434
435 if (leftFrame.size() != rightFrame.size())
436 {
437 cerr << "Frames have different sizes" << endl;
438 return -1;
439 }
440
441 leftGrayFrame.create(leftFrame.size(), CV_8UC1);
442 rightGrayFrame.create(leftFrame.size(), CV_8UC1);
443
444 cvtColor(leftFrame, leftGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
445 cvtColor(rightFrame, rightGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
446
447 tm.reset(); tm.start();
448 gpu0Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
449 disparityGpu0);
450 tm.stop();
451
452 const double gpu0Time = tm.getTimeMilli();
453
454 tm.reset(); tm.start();
455 gpu1Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
456 disparityGpu1);
457 tm.stop();
458
459 const double gpu1Time = tm.getTimeMilli();
460
461 tm.reset(); tm.start();
462 multiThreadAlg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
463 disparityMultiThread);
464 tm.stop();
465
466 const double multiThreadTime = tm.getTimeMilli();
467
468 tm.reset(); tm.start();
469 multiStreamAlg.compute(leftGrayFrame, rightGrayFrame, disparityMultiStream);
470 tm.stop();
471
472 const double multiStreamTime = tm.getTimeMilli();
473
474 cout << "| " << setw(5) << i << " | "
475 << setw(8) << setprecision(1) << fixed << gpu0Time << " | "
476 << setw(8) << setprecision(1) << fixed << gpu1Time << " | "
477 << setw(15) << setprecision(1) << fixed << multiThreadTime << " | "
478 << setw(15) << setprecision(1) << fixed << multiStreamTime << " |" << endl;
479
480 resize(disparityGpu0, disparityGpu0Show, Size(1024, 768), 0, 0, INTER_AREA);
481 resize(disparityGpu1, disparityGpu1Show, Size(1024, 768), 0, 0, INTER_AREA);
482 resize(disparityMultiThread, disparityMultiThreadShow, Size(1024, 768), 0, 0, INTER_AREA);
483 resize(disparityMultiStream.createMatHeader(), disparityMultiStreamShow, Size(1024, 768), 0, 0, INTER_AREA);
484
485 imshow("disparityGpu0", disparityGpu0Show);
486 imshow("disparityGpu1", disparityGpu1Show);
487 imshow("disparityMultiThread", disparityMultiThreadShow);
488 imshow("disparityMultiStream", disparityMultiStreamShow);
489
490 const int key = waitKey(30) & 0xff;
491 if (key == 27)
492 break;
493 }
494
495 cout << "-------------------------------------------------------------------" << endl;
496
497 return 0;
498 }
499