1 // This sample demonstrates working on one piece of data using two GPUs.
2 // It splits input into two parts and processes them separately on different GPUs.
3 
4 #ifdef _WIN32
5     #define NOMINMAX
6     #include <windows.h>
7 #else
8     #include <pthread.h>
9     #include <unistd.h>
10 #endif
11 
12 #include <iostream>
13 #include <iomanip>
14 
15 #include "opencv2/core.hpp"
16 #include "opencv2/highgui.hpp"
17 #include "opencv2/imgproc.hpp"
18 #include "opencv2/cudastereo.hpp"
19 
20 using namespace std;
21 using namespace cv;
22 using namespace cv::cuda;
23 
24 ///////////////////////////////////////////////////////////
25 // Thread
26 // OS-specific wrappers for multi-threading
27 
28 #ifdef _WIN32
// Minimal Win32 thread wrapper: runs func(userData) on a new thread.
//
// If the thread cannot be created, func(userData) is executed synchronously
// in the constructor instead, so the work is never silently dropped.
// The destructor waits for the worker before closing the handle, because the
// worker reads the userData_ member of this object.
class Thread
{
    struct UserData
    {
        void (*func)(void* userData);
        void* param;
    };

    // Trampoline matching the LPTHREAD_START_ROUTINE signature.
    static DWORD WINAPI WinThreadFunction(LPVOID lpParam)
    {
        UserData* userData = static_cast<UserData*>(lpParam);

        userData->func(userData->param);

        return 0;
    }

    UserData userData_;
    HANDLE thread_;     // NULL when no worker thread was started
    DWORD threadId_;

    // Not copyable: a copy would close the same handle twice.
    Thread(const Thread&);
    Thread& operator=(const Thread&);

public:
    // func     - routine to execute
    // userData - opaque pointer forwarded to func
    Thread(void (*func)(void* userData), void* userData) : thread_(NULL), threadId_(0)
    {
        userData_.func = func;
        userData_.param = userData;

        thread_ = CreateThread(
            NULL,                   // default security attributes
            0,                      // use default stack size
            WinThreadFunction,      // thread function name
            &userData_,             // argument to thread function
            0,                      // use default creation flags
            &threadId_);            // returns the thread identifier

        // CreateThread returns NULL on failure; do the work here instead.
        if (thread_ == NULL)
            func(userData);
    }

    ~Thread()
    {
        if (thread_ != NULL)
        {
            // The worker dereferences userData_, so it must be finished
            // before this object goes away.
            WaitForSingleObject(thread_, INFINITE);
            CloseHandle(thread_);
        }
    }

    // Blocks until the worker has finished. Safe to call more than once.
    void wait()
    {
        if (thread_ != NULL)
            WaitForSingleObject(thread_, INFINITE);
    }
};
75 #else
// Minimal pthread wrapper: runs func(userData) on a new thread.
//
// If the thread cannot be created, func(userData) is executed synchronously
// in the constructor instead, so the work is never silently dropped.
// The thread is joined exactly once, either by wait() or by the destructor;
// it is never detached, because the worker reads the userData_ member of
// this object and must not outlive it.
class Thread
{
    struct UserData
    {
        void (*func)(void* userData);
        void* param;
    };

    // Trampoline matching the pthread start-routine signature.
    static void* PThreadFunction(void* lpParam)
    {
        UserData* userData = static_cast<UserData*>(lpParam);

        userData->func(userData->param);

        return 0;
    }

    pthread_t thread_;
    UserData userData_;
    bool joinable_;     // true while thread_ refers to a started, not yet joined thread

    // Not copyable: a copy would join the same thread twice.
    Thread(const Thread&);
    Thread& operator=(const Thread&);

public:
    // func     - routine to execute
    // userData - opaque pointer forwarded to func
    Thread(void (*func)(void* userData), void* userData) : joinable_(false)
    {
        userData_.func = func;
        userData_.param = userData;

        if (pthread_create(&thread_, NULL, PThreadFunction, &userData_) == 0)
            joinable_ = true;
        else
            func(userData);     // creation failed: do the work on the calling thread
    }

    ~Thread()
    {
        // Join (rather than detach) so the worker cannot touch userData_
        // after this object has been destroyed.
        wait();
    }

    // Blocks until the worker has finished. Safe to call more than once.
    void wait()
    {
        if (joinable_)
        {
            pthread_join(thread_, NULL);
            joinable_ = false;
        }
    }
};
115 #endif
116 
117 ///////////////////////////////////////////////////////////
118 // StereoSingleGpu
119 // Run Stereo algorithm on single GPU
120 
121 class StereoSingleGpu
122 {
123 public:
124     explicit StereoSingleGpu(int deviceId = 0);
125     ~StereoSingleGpu();
126 
127     void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
128 
129 private:
130     int deviceId_;
131     GpuMat d_leftFrame;
132     GpuMat d_rightFrame;
133     GpuMat d_disparity;
134     Ptr<cuda::StereoBM> d_alg;
135 };
136 
StereoSingleGpu(int deviceId)137 StereoSingleGpu::StereoSingleGpu(int deviceId) : deviceId_(deviceId)
138 {
139     cuda::setDevice(deviceId_);
140     d_alg = cuda::createStereoBM(256);
141 }
142 
~StereoSingleGpu()143 StereoSingleGpu::~StereoSingleGpu()
144 {
145     cuda::setDevice(deviceId_);
146     d_leftFrame.release();
147     d_rightFrame.release();
148     d_disparity.release();
149     d_alg.release();
150 }
151 
compute(const Mat & leftFrame,const Mat & rightFrame,Mat & disparity)152 void StereoSingleGpu::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
153 {
154     cuda::setDevice(deviceId_);
155     d_leftFrame.upload(leftFrame);
156     d_rightFrame.upload(rightFrame);
157     d_alg->compute(d_leftFrame, d_rightFrame, d_disparity);
158     d_disparity.download(disparity);
159 }
160 
161 ///////////////////////////////////////////////////////////
162 // StereoMultiGpuThread
163 // Run Stereo algorithm on two GPUs using different host threads
164 
165 class StereoMultiGpuThread
166 {
167 public:
168     StereoMultiGpuThread();
169     ~StereoMultiGpuThread();
170 
171     void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
172 
173 private:
174     GpuMat d_leftFrames[2];
175     GpuMat d_rightFrames[2];
176     GpuMat d_disparities[2];
177     Ptr<cuda::StereoBM> d_algs[2];
178 
179     struct StereoLaunchData
180     {
181         int deviceId;
182         Mat leftFrame;
183         Mat rightFrame;
184         Mat disparity;
185         GpuMat* d_leftFrame;
186         GpuMat* d_rightFrame;
187         GpuMat* d_disparity;
188         Ptr<cuda::StereoBM> d_alg;
189     };
190 
191     static void launchGpuStereoAlg(void* userData);
192 };
193 
StereoMultiGpuThread()194 StereoMultiGpuThread::StereoMultiGpuThread()
195 {
196     cuda::setDevice(0);
197     d_algs[0] = cuda::createStereoBM(256);
198 
199     cuda::setDevice(1);
200     d_algs[1] = cuda::createStereoBM(256);
201 }
202 
~StereoMultiGpuThread()203 StereoMultiGpuThread::~StereoMultiGpuThread()
204 {
205     cuda::setDevice(0);
206     d_leftFrames[0].release();
207     d_rightFrames[0].release();
208     d_disparities[0].release();
209     d_algs[0].release();
210 
211     cuda::setDevice(1);
212     d_leftFrames[1].release();
213     d_rightFrames[1].release();
214     d_disparities[1].release();
215     d_algs[1].release();
216 }
217 
compute(const Mat & leftFrame,const Mat & rightFrame,Mat & disparity)218 void StereoMultiGpuThread::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
219 {
220     disparity.create(leftFrame.size(), CV_8UC1);
221 
222     // Split input data onto two parts for each GPUs.
223     // We add small border for each part,
224     // because original algorithm doesn't calculate disparity on image borders.
225     // With such padding we will get output in the middle of final result.
226 
227     StereoLaunchData launchDatas[2];
228 
229     launchDatas[0].deviceId = 0;
230     launchDatas[0].leftFrame = leftFrame.rowRange(0, leftFrame.rows / 2 + 32);
231     launchDatas[0].rightFrame = rightFrame.rowRange(0, rightFrame.rows / 2 + 32);
232     launchDatas[0].disparity = disparity.rowRange(0, leftFrame.rows / 2);
233     launchDatas[0].d_leftFrame = &d_leftFrames[0];
234     launchDatas[0].d_rightFrame = &d_rightFrames[0];
235     launchDatas[0].d_disparity = &d_disparities[0];
236     launchDatas[0].d_alg = d_algs[0];
237 
238     launchDatas[1].deviceId = 1;
239     launchDatas[1].leftFrame = leftFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
240     launchDatas[1].rightFrame = rightFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
241     launchDatas[1].disparity = disparity.rowRange(leftFrame.rows / 2, leftFrame.rows);
242     launchDatas[1].d_leftFrame = &d_leftFrames[1];
243     launchDatas[1].d_rightFrame = &d_rightFrames[1];
244     launchDatas[1].d_disparity = &d_disparities[1];
245     launchDatas[1].d_alg = d_algs[1];
246 
247     Thread thread0(launchGpuStereoAlg, &launchDatas[0]);
248     Thread thread1(launchGpuStereoAlg, &launchDatas[1]);
249 
250     thread0.wait();
251     thread1.wait();
252 }
253 
launchGpuStereoAlg(void * userData)254 void StereoMultiGpuThread::launchGpuStereoAlg(void* userData)
255 {
256     StereoLaunchData* data = static_cast<StereoLaunchData*>(userData);
257 
258     cuda::setDevice(data->deviceId);
259     data->d_leftFrame->upload(data->leftFrame);
260     data->d_rightFrame->upload(data->rightFrame);
261     data->d_alg->compute(*data->d_leftFrame, *data->d_rightFrame, *data->d_disparity);
262 
263     if (data->deviceId == 0)
264         data->d_disparity->rowRange(0, data->d_disparity->rows - 32).download(data->disparity);
265     else
266         data->d_disparity->rowRange(32, data->d_disparity->rows).download(data->disparity);
267 }
268 
269 ///////////////////////////////////////////////////////////
270 // StereoMultiGpuStream
271 // Run Stereo algorithm on two GPUs from single host thread using async API
272 
273 class StereoMultiGpuStream
274 {
275 public:
276     StereoMultiGpuStream();
277     ~StereoMultiGpuStream();
278 
279     void compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity);
280 
281 private:
282     GpuMat d_leftFrames[2];
283     GpuMat d_rightFrames[2];
284     GpuMat d_disparities[2];
285     Ptr<cuda::StereoBM> d_algs[2];
286     Ptr<Stream> streams[2];
287 };
288 
StereoMultiGpuStream()289 StereoMultiGpuStream::StereoMultiGpuStream()
290 {
291     cuda::setDevice(0);
292     d_algs[0] = cuda::createStereoBM(256);
293     streams[0] = makePtr<Stream>();
294 
295     cuda::setDevice(1);
296     d_algs[1] = cuda::createStereoBM(256);
297     streams[1] = makePtr<Stream>();
298 }
299 
~StereoMultiGpuStream()300 StereoMultiGpuStream::~StereoMultiGpuStream()
301 {
302     cuda::setDevice(0);
303     d_leftFrames[0].release();
304     d_rightFrames[0].release();
305     d_disparities[0].release();
306     d_algs[0].release();
307     streams[0].release();
308 
309     cuda::setDevice(1);
310     d_leftFrames[1].release();
311     d_rightFrames[1].release();
312     d_disparities[1].release();
313     d_algs[1].release();
314     streams[1].release();
315 }
316 
compute(const HostMem & leftFrame,const HostMem & rightFrame,HostMem & disparity)317 void StereoMultiGpuStream::compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity)
318 {
319     disparity.create(leftFrame.size(), CV_8UC1);
320 
321     // Split input data onto two parts for each GPUs.
322     // We add small border for each part,
323     // because original algorithm doesn't calculate disparity on image borders.
324     // With such padding we will get output in the middle of final result.
325 
326     Mat leftFrameHdr = leftFrame.createMatHeader();
327     Mat rightFrameHdr = rightFrame.createMatHeader();
328     Mat disparityHdr = disparity.createMatHeader();
329     Mat disparityPart0 = disparityHdr.rowRange(0, leftFrame.rows / 2);
330     Mat disparityPart1 = disparityHdr.rowRange(leftFrame.rows / 2, leftFrame.rows);
331 
332     cuda::setDevice(0);
333     d_leftFrames[0].upload(leftFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
334     d_rightFrames[0].upload(rightFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
335     d_algs[0]->compute(d_leftFrames[0], d_rightFrames[0], d_disparities[0], *streams[0]);
336     d_disparities[0].rowRange(0, leftFrame.rows / 2).download(disparityPart0, *streams[0]);
337 
338     cuda::setDevice(1);
339     d_leftFrames[1].upload(leftFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
340     d_rightFrames[1].upload(rightFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
341     d_algs[1]->compute(d_leftFrames[1], d_rightFrames[1], d_disparities[1], *streams[1]);
342     d_disparities[1].rowRange(32, d_disparities[1].rows).download(disparityPart1, *streams[1]);
343 
344     cuda::setDevice(0);
345     streams[0]->waitForCompletion();
346 
347     cuda::setDevice(1);
348     streams[1]->waitForCompletion();
349 }
350 
351 ///////////////////////////////////////////////////////////
352 // main
353 
main(int argc,char ** argv)354 int main(int argc, char** argv)
355 {
356     if (argc != 3)
357     {
358         cerr << "Usage: stereo_multi <left_video> <right_video>" << endl;
359         return -1;
360     }
361 
362     const int numDevices = getCudaEnabledDeviceCount();
363     if (numDevices != 2)
364     {
365         cerr << "Two GPUs are required" << endl;
366         return -1;
367     }
368 
369     for (int i = 0; i < numDevices; ++i)
370     {
371         DeviceInfo devInfo(i);
372         if (!devInfo.isCompatible())
373         {
374             cerr << "CUDA module wasn't built for GPU #" << i << " ("
375                  << devInfo.name() << ", CC " << devInfo.majorVersion()
376                  << devInfo.minorVersion() << endl;
377             return -1;
378         }
379 
380         printShortCudaDeviceInfo(i);
381     }
382 
383     VideoCapture leftVideo(argv[1]);
384     VideoCapture rightVideo(argv[2]);
385 
386     if (!leftVideo.isOpened())
387     {
388          cerr << "Can't open " << argv[1] << " video file" << endl;
389          return -1;
390     }
391 
392     if (!rightVideo.isOpened())
393     {
394          cerr << "Can't open " << argv[2] << " video file" << endl;
395          return -1;
396     }
397 
398     cout << endl;
399     cout << "This sample demonstrates working on one piece of data using two GPUs." << endl;
400     cout << "It splits input into two parts and processes them separately on different GPUs." << endl;
401     cout << endl;
402 
403     Mat leftFrame, rightFrame;
404     HostMem leftGrayFrame, rightGrayFrame;
405 
406     StereoSingleGpu gpu0Alg(0);
407     StereoSingleGpu gpu1Alg(1);
408     StereoMultiGpuThread multiThreadAlg;
409     StereoMultiGpuStream multiStreamAlg;
410 
411     Mat disparityGpu0;
412     Mat disparityGpu1;
413     Mat disparityMultiThread;
414     HostMem disparityMultiStream;
415 
416     Mat disparityGpu0Show;
417     Mat disparityGpu1Show;
418     Mat disparityMultiThreadShow;
419     Mat disparityMultiStreamShow;
420 
421     TickMeter tm;
422 
423     cout << "-------------------------------------------------------------------" << endl;
424     cout << "| Frame | GPU 0 ms | GPU 1 ms | Multi Thread ms | Multi Stream ms |" << endl;
425     cout << "-------------------------------------------------------------------" << endl;
426 
427     for (int i = 0;; ++i)
428     {
429         leftVideo >> leftFrame;
430         rightVideo >> rightFrame;
431 
432         if (leftFrame.empty() || rightFrame.empty())
433             break;
434 
435         if (leftFrame.size() != rightFrame.size())
436         {
437             cerr << "Frames have different sizes" << endl;
438             return -1;
439         }
440 
441         leftGrayFrame.create(leftFrame.size(), CV_8UC1);
442         rightGrayFrame.create(leftFrame.size(), CV_8UC1);
443 
444         cvtColor(leftFrame, leftGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
445         cvtColor(rightFrame, rightGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
446 
447         tm.reset(); tm.start();
448         gpu0Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
449                         disparityGpu0);
450         tm.stop();
451 
452         const double gpu0Time = tm.getTimeMilli();
453 
454         tm.reset(); tm.start();
455         gpu1Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
456                         disparityGpu1);
457         tm.stop();
458 
459         const double gpu1Time = tm.getTimeMilli();
460 
461         tm.reset(); tm.start();
462         multiThreadAlg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
463                                disparityMultiThread);
464         tm.stop();
465 
466         const double multiThreadTime = tm.getTimeMilli();
467 
468         tm.reset(); tm.start();
469         multiStreamAlg.compute(leftGrayFrame, rightGrayFrame, disparityMultiStream);
470         tm.stop();
471 
472         const double multiStreamTime = tm.getTimeMilli();
473 
474         cout << "| " << setw(5) << i << " | "
475              << setw(8) << setprecision(1) << fixed << gpu0Time << " | "
476              << setw(8) << setprecision(1) << fixed << gpu1Time << " | "
477              << setw(15) << setprecision(1) << fixed << multiThreadTime << " | "
478              << setw(15) << setprecision(1) << fixed << multiStreamTime << " |" << endl;
479 
480         resize(disparityGpu0, disparityGpu0Show, Size(1024, 768), 0, 0, INTER_AREA);
481         resize(disparityGpu1, disparityGpu1Show, Size(1024, 768), 0, 0, INTER_AREA);
482         resize(disparityMultiThread, disparityMultiThreadShow, Size(1024, 768), 0, 0, INTER_AREA);
483         resize(disparityMultiStream.createMatHeader(), disparityMultiStreamShow, Size(1024, 768), 0, 0, INTER_AREA);
484 
485         imshow("disparityGpu0", disparityGpu0Show);
486         imshow("disparityGpu1", disparityGpu1Show);
487         imshow("disparityMultiThread", disparityMultiThreadShow);
488         imshow("disparityMultiStream", disparityMultiStreamShow);
489 
490         const int key = waitKey(30) & 0xff;
491         if (key == 27)
492             break;
493     }
494 
495     cout << "-------------------------------------------------------------------" << endl;
496 
497     return 0;
498 }
499