1 /******************************************************************************
2 QtAV: Multimedia framework based on Qt and FFmpeg
3 Copyright (C) 2012-2016 Wang Bin <wbsecg1@gmail.com>
4
5 * This file is part of QtAV (from 2015)
6
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
11
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with this library; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ******************************************************************************/
21
22 #include "SurfaceInteropCUDA.h"
23 #include "QtAV/VideoFrame.h"
24 #include "utils/Logger.h"
25 #include "helper_cuda.h"
26
27 #define WORKAROUND_UNMAP_CONTEXT_SWITCH 1
28 #define USE_STREAM 1
29
30 namespace QtAV {
31 namespace cuda {
32
// Zero-initialize all interop state: no CUDA device/context/decoder/lock yet.
// Subclasses (host/GL/EGL paths) fill these in lazily on first map().
InteropResource::InteropResource()
    : cuda_api()
    , dev(0)
    , ctx(0)
    , dec(0)
    , lock(0)
{
    // res[] holds the per-plane CUDA graphics resources and streams; clear it so
    // the destructor can safely test res[i].cuRes / res[i].stream.
    memset(res, 0, sizeof(res));
}
42
InteropResource::~InteropResource()
{
    //CUDA_WARN(cuCtxPushCurrent(ctx)); //error invalid value
    // Unregister any graphics resources and destroy the streams created by the
    // mapping paths. Only entries that were actually created are touched.
    if (res[0].cuRes)
        CUDA_WARN(cuGraphicsUnregisterResource(res[0].cuRes));
    if (res[1].cuRes)
        CUDA_WARN(cuGraphicsUnregisterResource(res[1].cuRes));
    if (res[0].stream)
        CUDA_WARN(cuStreamDestroy(res[0].stream));
    if (res[1].stream)
        CUDA_WARN(cuStreamDestroy(res[1].stream));

    // FIXME: we own the context. But why crash to destroy ctx? CUDA_ERROR_INVALID_VALUE
    // Only destroy the context if we created it ourselves (share_ctx == false).
    if (!share_ctx && ctx)
        CUDA_ENSURE(cuCtxDestroy(ctx));
}
59
/*!
 * Download the decoded NV12 picture \a picIndex from the decoder's device
 * memory into the VideoFrame pointed to by \a handle, converting to
 * \a format when it differs from NV12.
 *
 * \a width/\a height are the display size; \a coded_height is the padded
 * surface height used for the device-side NV12 layout (Y plane followed by
 * the interleaved UV plane at offset pitch*coded_height).
 * Returns \a handle on success, NULL on a CUDA error (via CUDA_ENSURE).
 */
void* InteropResource::mapToHost(const VideoFormat &format, void *handle, int picIndex, const CUVIDPROCPARAMS &param, int width, int height, int coded_height)
{
    // Serialize access to the decoder/context with the cuvid lock.
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), NULL);
    // RAII: cuvidUnmapVideoFrame runs even on the early-return error paths below.
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    uchar* host_data = NULL;
    // NV12: full-height Y plane + half-height interleaved UV plane = 3/2.
    const unsigned int host_size = pitch*coded_height*3/2;
    CUDA_ENSURE(cuMemAllocHost((void**)&host_data, host_size), NULL);
    // copy to the memory not allocated by cuda is possible but much slower
    CUDA_ENSURE(cuMemcpyDtoH(host_data, devptr, host_size), NULL);

    // Wrap the pinned buffer as an NV12 frame (no copy yet).
    VideoFrame frame(width, height, VideoFormat::Format_NV12);
    uchar *planes[] = {
        host_data,
        host_data + pitch * coded_height // UV plane starts after the padded Y plane
    };
    frame.setBits(planes);
    int pitches[] = { (int)pitch, (int)pitch };
    frame.setBytesPerLine(pitches);

    // Preserve timing/aspect metadata from the caller's frame, then deep-copy
    // (clone/convert) out of the pinned buffer before it is freed below.
    VideoFrame *f = reinterpret_cast<VideoFrame*>(handle);
    frame.setTimestamp(f->timestamp());
    frame.setDisplayAspectRatio(f->displayAspectRatio());
    if (format == frame.format())
        *f = frame.clone();
    else
        *f = frame.to(format);

    CUDA_ENSURE(cuMemFreeHost(host_data), f);
    return f;
}
96
97 #ifndef QT_NO_OPENGL
HostInteropResource::HostInteropResource()
    : InteropResource()
{
    // No pinned host buffer yet; index -1 means "no picture cached".
    memset(&host_mem, 0, sizeof(host_mem));
    host_mem.index = -1;
}
104
HostInteropResource::~HostInteropResource()
{
    if (ctx) { //cuMemFreeHost need the context of mem allocated, it's shared context, or own context
        CUDA_WARN(cuCtxPushCurrent(ctx));
    }
    // Free the pinned buffer allocated in ensureResource().
    if (host_mem.data) { //FIXME: CUDA_ERROR_INVALID_VALUE
        CUDA_ENSURE(cuMemFreeHost(host_mem.data));
        host_mem.data = NULL;
    }
    if (ctx) {
        // Restore whatever context was current before the push above.
        CUDA_WARN(cuCtxPopCurrent(NULL));
    }
}
118
/*!
 * Download the decoded NV12 picture to pinned host memory (only when picture
 * \a picIndex is not already cached), then upload the requested \a plane
 * (0 = Y, 1 = interleaved UV) into the GL texture \a tex via glTexSubImage2D.
 * \a h is the display height, \a H the coded (padded) surface height; \a w is
 * unused because the upload width comes from the cached device pitch.
 */
bool HostInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    Q_UNUSED(w);
    // Device->host copy is done at most once per picture; both planes reuse it.
    if (host_mem.index != picIndex || !host_mem.data) {
        AutoCtxLock locker((cuda_api*)this, lock);
        Q_UNUSED(locker);

        CUdeviceptr devptr;
        unsigned int pitch;
        //qDebug("index: %d=>%d, plane: %d", host_mem.index, picIndex, plane);
        CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
        CUVIDAutoUnmapper unmapper(this, dec, devptr);
        Q_UNUSED(unmapper);
        if (!ensureResource(pitch, H)) //copy height is coded height
            return false;
        // the same thread (context) as cuMemAllocHost, so no context switch is needed
        CUDA_ENSURE(cuMemcpyDtoH(host_mem.data, devptr, pitch*H*3/2), false);
        host_mem.index = picIndex;
    }
    // map to texture
    //qDebug("map plane %d @%d", plane, picIndex);
    GLint iformat[2];
    GLenum format[2], dtype[2];
    OpenGLHelper::videoFormatToGL(VideoFormat::Format_NV12, iformat, format, dtype);
    DYGL(glBindTexture(GL_TEXTURE_2D, tex));
    const int chroma = plane != 0;
    // chroma pitch for gl is 1/2 (gl_rg)
    // texture height is not coded height!
    // UV data starts right after the padded Y plane (offset pitch*coded_height).
    DYGL(glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, host_mem.pitch>>chroma, h>>chroma, format[plane], dtype[plane], host_mem.data + chroma*host_mem.pitch*host_mem.height));
    //DYGL(glTexImage2D(GL_TEXTURE_2D, 0, iformat[plane], host_mem.pitch>>chroma, h>>chroma, 0, format[plane], dtype[plane], host_mem.data + chroma*host_mem.pitch*host_mem.height));
    return true;
}
151
// Nothing to release: the host-copy path uploads with glTexSubImage2D in map(),
// so no GPU resource stays mapped between map() and unmap().
bool HostInteropResource::unmap(GLuint)
{
    return true;
}
156
/*!
 * (Re)allocate the pinned NV12 host buffer for a \a pitch x \a height (coded
 * height) surface. Returns true if a matching buffer already exists.
 * Also lazily creates an own CUDA context (share_ctx = false) when none is set,
 * so the destructor can free the buffer under the right context.
 */
bool HostInteropResource::ensureResource(int pitch, int height)
{
    if (host_mem.data && host_mem.pitch == pitch && host_mem.height == height)
        return true;
    // Geometry changed: drop the old buffer first.
    if (host_mem.data) {
        CUDA_ENSURE(cuMemFreeHost(host_mem.data), false);
        host_mem.data = NULL;
    }
    qDebug("allocate cuda host mem. %dx%d=>%dx%d", host_mem.pitch, host_mem.height, pitch, height);
    host_mem.pitch = pitch;
    host_mem.height = height;
    if (!ctx) {
        CUDA_ENSURE(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), false);
        CUDA_WARN(cuCtxPopCurrent(&ctx));
        share_ctx = false;
    }
    if (!share_ctx) // cuMemFreeHost will be called in dtor which is not the current thread.
        CUDA_WARN(cuCtxPushCurrent(ctx));
    // NV12: Y plane + half-height interleaved UV plane = 3/2 of pitch*height.
    CUDA_ENSURE(cuMemAllocHost((void**)&host_mem.data, pitch*height*3/2), false);
    if (!share_ctx)
        CUDA_WARN(cuCtxPopCurrent(NULL)); //can be null or &ctx
    return true;
}
181 #endif //QT_NO_OPENGL
182
setSurface(int picIndex,CUVIDPROCPARAMS param,int width,int height,int surface_height)183 void SurfaceInteropCUDA::setSurface(int picIndex, CUVIDPROCPARAMS param, int width, int height, int surface_height)
184 {
185 m_index = picIndex;
186 m_param = param;
187 w = width;
188 h = height;
189 H = surface_height;
190 }
191
map(SurfaceType type,const VideoFormat & fmt,void * handle,int plane)192 void* SurfaceInteropCUDA::map(SurfaceType type, const VideoFormat &fmt, void *handle, int plane)
193 {
194 Q_UNUSED(fmt);
195 if (m_resource.isNull())
196 return NULL;
197 if (!handle)
198 return NULL;
199
200 if (m_index < 0)
201 return 0;
202 if (type == GLTextureSurface) {
203 #ifndef QT_NO_OPENGL
204 // FIXME: to strong ref may delay the delete and cuda resource maybe already destoryed after strong ref is finished
205 if (m_resource.toStrongRef()->map(m_index, m_param, *((GLuint*)handle), w, h, H, plane))
206 return handle;
207 #endif //QT_NO_OPENGL
208 } else if (type == HostMemorySurface) {
209 return m_resource.toStrongRef()->mapToHost(fmt, handle, m_index, m_param, w, h, H);
210 }
211 return NULL;
212 }
213
unmap(void * handle)214 void SurfaceInteropCUDA::unmap(void *handle)
215 {
216 if (m_resource.isNull())
217 return;
218 #ifndef QT_NO_OPENGL
219 // FIXME: to strong ref may delay the delete and cuda resource maybe already destoryed after strong ref is finished
220 m_resource.toStrongRef()->unmap(*((GLuint*)handle));
221 #endif
222 }
223 } //namespace cuda
224 } //namespace QtAV
225
226 #if QTAV_HAVE(CUDA_EGL)
227 #ifdef QT_OPENGL_ES_2_ANGLE_STATIC
228 #define CAPI_LINK_EGL
229 #else
230 #define EGL_CAPI_NS
231 #endif //QT_OPENGL_ES_2_ANGLE_STATIC
232 #include "capi/egl_api.h"
233 #include <EGL/eglext.h> //include after egl_capi.h to match types
234 #define DX_LOG_COMPONENT "CUDA.D3D"
235 #include "utils/DirectXHelper.h"
236
237 namespace QtAV {
238 namespace cuda {
// Small holder for the EGL objects used to expose a shared D3D9 surface to GL.
class EGL {
public:
    EGL() : dpy(EGL_NO_DISPLAY), surface(EGL_NO_SURFACE) {}
    EGLDisplay dpy;     // current EGL display, queried in ensureD3D9EGL()
    EGLSurface surface; //only support rgb. then we must use CUDA kernel
#ifdef EGL_VERSION_1_5
    // eglCreateImageKHR does not support EGL_NATIVE_PIXMAP_KHR, only 2d, 3d, render buffer
    //EGLImageKHR image[2];
    //EGLImage image[2]; //not implemented yet
#endif //EGL_VERSION_1_5
};
250
// All D3D9 members start NULL; they are created lazily in ensureD3DDevice()/
// ensureD3D9CUDA()/ensureD3D9EGL() on the first map().
EGLInteropResource::EGLInteropResource()
    : InteropResource()
    , egl(new EGL())
    , dll9(NULL)
    , d3d9(NULL)
    , device9(NULL)
    , texture9(NULL)
    , surface9(NULL)
    , texture9_nv12(NULL)
    , surface9_nv12(NULL)
    , query9(NULL)
{
    ctx = NULL; //need a context created with d3d (TODO: check it?)
    share_ctx = false;
}
266
EGLInteropResource::~EGLInteropResource()
{
    // Release in reverse creation order: EGL surface first (it references the
    // D3D share handle), then D3D objects, then the d3d9.dll module itself.
    releaseEGL();
    if (egl) {
        delete egl;
        egl = NULL;
    }
    SafeRelease(&query9);
    SafeRelease(&surface9_nv12);
    SafeRelease(&texture9_nv12);
    SafeRelease(&surface9);
    SafeRelease(&texture9);
    SafeRelease(&device9);
    SafeRelease(&d3d9);
    if (dll9)
        FreeLibrary(dll9);
}
284
ensureD3DDevice()285 bool EGLInteropResource::ensureD3DDevice()
286 {
287 if (device9)
288 return true;
289 if (!dll9)
290 dll9 = LoadLibrary(TEXT("D3D9.DLL"));
291 if (!dll9) {
292 qWarning("cuda::EGLInteropResource cannot load d3d9.dll");
293 return false;
294 }
295 D3DADAPTER_IDENTIFIER9 ai9;
296 ZeroMemory(&ai9, sizeof(ai9));
297 device9 = DXHelper::CreateDevice9Ex(dll9, (IDirect3D9Ex**)(&d3d9), &ai9);
298 if (!device9) {
299 qWarning("Failed to create d3d9 device ex, fallback to d3d9 device");
300 device9 = DXHelper::CreateDevice9(dll9, &d3d9, &ai9);
301 }
302 if (!device9)
303 return false;
304 qDebug() << QString().sprintf("CUDA.D3D9 (%.*s, vendor %lu, device %lu, revision %lu)",
305 sizeof(ai9.Description), ai9.Description,
306 ai9.VendorId, ai9.DeviceId, ai9.Revision);
307
308 // move to ensureResouce
309 DX_ENSURE(device9->CreateQuery(D3DQUERYTYPE_EVENT, &query9), false);
310 query9->Issue(D3DISSUE_END);
311 return !!device9;
312 }
313
releaseEGL()314 void EGLInteropResource::releaseEGL() {
315 if (egl->surface != EGL_NO_SURFACE) {
316 eglReleaseTexImage(egl->dpy, egl->surface, EGL_BACK_BUFFER);
317 eglDestroySurface(egl->dpy, egl->surface);
318 egl->surface = EGL_NO_SURFACE;
319 }
320 }
321
/*!
 * Make sure both the D3D9<->CUDA and the D3D9<->EGL resources exist for a
 * w x h frame (W x H being the coded/padded surface size) and remember the GL
 * texture. On failure, everything partially created is released so a later
 * call can retry from scratch.
 */
bool EGLInteropResource::ensureResource(int w, int h, int W, int H, GLuint tex)
{
    TexRes &r = res[0];// 1 NV12 texture
    if (ensureD3D9CUDA(w, h, W, H) && ensureD3D9EGL(w, h)) {
        r.texture = tex;
        r.w = w;
        r.h = h;
        r.W = W;
        r.H = H;
        return true;
    }
    // Failure: tear down so the next attempt starts clean.
    releaseEGL();
    //releaseDX();
    SafeRelease(&query9);
    SafeRelease(&surface9);
    SafeRelease(&texture9);
    SafeRelease(&surface9_nv12);
    SafeRelease(&texture9_nv12);
    return false;
}
342
/*!
 * Create (or reuse) the CUDA context bound to the D3D9 device and the D3D9
 * texture that CUDA copies the decoded NV12 data into.
 * The texture is L8 with height H*3/2 so the Y plane and the interleaved UV
 * plane are stacked in one 8-bit surface (NV12 itself cannot be created here).
 */
bool EGLInteropResource::ensureD3D9CUDA(int w, int h, int W, int H)
{
    TexRes &r = res[0];// 1 NV12 texture
    if (r.w == w && r.h == h && r.W == W && r.H == H && r.cuRes)
        return true;
    // This path needs its own context created from the D3D device; a context
    // shared with the decoder cannot be used here.
    if (share_ctx) {
        share_ctx = false;
        ctx = NULL;
    }
    if (!ctx) {
        // TODO: how to use pop/push decoder's context without the context in opengl context
        if (!ensureD3DDevice())
            return false;
        // CUdevice is different from decoder's
        CUDA_ENSURE(cuD3D9CtxCreate(&ctx, &dev, CU_CTX_SCHED_BLOCKING_SYNC, device9), false);
#if USE_STREAM
        CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
        CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
#endif //USE_STREAM
        qDebug("cuda contex on gl thread: %p", ctx);
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
    }
    // Geometry changed: re-register the D3D resource with CUDA below.
    if (r.cuRes) {
        CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
        r.cuRes = NULL;
    }

    // create d3d resource for interop
    if (!surface9_nv12) {
        // TODO: need pitch from cuvid to ensure cuMemcpy2D can copy the whole pitch
        DX_ENSURE(device9->CreateTexture(W
                                         //, H
                                         , H*3/2
                                         , 1
                                         , D3DUSAGE_DYNAMIC //D3DUSAGE_DYNAMIC is lockable // 0 is from NV example. cudaD3D9.h says The primary rendertarget may not be registered with CUDA. So can not be D3DUSAGE_RENDERTARGET?
                                         //, D3DUSAGE_RENDERTARGET
                                         , D3DFMT_L8
                                         //, (D3DFORMAT)MAKEFOURCC('N','V','1','2') // can not create nv12. use 2 textures L8+A8L8?
                                         , D3DPOOL_DEFAULT // must be D3DPOOL_DEFAULT for cuda?
                                         , &texture9_nv12
                                         , NULL) // - Resources allocated as shared may not be registered with CUDA.
                  , false);
        DX_ENSURE(device9->CreateOffscreenPlainSurface(W, H, (D3DFORMAT)MAKEFOURCC('N','V','1','2'), D3DPOOL_DEFAULT, &surface9_nv12, NULL), false); //TODO: createrendertarget
    }

    // TODO: cudaD3D9.h says NV12 is not supported
    // CUDA_ERROR_INVALID_HANDLE if register D3D9 surface
    // TODO: why flag CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD is invalid while it's fine for opengl
    CUDA_ENSURE(cuGraphicsD3D9RegisterResource(&r.cuRes, texture9_nv12, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
    return true;
}
394
/*!
 * Create the EGL pbuffer surface backed by a shared D3D9 RGB(A) texture so GL
 * can sample the frame. Two ANGLE paths are supported:
 *  - EGL_ANGLE_query_surface_pointer: create the pbuffer first, query its D3D
 *    share handle, then create the D3D texture on that handle;
 *  - EGL_ANGLE_d3d_share_handle_client_buffer: create the D3D texture first,
 *    then wrap its share handle as a pbuffer.
 */
bool EGLInteropResource::ensureD3D9EGL(int w, int h) {
    if (egl->surface && res[0].w == w && res[0].h == h)
        return true;
    releaseEGL();
    egl->dpy = eglGetCurrentDisplay();
    qDebug("EGL version: %s, client api: %s", eglQueryString(egl->dpy, EGL_VERSION), eglQueryString(egl->dpy, EGL_CLIENT_APIS));
    EGLint cfg_attribs[] = {
        EGL_RED_SIZE, 8,
        EGL_GREEN_SIZE, 8,
        EGL_BLUE_SIZE, 8,
        EGL_ALPHA_SIZE, 8, //
        EGL_BIND_TO_TEXTURE_RGBA, EGL_TRUE, //remove?
        EGL_SURFACE_TYPE, EGL_PBUFFER_BIT,
        EGL_NONE
    };
    EGLint nb_cfgs;
    EGLConfig egl_cfg;
    if (!eglChooseConfig(egl->dpy, cfg_attribs, &egl_cfg, 1, &nb_cfgs)) {
        qWarning("Failed to create EGL configuration");
        return false;
    }
    // check extensions
    QList<QByteArray> extensions = QByteArray(eglQueryString(egl->dpy, EGL_EXTENSIONS)).split(' ');
    // ANGLE_d3d_share_handle_client_buffer will be used if possible
    const bool kEGL_ANGLE_d3d_share_handle_client_buffer = extensions.contains("EGL_ANGLE_d3d_share_handle_client_buffer");
    const bool kEGL_ANGLE_query_surface_pointer = extensions.contains("EGL_ANGLE_query_surface_pointer");
    if (!kEGL_ANGLE_d3d_share_handle_client_buffer && !kEGL_ANGLE_query_surface_pointer) {
        qWarning("EGL extension 'kEGL_ANGLE_query_surface_pointer' or 'ANGLE_d3d_share_handle_client_buffer' is required!");
        return false;
    }
    GLint has_alpha = 1; //QOpenGLContext::currentContext()->format().hasAlpha()
    eglGetConfigAttrib(egl->dpy, egl_cfg, EGL_BIND_TO_TEXTURE_RGBA, &has_alpha); //EGL_ALPHA_SIZE
    qDebug("choose egl display:%p config: %p/%d, has alpha: %d", egl->dpy, egl_cfg, nb_cfgs, has_alpha);
    EGLint attribs[] = {
        EGL_WIDTH, w,
        EGL_HEIGHT, h,
        EGL_TEXTURE_FORMAT, has_alpha ? EGL_TEXTURE_RGBA : EGL_TEXTURE_RGB,
        EGL_TEXTURE_TARGET, EGL_TEXTURE_2D,
        EGL_NONE
    };

    HANDLE share_handle = NULL;
    // Path 1: query the share handle from an already-created pbuffer.
    if (!kEGL_ANGLE_d3d_share_handle_client_buffer && kEGL_ANGLE_query_surface_pointer) {
        EGL_ENSURE((egl->surface = eglCreatePbufferSurface(egl->dpy, egl_cfg, attribs)) != EGL_NO_SURFACE, false);
        qDebug("pbuffer surface: %p", egl->surface);
        PFNEGLQUERYSURFACEPOINTERANGLEPROC eglQuerySurfacePointerANGLE = reinterpret_cast<PFNEGLQUERYSURFACEPOINTERANGLEPROC>(eglGetProcAddress("eglQuerySurfacePointerANGLE"));
        if (!eglQuerySurfacePointerANGLE) {
            qWarning("EGL_ANGLE_query_surface_pointer is not supported");
            return false;
        }
        EGL_ENSURE(eglQuerySurfacePointerANGLE(egl->dpy, egl->surface, EGL_D3D_TEXTURE_2D_SHARE_HANDLE_ANGLE, &share_handle), false);
    }

    SafeRelease(&surface9);
    SafeRelease(&texture9);
    // _A8 for a yuv plane
    /*
     * d3d resource share requires windows >= vista: https://msdn.microsoft.com/en-us/library/windows/desktop/bb219800(v=vs.85).aspx
     * from extension files:
     * d3d9: level must be 1, dimensions must match EGL surface's
     * d3d9ex or d3d10:
     */
    DX_ENSURE(device9->CreateTexture(w, h, 1,
                                     D3DUSAGE_RENDERTARGET,
                                     has_alpha ? D3DFMT_A8R8G8B8 : D3DFMT_X8R8G8B8,
                                     D3DPOOL_DEFAULT,
                                     &texture9,
                                     &share_handle) , false);
    DX_ENSURE(texture9->GetSurfaceLevel(0, &surface9), false);

    // Path 2: wrap the D3D texture's share handle as a client-buffer pbuffer.
    if (kEGL_ANGLE_d3d_share_handle_client_buffer) {
        // requires extension EGL_ANGLE_d3d_share_handle_client_buffer
        // egl surface size must match d3d texture's
        // d3d9ex or d3d10 is required
        EGL_ENSURE((egl->surface = eglCreatePbufferFromClientBuffer(egl->dpy, EGL_D3D_TEXTURE_2D_SHARE_HANDLE_ANGLE, share_handle, egl_cfg, attribs)), false);
        qDebug("pbuffer surface from client buffer: %p", egl->surface);
    }
    return true;
}
474
/*!
 * Map the decoded picture to the GL texture through the D3D9/EGL bridge:
 * 1) cuvid-map the picture, 2) cuMemcpy2D it into the CUDA-registered L8
 *    texture9_nv12 (Y + UV stacked, height H*3/2),
 * 3) CPU-copy the locked bits into the NV12 offscreen surface9_nv12,
 * 4) StretchRect+EGL-bind it to \a tex in the overload below.
 */
bool EGLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    // plane is always 0 because frame is rgb
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    if (!ensureResource(w, h, param.Reserved[0], H, tex)) // TODO surface size instead of frame size because we copy the device data
        return false;
    //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
    CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
    CUarray array;
    CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);
    CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false); // mapped array still accessible!

    CUDA_MEMCPY2D cu2d;
    memset(&cu2d, 0, sizeof(cu2d));
    // Y plane
    cu2d.srcDevice = devptr;
    cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cu2d.srcPitch = pitch;
    cu2d.dstArray = array;
    cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cu2d.dstPitch = pitch;
    // the whole size or copy size?
    cu2d.WidthInBytes = res[plane].W; // the same value as texture9_nv12
    cu2d.Height = H*3/2; // Y plane + interleaved UV plane in one copy
    if (res[plane].stream)
        CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
    else
        CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
    //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)?
    // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error
    // so I simply unmap the resource here
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
        if (res[plane].stream) {
            //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery?
            CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize
        }
        /*
         * This function provides the synchronization guarantee that any CUDA work issued
         * in \p stream before ::cuGraphicsUnmapResources() will complete before any
         * subsequently issued graphics work begins.
         * The graphics API from which \p resources were registered
         * should not access any resources while they are mapped by CUDA. If an
         * application does so, the results are undefined.
         */
//        CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    }
    // NOTE(review): this memcpy assumes rect_src.Pitch and rect_dst.Pitch both
    // equal res[plane].W — TODO confirm; D3D lock pitches may be padded.
    D3DLOCKED_RECT rect_src, rect_dst;
    DX_ENSURE(texture9_nv12->LockRect(0, &rect_src, NULL, D3DLOCK_READONLY), false);
    DX_ENSURE(surface9_nv12->LockRect(&rect_dst, NULL, D3DLOCK_DISCARD), false);
    memcpy(rect_dst.pBits, rect_src.pBits, res[plane].W*H*3/2); // exactly w and h
    DX_ENSURE(surface9_nv12->UnlockRect(), false);
    DX_ENSURE(texture9_nv12->UnlockRect(0), false);
#if 0
    // NOTE: disabled path references raw_surface, whose declaration is commented out.
    //IDirect3DSurface9 *raw_surface = NULL;
    //DX_ENSURE(texture9_nv12->GetSurfaceLevel(0, &raw_surface), false);
    const RECT src = { 0, 0, (~0-1)&w, (~0-1)&(h*3/2)};
    DX_ENSURE(device9->StretchRect(raw_surface, &src, surface9_nv12, NULL, D3DTEXF_NONE), false);
#endif
    if (!map(surface9_nv12, tex, w, h, H))
        return false;
    return true;
}
545
/*!
 * Blit \a surface (NV12 offscreen surface) onto the EGL-shared RGB surface9,
 * flush the D3D pipeline via the event query, then bind the EGL pbuffer to
 * the GL texture \a tex with eglBindTexImage.
 */
bool EGLInteropResource::map(IDirect3DSurface9* surface, GLuint tex, int w, int h, int H)
{
    Q_UNUSED(H);
    D3DSURFACE_DESC dxvaDesc;
    surface->GetDesc(&dxvaDesc);
    const RECT src = { 0, 0, (~0-1)&w, (~0-1)&h}; //StretchRect does not supports odd values
    DX_ENSURE(device9->StretchRect(surface, &src, surface9, NULL, D3DTEXF_NONE), false);
    if (query9) {
        // Flush the draw command now. Ideally, this should be done immediately before the draw call that uses the texture. Flush it once here though.
        query9->Issue(D3DISSUE_END);
        // ensure data is copied to egl surface. Solution and comment is from chromium
        // The DXVA decoder has its own device which it uses for decoding. ANGLE has its own device which we don't have access to.
        // The above code attempts to copy the decoded picture into a surface which is owned by ANGLE.
        // As there are multiple devices involved in this, the StretchRect call above is not synchronous.
        // We attempt to flush the batched operations to ensure that the picture is copied to the surface owned by ANGLE.
        // We need to do this in a loop and call flush multiple times.
        // We have seen the GetData call for flushing the command buffer fail to return success occassionally on multi core machines, leading to an infinite loop.
        // Workaround is to have an upper limit of 10 on the number of iterations to wait for the Flush to finish.
        int k = 0;
        // skip at decoder.close()
        while (/*!skip_dx.load() && */(query9->GetData(NULL, 0, D3DGETDATA_FLUSH) == FALSE) && ++k < 10) {
            Sleep(1);
        }
    }
    // Attach the pbuffer (now containing the frame) to the GL texture.
    DYGL(glBindTexture(GL_TEXTURE_2D, tex));
    eglBindTexImage(egl->dpy, egl->surface, EGL_BACK_BUFFER);
    DYGL(glBindTexture(GL_TEXTURE_2D, 0));
    return true;
}
575
576 } //namespace cuda
577 } //namespace QtAV
578 #endif //QTAV_HAVE(CUDA_EGL)
579 #if QTAV_HAVE(CUDA_GL)
580 namespace QtAV {
581 namespace cuda {
582 //TODO: cuGLMapBufferObject: get cudeviceptr from pbo, then memcpy2d
/*!
 * Copy the decoded picture \a picIndex into the CUDA-registered GL texture
 * \a tex with cuMemcpy2D. \a w/\a h are the plane copy size, \a H is the coded
 * (padded) surface height used to locate the UV plane inside the decoder's
 * NV12 surface; \a plane is 0 (Y) or 1 (interleaved UV).
 */
bool GLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    if (!ensureResource(w, h, H, tex, plane)) // TODO surface size instead of frame size because we copy the device data
        return false;
    //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
    CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
    CUarray array;
    CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);

    // Device (decoder surface) -> GL texture array copy.
    CUDA_MEMCPY2D cu2d;
    memset(&cu2d, 0, sizeof(cu2d));
    cu2d.srcDevice = devptr;
    cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cu2d.srcPitch = pitch;
    cu2d.dstArray = array;
    cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cu2d.dstPitch = pitch;
    // the whole size or copy size?
    cu2d.WidthInBytes = pitch;
    cu2d.Height = h;
    if (plane == 1) {
        // UV plane: starts below the padded Y plane and is half the height.
        cu2d.srcXInBytes = 0;// +srcY*srcPitch + srcXInBytes
        cu2d.srcY = H; // skip the padding height
        cu2d.Height /= 2;
    }
    if (res[plane].stream)
        CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
    else
        CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
    //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)?
    // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error
    // so I simply unmap the resource here
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
        if (res[plane].stream) {
            //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery?
            CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize
        }
        /*
         * This function provides the synchronization guarantee that any CUDA work issued
         * in \p stream before ::cuGraphicsUnmapResources() will complete before any
         * subsequently issued graphics work begins.
         * The graphics API from which \p resources were registered
         * should not access any resources while they are mapped by CUDA. If an
         * application does so, the results are undefined.
         */
        CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    } else {
        // call it at last. current context will be used by other cuda calls (unmap() for example)
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // not required
    }
    return true;
}
644
/*!
 * Unmap the CUDA graphics resource for the plane registered to \a tex.
 * When WORKAROUND_UNMAP_CONTEXT_SWITCH is enabled, map() already unmapped the
 * resource, so this is a no-op returning true.
 */
bool GLInteropResource::unmap(GLuint tex)
{
    Q_UNUSED(tex);
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH)
        return true;
    // Find which plane this texture belongs to.
    int plane = -1;
    if (res[0].texture == tex)
        plane = 0;
    else if (res[1].texture == tex)
        plane = 1;
    else
        return false;
    // FIXME: why cuCtxPushCurrent gives CUDA_ERROR_INVALID_CONTEXT if opengl viewport changed?
    CUDA_WARN(cuCtxPushCurrent(ctx));
    CUDA_WARN(cuStreamSynchronize(res[plane].stream));
    // FIXME: need a correct context. But why we have to push context even though map/unmap are called in the same thread
    // Because the decoder switch the context in another thread so we have to switch the context back?
    // to workaround the context issue, we must pop the context that valid in map() and push it here
    CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    CUDA_ENSURE(cuCtxPopCurrent(&ctx), false);
    return true;
}
667
/*!
 * Make sure a CUDA context exists on the GL thread and that the GL texture
 * \a tex for \a plane is registered with CUDA for the given geometry.
 * Re-registers the resource whenever the texture or sizes change.
 */
bool GLInteropResource::ensureResource(int w, int h, int H, GLuint tex, int plane)
{
    Q_ASSERT(plane < 2 && "plane number must be 0 or 1 for NV12");
    TexRes &r = res[plane];
    if (r.texture == tex && r.w == w && r.h == h && r.H == H && r.cuRes)
        return true;
    if (!ctx) {
        // TODO: how to use pop/push decoder's context without the context in opengl context
        CUDA_ENSURE(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), false);
        if (USE_STREAM) {
            CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
            CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
        }
        qDebug("cuda contex on gl thread: %p", ctx);
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
    }
    // Texture or geometry changed: drop the old registration first.
    if (r.cuRes) {
        CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
        r.cuRes = NULL;
    }
    // CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD works too for opengl, but not d3d
    CUDA_ENSURE(cuGraphicsGLRegisterImage(&r.cuRes, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
    r.texture = tex;
    r.w = w;
    r.h = h;
    r.H = H;
    return true;
}
696 } //namespace cuda
697 } //namespace QtAV
698 #endif //QTAV_HAVE(CUDA_GL)
699