1 /******************************************************************************
2     QtAV:  Multimedia framework based on Qt and FFmpeg
3     Copyright (C) 2012-2016 Wang Bin <wbsecg1@gmail.com>
4 
5 *   This file is part of QtAV (from 2015)
6 
7     This library is free software; you can redistribute it and/or
8     modify it under the terms of the GNU Lesser General Public
9     License as published by the Free Software Foundation; either
10     version 2.1 of the License, or (at your option) any later version.
11 
12     This library is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15     Lesser General Public License for more details.
16 
17     You should have received a copy of the GNU Lesser General Public
18     License along with this library; if not, write to the Free Software
19     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
20 ******************************************************************************/
21 
22 #include "SurfaceInteropCUDA.h"
23 #include "QtAV/VideoFrame.h"
24 #include "utils/Logger.h"
25 #include "helper_cuda.h"
26 
27 #define WORKAROUND_UNMAP_CONTEXT_SWITCH 1
28 #define USE_STREAM 1
29 
30 namespace QtAV {
31 namespace cuda {
32 
InteropResource()33 InteropResource::InteropResource()
34     : cuda_api()
35     , dev(0)
36     , ctx(0)
37     , dec(0)
38     , lock(0)
39 {
40     memset(res, 0, sizeof(res));
41 }
42 
~InteropResource()43 InteropResource::~InteropResource()
44 {
45     //CUDA_WARN(cuCtxPushCurrent(ctx)); //error invalid value
46     if (res[0].cuRes)
47         CUDA_WARN(cuGraphicsUnregisterResource(res[0].cuRes));
48     if (res[1].cuRes)
49         CUDA_WARN(cuGraphicsUnregisterResource(res[1].cuRes));
50     if (res[0].stream)
51         CUDA_WARN(cuStreamDestroy(res[0].stream));
52     if (res[1].stream)
53         CUDA_WARN(cuStreamDestroy(res[1].stream));
54 
55     // FIXME: we own the context. But why crash to destroy ctx? CUDA_ERROR_INVALID_VALUE
56     if (!share_ctx && ctx)
57         CUDA_ENSURE(cuCtxDestroy(ctx));
58 }
59 
mapToHost(const VideoFormat & format,void * handle,int picIndex,const CUVIDPROCPARAMS & param,int width,int height,int coded_height)60 void* InteropResource::mapToHost(const VideoFormat &format, void *handle, int picIndex, const CUVIDPROCPARAMS &param, int width, int height, int coded_height)
61 {
62     AutoCtxLock locker((cuda_api*)this, lock);
63     Q_UNUSED(locker);
64     CUdeviceptr devptr;
65     unsigned int pitch;
66 
67     CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), NULL);
68     CUVIDAutoUnmapper unmapper(this, dec, devptr);
69     Q_UNUSED(unmapper);
70     uchar* host_data = NULL;
71     const unsigned int host_size = pitch*coded_height*3/2;
72     CUDA_ENSURE(cuMemAllocHost((void**)&host_data, host_size), NULL);
73     // copy to the memory not allocated by cuda is possible but much slower
74     CUDA_ENSURE(cuMemcpyDtoH(host_data, devptr, host_size), NULL);
75 
76     VideoFrame frame(width, height, VideoFormat::Format_NV12);
77     uchar *planes[] = {
78         host_data,
79         host_data + pitch * coded_height
80     };
81     frame.setBits(planes);
82     int pitches[] = { (int)pitch, (int)pitch };
83     frame.setBytesPerLine(pitches);
84 
85     VideoFrame *f = reinterpret_cast<VideoFrame*>(handle);
86     frame.setTimestamp(f->timestamp());
87     frame.setDisplayAspectRatio(f->displayAspectRatio());
88     if (format == frame.format())
89         *f = frame.clone();
90     else
91         *f = frame.to(format);
92 
93     CUDA_ENSURE(cuMemFreeHost(host_data), f);
94     return f;
95 }
96 
97 #ifndef QT_NO_OPENGL
HostInteropResource()98 HostInteropResource::HostInteropResource()
99     : InteropResource()
100 {
101     memset(&host_mem, 0, sizeof(host_mem));
102     host_mem.index = -1;
103 }
104 
~HostInteropResource()105 HostInteropResource::~HostInteropResource()
106 {
107     if (ctx) { //cuMemFreeHost need the context of mem allocated, it's shared context, or own context
108         CUDA_WARN(cuCtxPushCurrent(ctx));
109     }
110     if (host_mem.data) { //FIXME: CUDA_ERROR_INVALID_VALUE
111         CUDA_ENSURE(cuMemFreeHost(host_mem.data));
112         host_mem.data = NULL;
113     }
114     if (ctx) {
115         CUDA_WARN(cuCtxPopCurrent(NULL));
116     }
117 }
118 
map(int picIndex,const CUVIDPROCPARAMS & param,GLuint tex,int w,int h,int H,int plane)119 bool HostInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
120 {
121     Q_UNUSED(w);
122     if (host_mem.index != picIndex || !host_mem.data) {
123         AutoCtxLock locker((cuda_api*)this, lock);
124         Q_UNUSED(locker);
125 
126         CUdeviceptr devptr;
127         unsigned int pitch;
128         //qDebug("index: %d=>%d, plane: %d", host_mem.index, picIndex, plane);
129         CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
130         CUVIDAutoUnmapper unmapper(this, dec, devptr);
131         Q_UNUSED(unmapper);
132         if (!ensureResource(pitch, H)) //copy height is coded height
133             return false;
134         // the same thread (context) as cuMemAllocHost, so no ccontext switch is needed
135         CUDA_ENSURE(cuMemcpyDtoH(host_mem.data, devptr, pitch*H*3/2), false);
136         host_mem.index = picIndex;
137     }
138     // map to texture
139     //qDebug("map plane %d @%d", plane, picIndex);
140     GLint iformat[2];
141     GLenum format[2], dtype[2];
142     OpenGLHelper::videoFormatToGL(VideoFormat::Format_NV12, iformat, format, dtype);
143     DYGL(glBindTexture(GL_TEXTURE_2D, tex));
144     const int chroma = plane != 0;
145     // chroma pitch for gl is 1/2 (gl_rg)
146     // texture height is not coded height!
147     DYGL(glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, host_mem.pitch>>chroma, h>>chroma, format[plane], dtype[plane], host_mem.data + chroma*host_mem.pitch*host_mem.height));
148     //DYGL(glTexImage2D(GL_TEXTURE_2D, 0, iformat[plane], host_mem.pitch>>chroma, h>>chroma, 0, format[plane], dtype[plane], host_mem.data + chroma*host_mem.pitch*host_mem.height));
149     return true;
150 }
151 
unmap(GLuint)152 bool HostInteropResource::unmap(GLuint)
153 {
154     return true;
155 }
156 
ensureResource(int pitch,int height)157 bool HostInteropResource::ensureResource(int pitch, int height)
158 {
159     if (host_mem.data && host_mem.pitch == pitch && host_mem.height == height)
160         return true;
161     if (host_mem.data) {
162         CUDA_ENSURE(cuMemFreeHost(host_mem.data), false);
163         host_mem.data = NULL;
164     }
165     qDebug("allocate cuda host mem. %dx%d=>%dx%d", host_mem.pitch, host_mem.height, pitch, height);
166     host_mem.pitch = pitch;
167     host_mem.height = height;
168     if (!ctx) {
169         CUDA_ENSURE(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), false);
170         CUDA_WARN(cuCtxPopCurrent(&ctx));
171         share_ctx = false;
172     }
173     if (!share_ctx) // cuMemFreeHost will be called in dtor which is not the current thread.
174         CUDA_WARN(cuCtxPushCurrent(ctx));
175     // NV12
176     CUDA_ENSURE(cuMemAllocHost((void**)&host_mem.data, pitch*height*3/2), false);
177     if (!share_ctx)
178         CUDA_WARN(cuCtxPopCurrent(NULL)); //can be null or &ctx
179     return true;
180 }
181 #endif //QT_NO_OPENGL
182 
setSurface(int picIndex,CUVIDPROCPARAMS param,int width,int height,int surface_height)183 void SurfaceInteropCUDA::setSurface(int picIndex, CUVIDPROCPARAMS param, int width, int height, int surface_height)
184 {
185     m_index = picIndex;
186     m_param = param;
187     w = width;
188     h = height;
189     H = surface_height;
190 }
191 
map(SurfaceType type,const VideoFormat & fmt,void * handle,int plane)192 void* SurfaceInteropCUDA::map(SurfaceType type, const VideoFormat &fmt, void *handle, int plane)
193 {
194     Q_UNUSED(fmt);
195     if (m_resource.isNull())
196         return NULL;
197     if (!handle)
198         return NULL;
199 
200     if (m_index < 0)
201         return 0;
202     if (type == GLTextureSurface) {
203 #ifndef QT_NO_OPENGL
204         // FIXME: to strong ref may delay the delete and cuda resource maybe already destoryed after strong ref is finished
205         if (m_resource.toStrongRef()->map(m_index, m_param, *((GLuint*)handle), w, h, H, plane))
206             return handle;
207 #endif //QT_NO_OPENGL
208     } else if (type == HostMemorySurface) {
209         return m_resource.toStrongRef()->mapToHost(fmt, handle, m_index, m_param, w, h, H);
210     }
211     return NULL;
212 }
213 
unmap(void * handle)214 void SurfaceInteropCUDA::unmap(void *handle)
215 {
216     if (m_resource.isNull())
217         return;
218 #ifndef QT_NO_OPENGL
219     // FIXME: to strong ref may delay the delete and cuda resource maybe already destoryed after strong ref is finished
220     m_resource.toStrongRef()->unmap(*((GLuint*)handle));
221 #endif
222 }
223 } //namespace cuda
224 } //namespace QtAV
225 
226 #if QTAV_HAVE(CUDA_EGL)
227 #ifdef QT_OPENGL_ES_2_ANGLE_STATIC
228 #define CAPI_LINK_EGL
229 #else
230 #define EGL_CAPI_NS
231 #endif //QT_OPENGL_ES_2_ANGLE_STATIC
232 #include "capi/egl_api.h"
233 #include <EGL/eglext.h> //include after egl_capi.h to match types
234 #define DX_LOG_COMPONENT "CUDA.D3D"
235 #include "utils/DirectXHelper.h"
236 
237 namespace QtAV {
238 namespace cuda {
239 class EGL {
240 public:
EGL()241     EGL() : dpy(EGL_NO_DISPLAY), surface(EGL_NO_SURFACE) {}
242     EGLDisplay dpy;
243     EGLSurface surface; //only support rgb. then we must use CUDA kernel
244 #ifdef EGL_VERSION_1_5
245     // eglCreateImageKHR does not support EGL_NATIVE_PIXMAP_KHR, only 2d, 3d, render buffer
246     //EGLImageKHR image[2];
247     //EGLImage image[2]; //not implemented yet
248 #endif //EGL_VERSION_1_5
249 };
250 
EGLInteropResource()251 EGLInteropResource::EGLInteropResource()
252     : InteropResource()
253     , egl(new EGL())
254     , dll9(NULL)
255     , d3d9(NULL)
256     , device9(NULL)
257     , texture9(NULL)
258     , surface9(NULL)
259     , texture9_nv12(NULL)
260     , surface9_nv12(NULL)
261     , query9(NULL)
262 {
263     ctx = NULL; //need a context created with d3d (TODO: check it?)
264     share_ctx = false;
265 }
266 
~EGLInteropResource()267 EGLInteropResource::~EGLInteropResource()
268 {
269     releaseEGL();
270     if (egl) {
271         delete egl;
272         egl = NULL;
273     }
274     SafeRelease(&query9);
275     SafeRelease(&surface9_nv12);
276     SafeRelease(&texture9_nv12);
277     SafeRelease(&surface9);
278     SafeRelease(&texture9);
279     SafeRelease(&device9);
280     SafeRelease(&d3d9);
281     if (dll9)
282         FreeLibrary(dll9);
283 }
284 
ensureD3DDevice()285 bool EGLInteropResource::ensureD3DDevice()
286 {
287     if (device9)
288         return true;
289     if (!dll9)
290         dll9 = LoadLibrary(TEXT("D3D9.DLL"));
291     if (!dll9) {
292         qWarning("cuda::EGLInteropResource cannot load d3d9.dll");
293         return false;
294     }
295     D3DADAPTER_IDENTIFIER9 ai9;
296     ZeroMemory(&ai9, sizeof(ai9));
297     device9 = DXHelper::CreateDevice9Ex(dll9, (IDirect3D9Ex**)(&d3d9), &ai9);
298     if (!device9) {
299         qWarning("Failed to create d3d9 device ex, fallback to d3d9 device");
300         device9 = DXHelper::CreateDevice9(dll9, &d3d9, &ai9);
301     }
302     if (!device9)
303         return false;
304     qDebug() << QString().sprintf("CUDA.D3D9 (%.*s, vendor %lu, device %lu, revision %lu)",
305                                     sizeof(ai9.Description), ai9.Description,
306                                     ai9.VendorId, ai9.DeviceId, ai9.Revision);
307 
308     // move to ensureResouce
309     DX_ENSURE(device9->CreateQuery(D3DQUERYTYPE_EVENT, &query9), false);
310     query9->Issue(D3DISSUE_END);
311     return !!device9;
312 }
313 
releaseEGL()314 void EGLInteropResource::releaseEGL() {
315     if (egl->surface != EGL_NO_SURFACE) {
316         eglReleaseTexImage(egl->dpy, egl->surface, EGL_BACK_BUFFER);
317         eglDestroySurface(egl->dpy, egl->surface);
318         egl->surface = EGL_NO_SURFACE;
319     }
320 }
321 
ensureResource(int w,int h,int W,int H,GLuint tex)322 bool EGLInteropResource::ensureResource(int w, int h, int W, int H, GLuint tex)
323 {
324     TexRes &r = res[0];// 1 NV12 texture
325     if (ensureD3D9CUDA(w, h, W, H) && ensureD3D9EGL(w, h)) {
326         r.texture = tex;
327         r.w = w;
328         r.h = h;
329         r.W = W;
330         r.H = H;
331         return true;
332     }
333     releaseEGL();
334     //releaseDX();
335     SafeRelease(&query9);
336     SafeRelease(&surface9);
337     SafeRelease(&texture9);
338     SafeRelease(&surface9_nv12);
339     SafeRelease(&texture9_nv12);
340     return false;
341 }
342 
ensureD3D9CUDA(int w,int h,int W,int H)343 bool EGLInteropResource::ensureD3D9CUDA(int w, int h, int W, int H)
344 {
345     TexRes &r = res[0];// 1 NV12 texture
346     if (r.w == w && r.h == h && r.W == W && r.H == H && r.cuRes)
347         return true;
348     if (share_ctx) {
349         share_ctx = false;
350         ctx = NULL;
351     }
352     if (!ctx) {
353         // TODO: how to use pop/push decoder's context without the context in opengl context
354         if (!ensureD3DDevice())
355             return false;
356         // CUdevice is different from decoder's
357         CUDA_ENSURE(cuD3D9CtxCreate(&ctx, &dev, CU_CTX_SCHED_BLOCKING_SYNC, device9), false);
358 #if USE_STREAM
359         CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
360         CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
361 #endif //USE_STREAM
362         qDebug("cuda contex on gl thread: %p", ctx);
363         CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
364     }
365     if (r.cuRes) {
366         CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
367         r.cuRes = NULL;
368     }
369 
370     // create d3d resource for interop
371     if (!surface9_nv12) {
372         // TODO: need pitch from cuvid to ensure cuMemcpy2D can copy the whole pitch
373         DX_ENSURE(device9->CreateTexture(W
374                                          //, H
375                                          , H*3/2
376                                          , 1
377                                          , D3DUSAGE_DYNAMIC //D3DUSAGE_DYNAMIC is lockable // 0 is from NV example. cudaD3D9.h says The primary rendertarget may not be registered with CUDA. So can not be D3DUSAGE_RENDERTARGET?
378                                          //, D3DUSAGE_RENDERTARGET
379                                          , D3DFMT_L8
380                                          //, (D3DFORMAT)MAKEFOURCC('N','V','1','2') // can not create nv12. use 2 textures L8+A8L8?
381                                          , D3DPOOL_DEFAULT // must be D3DPOOL_DEFAULT for cuda?
382                                          , &texture9_nv12
383                                          , NULL) // - Resources allocated as shared may not be registered with CUDA.
384                   , false);
385         DX_ENSURE(device9->CreateOffscreenPlainSurface(W, H, (D3DFORMAT)MAKEFOURCC('N','V','1','2'), D3DPOOL_DEFAULT, &surface9_nv12, NULL), false); //TODO: createrendertarget
386     }
387 
388     // TODO: cudaD3D9.h says NV12 is not supported
389     // CUDA_ERROR_INVALID_HANDLE if register D3D9 surface
390     // TODO: why flag CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD is invalid while it's fine for opengl
391     CUDA_ENSURE(cuGraphicsD3D9RegisterResource(&r.cuRes, texture9_nv12, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
392     return true;
393 }
394 
ensureD3D9EGL(int w,int h)395 bool EGLInteropResource::ensureD3D9EGL(int w, int h) {
396     if (egl->surface && res[0].w == w && res[0].h == h)
397         return true;
398     releaseEGL();
399     egl->dpy = eglGetCurrentDisplay();
400     qDebug("EGL version: %s, client api: %s", eglQueryString(egl->dpy, EGL_VERSION), eglQueryString(egl->dpy, EGL_CLIENT_APIS));
401     EGLint cfg_attribs[] = {
402         EGL_RED_SIZE, 8,
403         EGL_GREEN_SIZE, 8,
404         EGL_BLUE_SIZE, 8,
405         EGL_ALPHA_SIZE, 8, //
406         EGL_BIND_TO_TEXTURE_RGBA, EGL_TRUE, //remove?
407         EGL_SURFACE_TYPE, EGL_PBUFFER_BIT,
408         EGL_NONE
409     };
410     EGLint nb_cfgs;
411     EGLConfig egl_cfg;
412     if (!eglChooseConfig(egl->dpy, cfg_attribs, &egl_cfg, 1, &nb_cfgs)) {
413         qWarning("Failed to create EGL configuration");
414         return false;
415     }
416     // check extensions
417     QList<QByteArray> extensions = QByteArray(eglQueryString(egl->dpy, EGL_EXTENSIONS)).split(' ');
418     // ANGLE_d3d_share_handle_client_buffer will be used if possible
419     const bool kEGL_ANGLE_d3d_share_handle_client_buffer = extensions.contains("EGL_ANGLE_d3d_share_handle_client_buffer");
420     const bool kEGL_ANGLE_query_surface_pointer = extensions.contains("EGL_ANGLE_query_surface_pointer");
421     if (!kEGL_ANGLE_d3d_share_handle_client_buffer && !kEGL_ANGLE_query_surface_pointer) {
422         qWarning("EGL extension 'kEGL_ANGLE_query_surface_pointer' or 'ANGLE_d3d_share_handle_client_buffer' is required!");
423         return false;
424     }
425     GLint has_alpha = 1; //QOpenGLContext::currentContext()->format().hasAlpha()
426     eglGetConfigAttrib(egl->dpy, egl_cfg, EGL_BIND_TO_TEXTURE_RGBA, &has_alpha); //EGL_ALPHA_SIZE
427     qDebug("choose egl display:%p config: %p/%d, has alpha: %d", egl->dpy, egl_cfg, nb_cfgs, has_alpha);
428     EGLint attribs[] = {
429         EGL_WIDTH, w,
430         EGL_HEIGHT, h,
431         EGL_TEXTURE_FORMAT, has_alpha ? EGL_TEXTURE_RGBA : EGL_TEXTURE_RGB,
432         EGL_TEXTURE_TARGET, EGL_TEXTURE_2D,
433         EGL_NONE
434     };
435 
436     HANDLE share_handle = NULL;
437     if (!kEGL_ANGLE_d3d_share_handle_client_buffer && kEGL_ANGLE_query_surface_pointer) {
438         EGL_ENSURE((egl->surface = eglCreatePbufferSurface(egl->dpy, egl_cfg, attribs)) != EGL_NO_SURFACE, false);
439         qDebug("pbuffer surface: %p", egl->surface);
440         PFNEGLQUERYSURFACEPOINTERANGLEPROC eglQuerySurfacePointerANGLE = reinterpret_cast<PFNEGLQUERYSURFACEPOINTERANGLEPROC>(eglGetProcAddress("eglQuerySurfacePointerANGLE"));
441         if (!eglQuerySurfacePointerANGLE) {
442             qWarning("EGL_ANGLE_query_surface_pointer is not supported");
443             return false;
444         }
445         EGL_ENSURE(eglQuerySurfacePointerANGLE(egl->dpy, egl->surface, EGL_D3D_TEXTURE_2D_SHARE_HANDLE_ANGLE, &share_handle), false);
446     }
447 
448     SafeRelease(&surface9);
449     SafeRelease(&texture9);
450     // _A8 for a yuv plane
451     /*
452      * d3d resource share requires windows >= vista: https://msdn.microsoft.com/en-us/library/windows/desktop/bb219800(v=vs.85).aspx
453      * from extension files:
454      * d3d9: level must be 1, dimensions must match EGL surface's
455      * d3d9ex or d3d10:
456      */
457     DX_ENSURE(device9->CreateTexture(w, h, 1,
458                                         D3DUSAGE_RENDERTARGET,
459                                         has_alpha ? D3DFMT_A8R8G8B8 : D3DFMT_X8R8G8B8,
460                                         D3DPOOL_DEFAULT,
461                                         &texture9,
462                                         &share_handle) , false);
463     DX_ENSURE(texture9->GetSurfaceLevel(0, &surface9), false);
464 
465     if (kEGL_ANGLE_d3d_share_handle_client_buffer) {
466         // requires extension EGL_ANGLE_d3d_share_handle_client_buffer
467         // egl surface size must match d3d texture's
468         // d3d9ex or d3d10 is required
469         EGL_ENSURE((egl->surface = eglCreatePbufferFromClientBuffer(egl->dpy, EGL_D3D_TEXTURE_2D_SHARE_HANDLE_ANGLE, share_handle, egl_cfg, attribs)), false);
470         qDebug("pbuffer surface from client buffer: %p", egl->surface);
471     }
472     return true;
473 }
474 
map(int picIndex,const CUVIDPROCPARAMS & param,GLuint tex,int w,int h,int H,int plane)475 bool EGLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
476 {
477     // plane is always 0 because frame is rgb
478     AutoCtxLock locker((cuda_api*)this, lock);
479     Q_UNUSED(locker);
480     if (!ensureResource(w, h, param.Reserved[0], H, tex)) // TODO surface size instead of frame size because we copy the device data
481         return false;
482     //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
483     CUdeviceptr devptr;
484     unsigned int pitch;
485 
486     CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
487     CUVIDAutoUnmapper unmapper(this, dec, devptr);
488     Q_UNUSED(unmapper);
489     // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
490     CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
491     CUarray array;
492     CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);
493     CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false); // mapped array still accessible!
494 
495     CUDA_MEMCPY2D cu2d;
496     memset(&cu2d, 0, sizeof(cu2d));
497     // Y plane
498     cu2d.srcDevice = devptr;
499     cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
500     cu2d.srcPitch = pitch;
501     cu2d.dstArray = array;
502     cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
503     cu2d.dstPitch = pitch;
504     // the whole size or copy size?
505     cu2d.WidthInBytes = res[plane].W; // the same value as texture9_nv12
506     cu2d.Height = H*3/2;
507     if (res[plane].stream)
508         CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
509     else
510         CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
511     //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)?
512     // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error
513     // so I simply unmap the resource here
514     if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
515         if (res[plane].stream) {
516             //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery?
517             CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize
518         }
519         /*
520          * This function provides the synchronization guarantee that any CUDA work issued
521          * in \p stream before ::cuGraphicsUnmapResources() will complete before any
522          * subsequently issued graphics work begins.
523          * The graphics API from which \p resources were registered
524          * should not access any resources while they are mapped by CUDA. If an
525          * application does so, the results are undefined.
526          */
527 //        CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
528     }
529     D3DLOCKED_RECT rect_src, rect_dst;
530     DX_ENSURE(texture9_nv12->LockRect(0, &rect_src, NULL, D3DLOCK_READONLY), false);
531     DX_ENSURE(surface9_nv12->LockRect(&rect_dst, NULL, D3DLOCK_DISCARD), false);
532     memcpy(rect_dst.pBits, rect_src.pBits, res[plane].W*H*3/2); // exactly w and h
533     DX_ENSURE(surface9_nv12->UnlockRect(), false);
534     DX_ENSURE(texture9_nv12->UnlockRect(0), false);
535 #if 0
536     //IDirect3DSurface9 *raw_surface = NULL;
537     //DX_ENSURE(texture9_nv12->GetSurfaceLevel(0, &raw_surface), false);
538     const RECT src = { 0, 0, (~0-1)&w, (~0-1)&(h*3/2)};
539     DX_ENSURE(device9->StretchRect(raw_surface, &src, surface9_nv12, NULL, D3DTEXF_NONE), false);
540 #endif
541     if (!map(surface9_nv12, tex, w, h, H))
542         return false;
543     return true;
544 }
545 
map(IDirect3DSurface9 * surface,GLuint tex,int w,int h,int H)546 bool EGLInteropResource::map(IDirect3DSurface9* surface, GLuint tex, int w, int h, int H)
547 {
548     Q_UNUSED(H);
549     D3DSURFACE_DESC dxvaDesc;
550     surface->GetDesc(&dxvaDesc);
551     const RECT src = { 0, 0, (~0-1)&w, (~0-1)&h}; //StretchRect does not supports odd values
552     DX_ENSURE(device9->StretchRect(surface, &src, surface9, NULL, D3DTEXF_NONE), false);
553     if (query9) {
554         // Flush the draw command now. Ideally, this should be done immediately before the draw call that uses the texture. Flush it once here though.
555         query9->Issue(D3DISSUE_END);
556         // ensure data is copied to egl surface. Solution and comment is from chromium
557         // The DXVA decoder has its own device which it uses for decoding. ANGLE has its own device which we don't have access to.
558         // The above code attempts to copy the decoded picture into a surface which is owned by ANGLE.
559         // As there are multiple devices involved in this, the StretchRect call above is not synchronous.
560         // We attempt to flush the batched operations to ensure that the picture is copied to the surface owned by ANGLE.
561         // We need to do this in a loop and call flush multiple times.
562         // We have seen the GetData call for flushing the command buffer fail to return success occassionally on multi core machines, leading to an infinite loop.
563         // Workaround is to have an upper limit of 10 on the number of iterations to wait for the Flush to finish.
564         int k = 0;
565         // skip at decoder.close()
566         while (/*!skip_dx.load() && */(query9->GetData(NULL, 0, D3DGETDATA_FLUSH) == FALSE) && ++k < 10) {
567             Sleep(1);
568         }
569     }
570     DYGL(glBindTexture(GL_TEXTURE_2D, tex));
571     eglBindTexImage(egl->dpy, egl->surface, EGL_BACK_BUFFER);
572     DYGL(glBindTexture(GL_TEXTURE_2D, 0));
573     return true;
574 }
575 
576 } //namespace cuda
577 } //namespace QtAV
578 #endif //QTAV_HAVE(CUDA_EGL)
579 #if QTAV_HAVE(CUDA_GL)
580 namespace QtAV {
581 namespace cuda {
582 //TODO: cuGLMapBufferObject: get cudeviceptr from pbo, then memcpy2d
map(int picIndex,const CUVIDPROCPARAMS & param,GLuint tex,int w,int h,int H,int plane)583 bool GLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
584 {
585     AutoCtxLock locker((cuda_api*)this, lock);
586     Q_UNUSED(locker);
587     if (!ensureResource(w, h, H, tex, plane)) // TODO surface size instead of frame size because we copy the device data
588         return false;
589     //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
590     CUdeviceptr devptr;
591     unsigned int pitch;
592 
593     CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
594     CUVIDAutoUnmapper unmapper(this, dec, devptr);
595     Q_UNUSED(unmapper);
596     // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
597     CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
598     CUarray array;
599     CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);
600 
601     CUDA_MEMCPY2D cu2d;
602     memset(&cu2d, 0, sizeof(cu2d));
603     cu2d.srcDevice = devptr;
604     cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
605     cu2d.srcPitch = pitch;
606     cu2d.dstArray = array;
607     cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
608     cu2d.dstPitch = pitch;
609     // the whole size or copy size?
610     cu2d.WidthInBytes = pitch;
611     cu2d.Height = h;
612     if (plane == 1) {
613         cu2d.srcXInBytes = 0;// +srcY*srcPitch + srcXInBytes
614         cu2d.srcY = H; // skip the padding height
615         cu2d.Height /= 2;
616     }
617     if (res[plane].stream)
618         CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
619     else
620         CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
621     //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)?
622     // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error
623     // so I simply unmap the resource here
624     if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
625         if (res[plane].stream) {
626             //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery?
627             CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize
628         }
629         /*
630          * This function provides the synchronization guarantee that any CUDA work issued
631          * in \p stream before ::cuGraphicsUnmapResources() will complete before any
632          * subsequently issued graphics work begins.
633          * The graphics API from which \p resources were registered
634          * should not access any resources while they are mapped by CUDA. If an
635          * application does so, the results are undefined.
636          */
637         CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
638     } else {
639         // call it at last. current context will be used by other cuda calls (unmap() for example)
640         CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // not required
641     }
642     return true;
643 }
644 
unmap(GLuint tex)645 bool GLInteropResource::unmap(GLuint tex)
646 {
647     Q_UNUSED(tex);
648     if (WORKAROUND_UNMAP_CONTEXT_SWITCH)
649         return true;
650     int plane = -1;
651     if (res[0].texture == tex)
652         plane = 0;
653     else if (res[1].texture == tex)
654         plane = 1;
655     else
656         return false;
657     // FIXME: why cuCtxPushCurrent gives CUDA_ERROR_INVALID_CONTEXT if opengl viewport changed?
658     CUDA_WARN(cuCtxPushCurrent(ctx));
659     CUDA_WARN(cuStreamSynchronize(res[plane].stream));
660     // FIXME: need a correct context. But why we have to push context even though map/unmap are called in the same thread
661     // Because the decoder switch the context in another thread so we have to switch the context back?
662     // to workaround the context issue, we must pop the context that valid in map() and push it here
663     CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
664     CUDA_ENSURE(cuCtxPopCurrent(&ctx), false);
665     return true;
666 }
667 
ensureResource(int w,int h,int H,GLuint tex,int plane)668 bool GLInteropResource::ensureResource(int w, int h, int H, GLuint tex, int plane)
669 {
670     Q_ASSERT(plane < 2 && "plane number must be 0 or 1 for NV12");
671     TexRes &r = res[plane];
672     if (r.texture == tex && r.w == w && r.h == h && r.H == H && r.cuRes)
673         return true;
674     if (!ctx) {
675         // TODO: how to use pop/push decoder's context without the context in opengl context
676         CUDA_ENSURE(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), false);
677         if (USE_STREAM) {
678             CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
679             CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
680         }
681         qDebug("cuda contex on gl thread: %p", ctx);
682         CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
683     }
684     if (r.cuRes) {
685         CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
686         r.cuRes = NULL;
687     }
688     // CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD works too for opengl, but not d3d
689     CUDA_ENSURE(cuGraphicsGLRegisterImage(&r.cuRes, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
690     r.texture = tex;
691     r.w = w;
692     r.h = h;
693     r.H = H;
694     return true;
695 }
696 } //namespace cuda
697 } //namespace QtAV
698 #endif //QTAV_HAVE(CUDA_GL)
699