1 /**
2  * FreeRDP: A Remote Desktop Protocol Implementation
3  * Optimized YUV/RGB conversion operations using openCL
4  *
5  * Copyright 2019 David Fort <contact@hardening-consulting.com>
6  * Copyright 2019 Rangee Gmbh
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *     http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20 
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24 
25 #include <freerdp/types.h>
26 #include <freerdp/primitives.h>
27 #include "prim_internal.h"
28 
29 #if defined(WITH_OPENCL)
30 #ifdef __APPLE__
31 #include "OpenCL/opencl.h"
32 #else
33 #include <CL/cl.h>
34 #endif
35 #endif
36 
37 #define TAG FREERDP_TAG("primitives")
38 
39 typedef struct
40 {
41 	BOOL support;
42 	cl_platform_id platformId;
43 	cl_device_id deviceId;
44 	cl_context context;
45 	cl_command_queue commandQueue;
46 	cl_program program;
47 } primitives_opencl_context;
48 
49 static primitives_opencl_context* primitives_get_opencl_context(void);
50 
opencl_YUVToRGB(const char * kernelName,const BYTE * const pSrc[3],const UINT32 srcStep[3],BYTE * pDst,UINT32 dstStep,const prim_size_t * roi)51 static pstatus_t opencl_YUVToRGB(const char* kernelName, const BYTE* const pSrc[3],
52                                  const UINT32 srcStep[3], BYTE* pDst, UINT32 dstStep,
53                                  const prim_size_t* roi)
54 {
55 	cl_int ret;
56 	cl_uint i;
57 	cl_mem objs[3] = { NULL, NULL, NULL };
58 	cl_mem destObj;
59 	cl_kernel kernel;
60 	size_t indexes[2];
61 	const char* sourceNames[] = { "Y", "U", "V" };
62 	primitives_opencl_context* cl = primitives_get_opencl_context();
63 
64 	kernel = clCreateKernel(cl->program, kernelName, &ret);
65 	if (ret != CL_SUCCESS)
66 	{
67 		WLog_ERR(TAG, "openCL: unable to create kernel %s", kernelName);
68 		return -1;
69 	}
70 
71 	for (i = 0; i < 3; i++)
72 	{
73 		objs[i] = clCreateBuffer(cl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
74 		                         srcStep[i] * roi->height, (char*)pSrc[i], &ret);
75 		if (ret != CL_SUCCESS)
76 		{
77 			WLog_ERR(TAG, "unable to create %sobj", sourceNames[i]);
78 			goto error_objs;
79 		}
80 	}
81 
82 	destObj = clCreateBuffer(cl->context, CL_MEM_WRITE_ONLY, dstStep * roi->height, NULL, &ret);
83 	if (ret != CL_SUCCESS)
84 	{
85 		WLog_ERR(TAG, "unable to create dest obj");
86 		goto error_objs;
87 	}
88 
89 	/* push source + stride arguments*/
90 	for (i = 0; i < 3; i++)
91 	{
92 		ret = clSetKernelArg(kernel, i * 2, sizeof(cl_mem), &objs[i]);
93 		if (ret != CL_SUCCESS)
94 		{
95 			WLog_ERR(TAG, "unable to set arg for %sobj", sourceNames[i]);
96 			goto error_set_args;
97 		}
98 
99 		ret = clSetKernelArg(kernel, i * 2 + 1, sizeof(cl_int), &srcStep[i]);
100 		if (ret != CL_SUCCESS)
101 		{
102 			WLog_ERR(TAG, "unable to set arg stride for %sobj", sourceNames[i]);
103 			goto error_set_args;
104 		}
105 	}
106 
107 	ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), &destObj);
108 	if (ret != CL_SUCCESS)
109 	{
110 		WLog_ERR(TAG, "unable to set arg destObj");
111 		goto error_set_args;
112 	}
113 
114 	ret = clSetKernelArg(kernel, 7, sizeof(cl_int), &dstStep);
115 	if (ret != CL_SUCCESS)
116 	{
117 		WLog_ERR(TAG, "unable to set arg dstStep");
118 		goto error_set_args;
119 	}
120 
121 	indexes[0] = roi->width;
122 	indexes[1] = roi->height;
123 	ret = clEnqueueNDRangeKernel(cl->commandQueue, kernel, 2, NULL, indexes, NULL, 0, NULL, NULL);
124 	if (ret != CL_SUCCESS)
125 	{
126 		WLog_ERR(TAG, "unable to enqueue call kernel");
127 		goto error_set_args;
128 	}
129 
130 	/* Transfer result to host */
131 	ret = clEnqueueReadBuffer(cl->commandQueue, destObj, CL_TRUE, 0, roi->height * dstStep, pDst, 0,
132 	                          NULL, NULL);
133 	if (ret != CL_SUCCESS)
134 	{
135 		WLog_ERR(TAG, "unable to read back buffer");
136 		goto error_set_args;
137 	}
138 
139 	/* cleanup things */
140 	clReleaseMemObject(destObj);
141 	for (i = 0; i < 3; i++)
142 		if (objs[i])
143 			clReleaseMemObject(objs[i]);
144 	clReleaseKernel(kernel);
145 
146 	return PRIMITIVES_SUCCESS;
147 
148 error_set_args:
149 	clReleaseMemObject(destObj);
150 error_objs:
151 	for (i = 0; i < 3; i++)
152 	{
153 		if (objs[i])
154 			clReleaseMemObject(objs[i]);
155 	}
156 	clReleaseKernel(kernel);
157 	return -1;
158 }
159 
160 static primitives_opencl_context openclContext;
161 
primitives_get_opencl_context(void)162 static primitives_opencl_context* primitives_get_opencl_context(void)
163 {
164 	return &openclContext;
165 }
166 
primitives_uninit_opencl(void)167 static pstatus_t primitives_uninit_opencl(void)
168 {
169 	if (!openclContext.support)
170 		return PRIMITIVES_SUCCESS;
171 
172 	clReleaseProgram(openclContext.program);
173 	clReleaseCommandQueue(openclContext.commandQueue);
174 	clReleaseContext(openclContext.context);
175 	clReleaseDevice(openclContext.deviceId);
176 	openclContext.support = FALSE;
177 	return PRIMITIVES_SUCCESS;
178 }
179 
180 static const char* openclProgram =
181 #include "primitives.cl"
182     ;
183 
primitives_init_opencl_context(primitives_opencl_context * cl)184 static BOOL primitives_init_opencl_context(primitives_opencl_context* cl)
185 {
186 	cl_platform_id* platform_ids = NULL;
187 	cl_uint ndevices, nplatforms, i;
188 	cl_kernel kernel;
189 	cl_int ret;
190 
191 	BOOL gotGPU = FALSE;
192 	size_t programLen;
193 
194 	ret = clGetPlatformIDs(0, NULL, &nplatforms);
195 	if (ret != CL_SUCCESS || nplatforms < 1)
196 		return FALSE;
197 
198 	platform_ids = calloc(nplatforms, sizeof(*platform_ids));
199 	if (!platform_ids)
200 		return FALSE;
201 
202 	ret = clGetPlatformIDs(nplatforms, platform_ids, &nplatforms);
203 	if (ret != CL_SUCCESS)
204 	{
205 		free(platform_ids);
206 		return FALSE;
207 	}
208 
209 	for (i = 0; (i < nplatforms) && !gotGPU; i++)
210 	{
211 		cl_device_id device_id;
212 		cl_context context;
213 		char platformName[1000];
214 		char deviceName[1000];
215 
216 		ret = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, sizeof(platformName),
217 		                        platformName, NULL);
218 		if (ret != CL_SUCCESS)
219 			continue;
220 
221 		ret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &device_id, &ndevices);
222 		if (ret != CL_SUCCESS)
223 			continue;
224 
225 		ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
226 		if (ret != CL_SUCCESS)
227 		{
228 			WLog_ERR(TAG, "openCL: unable get device name for platform %s", platformName);
229 			clReleaseDevice(device_id);
230 			continue;
231 		}
232 
233 		context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
234 		if (ret != CL_SUCCESS)
235 		{
236 			WLog_ERR(TAG, "openCL: unable to create context for platform %s, device %s",
237 			         platformName, deviceName);
238 			clReleaseDevice(device_id);
239 			continue;
240 		}
241 
242 		cl->commandQueue = clCreateCommandQueue(context, device_id, 0, &ret);
243 		if (ret != CL_SUCCESS)
244 		{
245 			WLog_ERR(TAG, "openCL: unable to create command queue");
246 			clReleaseContext(context);
247 			clReleaseDevice(device_id);
248 			continue;
249 		}
250 
251 		WLog_INFO(TAG, "openCL: using platform=%s device=%s", platformName, deviceName);
252 
253 		cl->platformId = platform_ids[i];
254 		cl->deviceId = device_id;
255 		cl->context = context;
256 		gotGPU = TRUE;
257 	}
258 
259 	free(platform_ids);
260 
261 	if (!gotGPU)
262 	{
263 		WLog_ERR(TAG, "openCL: no GPU found");
264 		return FALSE;
265 	}
266 
267 	programLen = strlen(openclProgram);
268 	cl->program =
269 	    clCreateProgramWithSource(cl->context, 1, (const char**)&openclProgram, &programLen, &ret);
270 	if (ret != CL_SUCCESS)
271 	{
272 		WLog_ERR(TAG, "openCL: unable to create program");
273 		goto out_program_create;
274 	}
275 
276 	ret = clBuildProgram(cl->program, 1, &cl->deviceId, NULL, NULL, NULL);
277 	if (ret != CL_SUCCESS)
278 	{
279 		size_t length;
280 		char buffer[2048];
281 		ret = clGetProgramBuildInfo(cl->program, cl->deviceId, CL_PROGRAM_BUILD_LOG, sizeof(buffer),
282 		                            buffer, &length);
283 		if (ret != CL_SUCCESS)
284 		{
285 			WLog_ERR(TAG,
286 			         "openCL: building program failed but unable to retrieve buildLog, error=%d",
287 			         ret);
288 		}
289 		else
290 		{
291 			WLog_ERR(TAG, "openCL: unable to build program, errorLog=%s", buffer);
292 		}
293 		goto out_program_build;
294 	}
295 
296 	kernel = clCreateKernel(cl->program, "yuv420_to_bgra_1b", &ret);
297 	if (ret != CL_SUCCESS)
298 	{
299 		WLog_ERR(TAG, "openCL: unable to create yuv420_to_bgra_1b kernel");
300 		goto out_program_build;
301 	}
302 	clReleaseKernel(kernel);
303 
304 	cl->support = TRUE;
305 	return TRUE;
306 
307 out_program_build:
308 	clReleaseProgram(cl->program);
309 out_program_create:
310 	clReleaseCommandQueue(cl->commandQueue);
311 	clReleaseContext(cl->context);
312 	clReleaseDevice(cl->deviceId);
313 	return FALSE;
314 }
315 
opencl_YUV420ToRGB_8u_P3AC4R(const BYTE * const pSrc[3],const UINT32 srcStep[3],BYTE * pDst,UINT32 dstStep,UINT32 DstFormat,const prim_size_t * roi)316 static pstatus_t opencl_YUV420ToRGB_8u_P3AC4R(const BYTE* const pSrc[3], const UINT32 srcStep[3],
317                                               BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
318                                               const prim_size_t* roi)
319 {
320 	const char* kernel_name;
321 
322 	switch (DstFormat)
323 	{
324 		case PIXEL_FORMAT_BGRA32:
325 		case PIXEL_FORMAT_BGRX32:
326 			kernel_name = "yuv420_to_bgra_1b";
327 			break;
328 		case PIXEL_FORMAT_XRGB32:
329 		case PIXEL_FORMAT_ARGB32:
330 			kernel_name = "yuv420_to_argb_1b";
331 			break;
332 		default:
333 		{
334 			primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
335 			if (!p)
336 				return -1;
337 			return p->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
338 		}
339 	}
340 
341 	return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi);
342 }
343 
opencl_YUV444ToRGB_8u_P3AC4R(const BYTE * const pSrc[3],const UINT32 srcStep[3],BYTE * pDst,UINT32 dstStep,UINT32 DstFormat,const prim_size_t * roi)344 static pstatus_t opencl_YUV444ToRGB_8u_P3AC4R(const BYTE* const pSrc[3], const UINT32 srcStep[3],
345                                               BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
346                                               const prim_size_t* roi)
347 {
348 	const char* kernel_name;
349 
350 	switch (DstFormat)
351 	{
352 		case PIXEL_FORMAT_BGRA32:
353 		case PIXEL_FORMAT_BGRX32:
354 			kernel_name = "yuv444_to_bgra_1b";
355 			break;
356 		case PIXEL_FORMAT_XRGB32:
357 		case PIXEL_FORMAT_ARGB32:
358 			kernel_name = "yuv444_to_argb_1b";
359 			break;
360 		default:
361 		{
362 			primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
363 			if (!p)
364 				return -1;
365 			return p->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
366 		}
367 	}
368 
369 	return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi);
370 }
371 
primitives_init_opencl(primitives_t * prims)372 BOOL primitives_init_opencl(primitives_t* prims)
373 {
374 	primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
375 	if (!prims || !p)
376 		return FALSE;
377 	*prims = *p;
378 
379 	if (!primitives_init_opencl_context(&openclContext))
380 		return FALSE;
381 
382 	prims->YUV420ToRGB_8u_P3AC4R = opencl_YUV420ToRGB_8u_P3AC4R;
383 	prims->YUV444ToRGB_8u_P3AC4R = opencl_YUV444ToRGB_8u_P3AC4R;
384 	prims->flags |= PRIM_FLAGS_HAVE_EXTGPU;
385 	prims->uninit = primitives_uninit_opencl;
386 	return TRUE;
387 }
388