1 /*
2 * Adapted from an example found here https://stackoverflow.com/questions/4979504/fast-rgb-yuv-conversion-in-opencl
3 * Copyright 2019 Scott Moreau
4 *
5 */
6
7 #include <iostream>
8
9 #include "opencl.hpp"
10
11
12 static char const *cl_source_str = R"(
13 __kernel void rgbx_2_yuv420 (__global unsigned int *sourceImage,
14 __global unsigned char *destImage,
15 unsigned int srcWidth,
16 unsigned int srcHeight,
17 short rgb0)
18 {
19 int i, d;
20 unsigned int pixels[4], posSrc[2];
21 unsigned int RGB, ValueY, ValueU, ValueV, c1, c2, c3, u_offset, v_offset;
22 unsigned char r, g, b;
23
24 unsigned int posX = get_global_id(0);
25 unsigned int posY = get_global_id(1);
26
27 unsigned int X2 = posX * 2;
28 unsigned int Y2 = posY * 2;
29
30 unsigned int size = srcWidth * srcHeight;
31
32 unsigned int halfWidth = ((srcWidth + 1) >> 1);
33 unsigned int halfHeight = ((srcHeight + 1) >> 1);
34
35 if (posX >= halfWidth || posY >= halfHeight)
36 return;
37
38 posSrc[0] = (Y2 * srcWidth) + X2;
39 posSrc[1] = ((Y2 + 1) * srcWidth) + X2;
40
41 pixels[0] = sourceImage[posSrc[0] + 0];
42 pixels[1] = sourceImage[posSrc[0] + 1];
43 pixels[2] = sourceImage[posSrc[1] + 0];
44 pixels[3] = sourceImage[posSrc[1] + 1];
45
46 for (i = 0; i < 4; i++)
47 {
48 if (i == 1 && (X2 + 1) >= srcWidth)
49 continue;
50 if (i > 1 && (posSrc[1] + ((i - 1) >> 1)) >= size)
51 break;
52
53 RGB = pixels[i];
54 if (rgb0)
55 {
56 r = (RGB) & 0xff; g = (RGB >> 8) & 0xff; b = (RGB >> 16) & 0xff;
57 }
58 else //bgr0
59 {
60 b = (RGB) & 0xff; g = (RGB >> 8) & 0xff; r = (RGB >> 16) & 0xff;
61 }
62
63 // Y plane - pack 1 * 8-bit Y within each 8-bit unit.
64 ValueY = ((66 * r + 129 * g + 25 * b) >> 8) + 16;
65 if (i < 2)
66 destImage[(Y2 * srcWidth) + X2 + i] = ValueY;
67 else
68 destImage[((Y2 + 1) * srcWidth) + X2 + (i - 2)] = ValueY;
69 }
70
71 c1 = (pixels[0] & 0xff);
72 c2 = ((pixels[0] >> 8) & 0xff);
73 c3 = ((pixels[0] >> 16) & 0xff);
74 d = 0;
75 if ((X2 + 1) < srcWidth)
76 {
77 c1 += (pixels[1] & 0xff);
78 c2 += ((pixels[1] >> 8) & 0xff);
79 c3 += ((pixels[1] >> 16) & 0xff);
80 d++;
81 }
82 if ((Y2 + 1) < srcHeight)
83 {
84 c1 += (pixels[2] & 0xff);
85 c2 += ((pixels[2] >> 8) & 0xff);
86 c3 += ((pixels[2] >> 16) & 0xff);
87 d++;
88 }
89 if (d == 2)
90 {
91 c1 += (pixels[3] & 0xff);
92 c2 += ((pixels[3] >> 8) & 0xff);
93 c3 += ((pixels[3] >> 16) & 0xff);
94 }
95 if (rgb0)
96 {
97 r = c1 >> d; g = c2 >> d; b = c3 >> d;
98 }
99 else //bgr0
100 {
101 b = c1 >> d; g = c2 >> d; r = c3 >> d;
102 }
103
104 // UV plane - pack 1 * 8-bit U and 1 * 8-bit V for each subsample average
105 ValueU = ((-38 * r - 74 * g + 112 * b) >> 8) + 128;
106 ValueV = ((112 * r - 94 * g - 18 * b) >> 8) + 128;
107
108 u_offset = size + (posY * halfWidth);
109 v_offset = u_offset + (halfWidth * halfHeight);
110
111 destImage[u_offset + posX] = ValueU;
112 destImage[v_offset + posX] = ValueV;
113
114 return;
115 }
116 )";
117
118 cl_device_id
get_device_id(int device)119 OpenCL::get_device_id(int device)
120 {
121 uint32_t i, j;
122 char* value;
123 size_t valueSize;
124 cl_uint platformCount;
125 cl_platform_id* platforms;
126 cl_uint deviceCount;
127 cl_device_id* devices;
128 cl_device_id device_id;
129 std::vector<cl_device_id> all_devices;
130
131 ret = clGetPlatformIDs(0, NULL, &platformCount);
132 if (ret)
133 {
134 std::cerr << "clGetPlatformIDs failed!" << std::endl;
135 return NULL;
136 }
137 if (!platformCount)
138 {
139 std::cerr << "No OpenCL platforms detected." << std::endl;
140 return NULL;
141 }
142 platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
143 ret = clGetPlatformIDs(platformCount, platforms, NULL);
144 if (ret)
145 {
146 std::cerr << "clGetPlatformIDs failed!" << std::endl;
147 return NULL;
148 }
149
150 if (platformCount == 1 && device <= 0)
151 {
152 ret = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
153 if (ret)
154 {
155 std::cerr << "clGetDeviceIDs failed!" << std::endl;
156 return NULL;
157 }
158 if (!deviceCount)
159 {
160 std::cerr << "No OpenCL devices detected." << std::endl;
161 return NULL;
162 }
163 if (deviceCount == 1)
164 {
165 ret = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, 1,
166 &device_id, &deviceCount);
167 if (ret)
168 {
169 std::cerr << "clGetDeviceIDs failed!" << std::endl;
170 return NULL;
171 }
172 return device_id;
173 }
174 }
175
176 if (device < 0)
177 {
178 std::cout << std::endl;
179 std::cout << "Please choose an OpenCL device:" << std::endl;
180 }
181
182 for (i = 0; i < platformCount; i++) {
183 ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
184 switch (ret)
185 {
186 case CL_INVALID_PLATFORM:
187 case CL_INVALID_DEVICE_TYPE:
188 case CL_INVALID_VALUE:
189 case CL_DEVICE_NOT_FOUND:
190 continue;
191 break;
192 case CL_SUCCESS:
193 default:
194 break;
195 }
196 if (!deviceCount)
197 {
198 std::cerr << "No OpenCL devices detected for platform " << i + 1 << std::endl;
199 continue;
200 }
201 devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
202 ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
203 switch (ret)
204 {
205 case CL_INVALID_PLATFORM:
206 case CL_INVALID_DEVICE_TYPE:
207 case CL_INVALID_VALUE:
208 case CL_DEVICE_NOT_FOUND:
209 continue;
210 break;
211 case CL_SUCCESS:
212 default:
213 break;
214 }
215
216 for (j = 0; j < deviceCount; j++) {
217 ret = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize);
218 if (ret)
219 {
220 std::cerr << "clGetDeviceInfo failed!" << std::endl;
221 return NULL;
222 }
223 value = (char*) malloc(valueSize);
224 ret = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
225 if (ret)
226 {
227 std::cerr << "clGetDeviceInfo failed!" << std::endl;
228 return NULL;
229 }
230 all_devices.push_back(devices[j]);
231 if (device < 0)
232 std::cout << all_devices.size() << ": " << value << std::endl;
233 free(value);
234 if (device == (int) all_devices.size())
235 break;
236 }
237
238 free(devices);
239 if (device == (int) all_devices.size())
240 break;
241 }
242
243 free(platforms);
244
245 if (device > (int) all_devices.size())
246 {
247 std::cerr << "Max OpenCL device number is " << all_devices.size() << std::endl;
248 return NULL;
249 }
250
251 if (!device)
252 return all_devices[device];
253
254 if (device > 0)
255 return all_devices[device - 1];
256
257 std::cout << "Enter device no.:";
258 fflush(stdout);
259
260 int choice;
261 if (scanf("%d", &choice) != 1 || choice > (int) all_devices.size() || choice <= 0)
262 {
263 std::cerr << "Bad choice." << std::endl;
264 return NULL;
265 }
266
267 return all_devices[choice - 1];
268 }
269
270 int
init(int _width,int _height)271 OpenCL::init(int _width, int _height)
272 {
273 if (ret)
274 return ret;
275
276 width = _width;
277 height = _height;
278
279 halfWidth = ((width + 1) >> 1);
280 halfHeight = ((height + 1) >> 1);
281 unsigned int frameSize = width * height;
282 unsigned int frameSizeUV = halfWidth * halfHeight;
283
284 argbSize = frameSize * 4; // ARGB pixels
285
286 yuv420Size = frameSize + frameSizeUV * 2; // Y+UV planes
287
288 yuv420_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, yuv420Size * sizeof(uint8_t), 0, &ret);
289 if (ret)
290 {
291 std::cerr << "clCreateBuffer (yuv420) failure!" << std::endl;
292 return ret;
293 }
294
295 local_yuv420_buffer = (uint8_t *) malloc(yuv420Size * sizeof(uint8_t));
296
297 if (!local_yuv420_buffer)
298 {
299 std::cerr << "malloc failure!" << std::endl;
300 ret = -1;
301 return ret;
302 }
303
304 std::cout << "Using OpenCL for accelerated RGB to YUV420 conversion" << std::endl;
305
306 return ret;
307 }
308
OpenCL(int device)309 OpenCL::OpenCL(int device)
310 {
311 device_id = get_device_id(device);
312 if (!device_id)
313 {
314 ret = -1;
315 return;
316 }
317
318 // Create an OpenCL context
319 context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
320 if (ret)
321 {
322 std::cerr << "clCreateContext failed!" << std::endl;
323 return;
324 }
325
326 // Create a command queue
327 command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
328 if (ret)
329 {
330 std::cerr << "clCreateCommandQueue failed!" << std::endl;
331 return;
332 }
333 // Create a program from the kernel source
334 program = clCreateProgramWithSource(context, 1,
335 (const char **)&cl_source_str, NULL, &ret);
336 if (ret)
337 {
338 std::cerr << "clCreateProgramWithSource failed!" << std::endl;
339 return;
340 }
341
342 // Build the program
343 ret |= clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
344 if (ret)
345 {
346 std::cerr << "clBuildProgram failed!" << std::endl;
347
348 char *build_log;
349 size_t ret_val_size;
350 clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
351 build_log = new char[ret_val_size+1];
352 clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
353 std::cout << build_log << std::endl;
354 delete build_log;
355 }
356
357 // Create the OpenCL kernel
358 kernel = clCreateKernel(program, "rgbx_2_yuv420", &ret);
359 if (ret)
360 {
361 std::cerr << "clCreateKernel failed!" << std::endl;
362 return;
363 }
364 }
365
366 int
do_frame(const uint8_t * pixels,AVFrame * encoder_frame,AVPixelFormat format,bool y_invert)367 OpenCL::do_frame(const uint8_t* pixels, AVFrame *encoder_frame, AVPixelFormat format, bool y_invert)
368 {
369 const uint8_t *formatted_pixels;
370 short rgb0 = format == AV_PIX_FMT_RGB0 ? 1 : 0;
371
372 if (ret)
373 return ret;
374
375 rgb_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, argbSize, (void *) pixels, &ret);
376 if (ret)
377 {
378 std::cerr << "clCreateBuffer (rgb) failed!" << std::endl;
379 return ret;
380 }
381
382 ret |= clSetKernelArg ( kernel, 0, sizeof(cl_mem), &rgb_buffer );
383 ret |= clSetKernelArg ( kernel, 1, sizeof(cl_mem), &yuv420_buffer );
384 ret |= clSetKernelArg ( kernel, 2, sizeof(unsigned int), &width);
385 ret |= clSetKernelArg ( kernel, 3, sizeof(unsigned int), &height);
386 ret |= clSetKernelArg ( kernel, 4, sizeof(short), &rgb0);
387 if (ret)
388 {
389 std::cerr << "clSetKernelArg failed!" << std::endl;
390 return ret;
391 }
392
393 const size_t global_ws[] = {halfWidth, halfHeight};
394 ret |= clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_ws, NULL, 0, NULL, NULL);
395 if (ret)
396 {
397 std::cerr << "clEnqueueNDRangeKernel failed!" << std::endl;
398 return ret;
399 }
400
401 // Read yuv420 buffer from gpu
402 ret |= clEnqueueReadBuffer(command_queue, yuv420_buffer, CL_TRUE, 0,
403 yuv420Size * sizeof(uint8_t), local_yuv420_buffer, 0, NULL, NULL);
404 if (ret)
405 {
406 std::cerr << "clEnqueueReadBuffer failed!" << std::endl;
407 return ret;
408 }
409
410 ret |= clReleaseMemObject(rgb_buffer);
411 if (ret)
412 {
413 std::cerr << "clReleaseMemObject failed!" << std::endl;
414 return ret;
415 }
416
417 formatted_pixels = local_yuv420_buffer;
418
419 if (y_invert)
420 formatted_pixels += width * (height - 1);
421 encoder_frame->data[0] = (uint8_t *) formatted_pixels;
422
423 if (y_invert)
424 formatted_pixels += (halfWidth) * (halfHeight - 1) + width;
425 else
426 formatted_pixels += width * height;
427 encoder_frame->data[1] = (uint8_t *) formatted_pixels;
428
429 formatted_pixels += halfWidth * halfHeight;
430 encoder_frame->data[2] = (uint8_t *) formatted_pixels;
431
432 short flip = y_invert ? -1 : 1;
433
434 encoder_frame->linesize[0] = width * flip;
435 encoder_frame->linesize[1] = halfWidth * flip;
436 encoder_frame->linesize[2] = halfWidth * flip;
437
438 return ret;
439 }
440
~OpenCL()441 OpenCL::~OpenCL()
442 {
443 free(local_yuv420_buffer);
444
445 if (ret)
446 return;
447
448 clFlush(command_queue);
449 clFinish(command_queue);
450 clReleaseKernel(kernel);
451 clReleaseProgram(program);
452 clReleaseMemObject(yuv420_buffer);
453 clReleaseCommandQueue(command_queue);
454 clReleaseContext(context);
455 }