1 /*
2  * Adapted from an example found here https://stackoverflow.com/questions/4979504/fast-rgb-yuv-conversion-in-opencl
3  * Copyright 2019 Scott Moreau
4  *
5  */
6 
7 #include <iostream>
8 
9 #include "opencl.hpp"
10 
11 
12 static char const *cl_source_str = R"(
13 __kernel void rgbx_2_yuv420 (__global unsigned int  *sourceImage,
14                              __global unsigned char *destImage,
15                              unsigned int srcWidth,
16                              unsigned int srcHeight,
17                              short rgb0)
18 {
19     int i, d;
20     unsigned int pixels[4], posSrc[2];
21     unsigned int RGB, ValueY, ValueU, ValueV, c1, c2, c3, u_offset, v_offset;
22     unsigned char r, g, b;
23 
24     unsigned int posX = get_global_id(0);
25     unsigned int posY = get_global_id(1);
26 
27     unsigned int X2 = posX * 2;
28     unsigned int Y2 = posY * 2;
29 
30     unsigned int size = srcWidth * srcHeight;
31 
32     unsigned int halfWidth = ((srcWidth + 1) >> 1);
33     unsigned int halfHeight = ((srcHeight + 1) >> 1);
34 
35     if (posX >= halfWidth || posY >= halfHeight)
36         return;
37 
38     posSrc[0] = (Y2 * srcWidth) + X2;
39     posSrc[1] = ((Y2 + 1) * srcWidth) + X2;
40 
41     pixels[0] = sourceImage[posSrc[0] + 0];
42     pixels[1] = sourceImage[posSrc[0] + 1];
43     pixels[2] = sourceImage[posSrc[1] + 0];
44     pixels[3] = sourceImage[posSrc[1] + 1];
45 
46     for (i = 0; i < 4; i++)
47     {
48         if (i == 1 && (X2 + 1) >= srcWidth)
49             continue;
50         if (i > 1 && (posSrc[1] + ((i - 1) >> 1)) >= size)
51             break;
52 
53         RGB = pixels[i];
54         if (rgb0)
55         {
56             r = (RGB) & 0xff; g = (RGB >> 8) & 0xff; b = (RGB >> 16) & 0xff;
57         }
58         else //bgr0
59         {
60             b = (RGB) & 0xff; g = (RGB >> 8) & 0xff; r = (RGB >> 16) & 0xff;
61         }
62 
63         // Y plane - pack 1 * 8-bit Y within each 8-bit unit.
64         ValueY = ((66 * r + 129 * g + 25 * b) >> 8) + 16;
65         if (i < 2)
66             destImage[(Y2 * srcWidth) + X2 + i] = ValueY;
67         else
68             destImage[((Y2 + 1) * srcWidth) + X2 + (i - 2)] = ValueY;
69     }
70 
71     c1 = (pixels[0] & 0xff);
72     c2 = ((pixels[0] >> 8) & 0xff);
73     c3 = ((pixels[0] >> 16) & 0xff);
74     d = 0;
75     if ((X2 + 1) < srcWidth)
76     {
77         c1 += (pixels[1] & 0xff);
78         c2 += ((pixels[1] >> 8) & 0xff);
79         c3 += ((pixels[1] >> 16) & 0xff);
80         d++;
81     }
82     if ((Y2 + 1) < srcHeight)
83     {
84         c1 += (pixels[2] & 0xff);
85         c2 += ((pixels[2] >> 8) & 0xff);
86         c3 += ((pixels[2] >> 16) & 0xff);
87         d++;
88     }
89     if (d == 2)
90     {
91         c1 += (pixels[3] & 0xff);
92         c2 += ((pixels[3] >> 8) & 0xff);
93         c3 += ((pixels[3] >> 16) & 0xff);
94     }
95     if (rgb0)
96     {
97         r = c1 >> d; g = c2 >> d; b = c3 >> d;
98     }
99     else //bgr0
100     {
101         b = c1 >> d; g = c2 >> d; r = c3 >> d;
102     }
103 
104     // UV plane - pack 1 * 8-bit U and 1 * 8-bit V for each subsample average
105     ValueU = ((-38 * r - 74 * g + 112 * b) >> 8) + 128;
106     ValueV = ((112 * r - 94 * g - 18  * b) >> 8) + 128;
107 
108     u_offset = size + (posY * halfWidth);
109     v_offset = u_offset + (halfWidth * halfHeight);
110 
111     destImage[u_offset + posX] = ValueU;
112     destImage[v_offset + posX] = ValueV;
113 
114     return;
115 }
116 )";
117 
118 cl_device_id
get_device_id(int device)119 OpenCL::get_device_id(int device)
120 {
121     uint32_t i, j;
122     char* value;
123     size_t valueSize;
124     cl_uint platformCount;
125     cl_platform_id* platforms;
126     cl_uint deviceCount;
127     cl_device_id* devices;
128     cl_device_id device_id;
129     std::vector<cl_device_id> all_devices;
130 
131     ret = clGetPlatformIDs(0, NULL, &platformCount);
132     if (ret)
133     {
134         std::cerr << "clGetPlatformIDs failed!" << std::endl;
135         return NULL;
136     }
137     if (!platformCount)
138     {
139         std::cerr << "No OpenCL platforms detected." << std::endl;
140         return NULL;
141     }
142     platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
143     ret = clGetPlatformIDs(platformCount, platforms, NULL);
144     if (ret)
145     {
146         std::cerr << "clGetPlatformIDs failed!" << std::endl;
147         return NULL;
148     }
149 
150     if (platformCount == 1 && device <= 0)
151     {
152         ret = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
153         if (ret)
154         {
155             std::cerr << "clGetDeviceIDs failed!" << std::endl;
156             return NULL;
157         }
158         if (!deviceCount)
159         {
160             std::cerr << "No OpenCL devices detected." << std::endl;
161             return NULL;
162         }
163         if (deviceCount == 1)
164         {
165             ret = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, 1,
166                 &device_id, &deviceCount);
167             if (ret)
168             {
169                 std::cerr << "clGetDeviceIDs failed!" << std::endl;
170                 return NULL;
171             }
172             return device_id;
173 	}
174     }
175 
176     if (device < 0)
177     {
178         std::cout << std::endl;
179         std::cout << "Please choose an OpenCL device:" << std::endl;
180     }
181 
182     for (i = 0; i < platformCount; i++) {
183         ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
184         switch (ret)
185         {
186             case CL_INVALID_PLATFORM:
187             case CL_INVALID_DEVICE_TYPE:
188             case CL_INVALID_VALUE:
189             case CL_DEVICE_NOT_FOUND:
190             continue;
191             break;
192             case CL_SUCCESS:
193             default:
194             break;
195         }
196         if (!deviceCount)
197         {
198             std::cerr << "No OpenCL devices detected for platform " << i + 1 << std::endl;
199             continue;
200         }
201         devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
202         ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
203         switch (ret)
204         {
205             case CL_INVALID_PLATFORM:
206             case CL_INVALID_DEVICE_TYPE:
207             case CL_INVALID_VALUE:
208             case CL_DEVICE_NOT_FOUND:
209             continue;
210             break;
211             case CL_SUCCESS:
212             default:
213             break;
214         }
215 
216         for (j = 0; j < deviceCount; j++) {
217             ret = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize);
218             if (ret)
219             {
220                 std::cerr << "clGetDeviceInfo failed!" << std::endl;
221                 return NULL;
222             }
223             value = (char*) malloc(valueSize);
224             ret = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
225             if (ret)
226             {
227                 std::cerr << "clGetDeviceInfo failed!" << std::endl;
228                 return NULL;
229             }
230             all_devices.push_back(devices[j]);
231             if (device < 0)
232                 std::cout << all_devices.size() << ": " << value << std::endl;
233             free(value);
234             if (device == (int) all_devices.size())
235                 break;
236         }
237 
238         free(devices);
239         if (device == (int) all_devices.size())
240             break;
241     }
242 
243     free(platforms);
244 
245     if (device > (int) all_devices.size())
246     {
247         std::cerr << "Max OpenCL device number is " << all_devices.size() << std::endl;
248         return NULL;
249     }
250 
251     if (!device)
252         return all_devices[device];
253 
254     if (device > 0)
255         return all_devices[device - 1];
256 
257     std::cout << "Enter device no.:";
258     fflush(stdout);
259 
260     int choice;
261     if (scanf("%d", &choice) != 1 || choice > (int) all_devices.size() || choice <= 0)
262     {
263         std::cerr << "Bad choice." << std::endl;
264         return NULL;
265     }
266 
267     return all_devices[choice - 1];
268 }
269 
270 int
init(int _width,int _height)271 OpenCL::init(int _width, int _height)
272 {
273     if (ret)
274         return ret;
275 
276     width = _width;
277     height = _height;
278 
279     halfWidth = ((width + 1) >> 1);
280     halfHeight = ((height + 1) >> 1);
281     unsigned int frameSize = width * height;
282     unsigned int frameSizeUV = halfWidth * halfHeight;
283 
284     argbSize = frameSize * 4; // ARGB pixels
285 
286     yuv420Size = frameSize + frameSizeUV * 2; // Y+UV planes
287 
288     yuv420_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, yuv420Size * sizeof(uint8_t), 0, &ret);
289     if (ret)
290     {
291         std::cerr << "clCreateBuffer (yuv420) failure!" << std::endl;
292         return ret;
293     }
294 
295     local_yuv420_buffer = (uint8_t *) malloc(yuv420Size * sizeof(uint8_t));
296 
297     if (!local_yuv420_buffer)
298     {
299         std::cerr << "malloc failure!" << std::endl;
300         ret = -1;
301         return ret;
302     }
303 
304     std::cout << "Using OpenCL for accelerated RGB to YUV420 conversion" << std::endl;
305 
306     return ret;
307 }
308 
OpenCL(int device)309 OpenCL::OpenCL(int device)
310 {
311     device_id = get_device_id(device);
312     if (!device_id)
313     {
314         ret = -1;
315         return;
316     }
317 
318     // Create an OpenCL context
319     context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
320     if (ret)
321     {
322         std::cerr << "clCreateContext failed!" << std::endl;
323         return;
324     }
325 
326     // Create a command queue
327     command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
328     if (ret)
329     {
330         std::cerr << "clCreateCommandQueue failed!" << std::endl;
331         return;
332     }
333     // Create a program from the kernel source
334     program = clCreateProgramWithSource(context, 1,
335         (const char **)&cl_source_str, NULL, &ret);
336     if (ret)
337     {
338         std::cerr << "clCreateProgramWithSource failed!" << std::endl;
339         return;
340     }
341 
342     // Build the program
343     ret |= clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
344     if (ret)
345     {
346         std::cerr << "clBuildProgram failed!" << std::endl;
347 
348         char *build_log;
349         size_t ret_val_size;
350         clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
351         build_log = new char[ret_val_size+1];
352         clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
353         std::cout << build_log << std::endl;
354         delete build_log;
355     }
356 
357     // Create the OpenCL kernel
358     kernel = clCreateKernel(program, "rgbx_2_yuv420", &ret);
359     if (ret)
360     {
361         std::cerr << "clCreateKernel failed!" << std::endl;
362         return;
363     }
364 }
365 
366 int
do_frame(const uint8_t * pixels,AVFrame * encoder_frame,AVPixelFormat format,bool y_invert)367 OpenCL::do_frame(const uint8_t* pixels, AVFrame *encoder_frame, AVPixelFormat format, bool y_invert)
368 {
369     const uint8_t *formatted_pixels;
370     short rgb0 = format == AV_PIX_FMT_RGB0 ? 1 : 0;
371 
372     if (ret)
373         return ret;
374 
375     rgb_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, argbSize, (void *) pixels, &ret);
376     if (ret)
377     {
378         std::cerr << "clCreateBuffer (rgb) failed!" << std::endl;
379         return ret;
380     }
381 
382     ret |= clSetKernelArg ( kernel, 0, sizeof(cl_mem), &rgb_buffer );
383     ret |= clSetKernelArg ( kernel, 1, sizeof(cl_mem), &yuv420_buffer );
384     ret |= clSetKernelArg ( kernel, 2, sizeof(unsigned int), &width);
385     ret |= clSetKernelArg ( kernel, 3, sizeof(unsigned int), &height);
386     ret |= clSetKernelArg ( kernel, 4, sizeof(short), &rgb0);
387     if (ret)
388     {
389         std::cerr << "clSetKernelArg failed!" << std::endl;
390         return ret;
391     }
392 
393     const size_t global_ws[] = {halfWidth, halfHeight};
394     ret |= clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_ws, NULL, 0, NULL, NULL);
395     if (ret)
396     {
397         std::cerr << "clEnqueueNDRangeKernel failed!" << std::endl;
398         return ret;
399     }
400 
401     // Read yuv420 buffer from gpu
402     ret |= clEnqueueReadBuffer(command_queue, yuv420_buffer, CL_TRUE, 0,
403         yuv420Size * sizeof(uint8_t), local_yuv420_buffer, 0, NULL, NULL);
404     if (ret)
405     {
406         std::cerr << "clEnqueueReadBuffer failed!" << std::endl;
407         return ret;
408     }
409 
410     ret |= clReleaseMemObject(rgb_buffer);
411     if (ret)
412     {
413         std::cerr << "clReleaseMemObject failed!" << std::endl;
414         return ret;
415     }
416 
417     formatted_pixels = local_yuv420_buffer;
418 
419     if (y_invert)
420         formatted_pixels += width * (height - 1);
421     encoder_frame->data[0] = (uint8_t *) formatted_pixels;
422 
423     if (y_invert)
424         formatted_pixels += (halfWidth) * (halfHeight - 1) + width;
425     else
426         formatted_pixels += width * height;
427     encoder_frame->data[1] = (uint8_t *) formatted_pixels;
428 
429     formatted_pixels += halfWidth * halfHeight;
430     encoder_frame->data[2] = (uint8_t *) formatted_pixels;
431 
432     short flip = y_invert ? -1 : 1;
433 
434     encoder_frame->linesize[0] = width * flip;
435     encoder_frame->linesize[1] = halfWidth * flip;
436     encoder_frame->linesize[2] = halfWidth * flip;
437 
438     return ret;
439 }
440 
~OpenCL()441 OpenCL::~OpenCL()
442 {
443     free(local_yuv420_buffer);
444 
445     if (ret)
446         return;
447 
448     clFlush(command_queue);
449     clFinish(command_queue);
450     clReleaseKernel(kernel);
451     clReleaseProgram(program);
452     clReleaseMemObject(yuv420_buffer);
453     clReleaseCommandQueue(command_queue);
454     clReleaseContext(context);
455 }