1 /*****************************************************************************
2  * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
3  *****************************************************************************
4  * Copyright (C) 2012-2021 x264 project
5  *
6  * Authors: Steve Borho <sborho@multicorewareinc.com>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21  *
22  * This program is also available under a commercial proprietary license.
23  * For more information, contact us at licensing@x264.com.
24  *****************************************************************************/
25 
26 #include "common/common.h"
27 #include "macroblock.h"
28 #include "me.h"
29 #include "slicetype-cl.h"
30 
31 #if HAVE_OPENCL
32 #ifdef _WIN32
33 #include <windows.h>
34 #endif
35 
36 #define x264_weights_analyse x264_template(weights_analyse)
37 void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
38 
39 /* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
40  * in the OpenCL headers shipped with NVIDIA drivers.  We need to be
41  * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
42 #define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
43 
44 #define OCLCHECK( method, ... )\
45 do\
46 {\
47     if( h->opencl.b_fatal_error )\
48         return -1;\
49     status = ocl->method( __VA_ARGS__ );\
50     if( status != CL_SUCCESS ) {\
51         h->param.b_opencl = 0;\
52         h->opencl.b_fatal_error = 1;\
53         x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
54         return -1;\
55     }\
56 } while( 0 )
57 
x264_opencl_flush(x264_t * h)58 void x264_opencl_flush( x264_t *h )
59 {
60     x264_opencl_function_t *ocl = h->opencl.ocl;
61 
62     ocl->clFinish( h->opencl.queue );
63 
64     /* Finish copies from the GPU by copying from the page-locked buffer to
65      * their final destination */
66     for( int i = 0; i < h->opencl.num_copies; i++ )
67         memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
68     h->opencl.num_copies = 0;
69     h->opencl.pl_occupancy = 0;
70 }
71 
opencl_alloc_locked(x264_t * h,int bytes)72 static void *opencl_alloc_locked( x264_t *h, int bytes )
73 {
74     if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
75         x264_opencl_flush( h );
76     assert( bytes < PAGE_LOCKED_BUF_SIZE );
77     char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
78     h->opencl.pl_occupancy += bytes;
79     return ptr;
80 }
81 
x264_opencl_lowres_init(x264_t * h,x264_frame_t * fenc,int lambda)82 int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
83 {
84     if( fenc->b_intra_calculated )
85         return 0;
86     fenc->b_intra_calculated = 1;
87 
88     x264_opencl_function_t *ocl = h->opencl.ocl;
89     int luma_length = fenc->i_stride[0] * fenc->i_lines[0];
90 
91 #define CREATEBUF( out, flags, size )\
92     out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
93     if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
94 #define CREATEIMAGE( out, flags, pf, width, height )\
95     out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
96     if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }
97 
98     int mb_count = h->mb.i_mb_count;
99     cl_int status;
100 
101     if( !h->opencl.lowres_mv_costs )
102     {
103         /* Allocate shared memory buffers */
104         int width = h->mb.i_mb_width * 8 * SIZEOF_PIXEL;
105         int height = h->mb.i_mb_height * 8 * SIZEOF_PIXEL;
106 
107         cl_image_format pixel_format;
108         pixel_format.image_channel_order = CL_R;
109         pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
110         CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
111 
112         for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
113         {
114             pixel_format.image_channel_order = CL_RGBA;
115             pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
116             CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
117             width >>= 1;
118             height >>= 1;
119         }
120 
121         CREATEBUF( h->opencl.lowres_mv_costs,     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
122         CREATEBUF( h->opencl.lowres_costs[0],     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
123         CREATEBUF( h->opencl.lowres_costs[1],     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
124         CREATEBUF( h->opencl.mv_buffers[0],       CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
125         CREATEBUF( h->opencl.mv_buffers[1],       CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
126         CREATEBUF( h->opencl.mvp_buffer,          CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
127         CREATEBUF( h->opencl.frame_stats[0],      CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
128         CREATEBUF( h->opencl.frame_stats[1],      CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
129         CREATEBUF( h->opencl.row_satds[0],        CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
130         CREATEBUF( h->opencl.row_satds[1],        CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
131         CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY,  luma_length );
132         CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY,  luma_length );
133     }
134 
135     if( !fenc->opencl.intra_cost )
136     {
137         /* Allocate per-frame buffers */
138         int width = h->mb.i_mb_width * 8 * SIZEOF_PIXEL;
139         int height = h->mb.i_mb_height * 8 * SIZEOF_PIXEL;
140 
141         cl_image_format pixel_format;
142         pixel_format.image_channel_order = CL_R;
143         pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
144         CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
145 
146         for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
147         {
148             pixel_format.image_channel_order = CL_RGBA;
149             pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
150             CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
151             width >>= 1;
152             height >>= 1;
153         }
154         CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY,  mb_count * sizeof(int16_t) );
155         CREATEBUF( fenc->opencl.intra_cost,        CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
156         CREATEBUF( fenc->opencl.lowres_mvs0,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
157         CREATEBUF( fenc->opencl.lowres_mvs1,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
158         CREATEBUF( fenc->opencl.lowres_mv_costs0,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
159         CREATEBUF( fenc->opencl.lowres_mv_costs1,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
160     }
161 #undef CREATEBUF
162 #undef CREATEIMAGE
163 
164     /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */
165 
166     char *locked = opencl_alloc_locked( h, luma_length );
167     memcpy( locked, fenc->plane[0], luma_length );
168     OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue,  h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );
169 
170     size_t gdim[2];
171     if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
172     {
173         int size = h->mb.i_mb_count * sizeof(int16_t);
174         locked = opencl_alloc_locked( h, size );
175         memcpy( locked, fenc->i_inv_qscale_factor, size );
176         OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
177     }
178     else
179     {
180         /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
181         cl_uint arg = 0;
182         int16_t value = 256;
183         OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
184         OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
185         gdim[0] = h->mb.i_mb_count;
186         OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
187     }
188 
189     int stride = fenc->i_stride[0];
190     cl_uint arg = 0;
191     OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
192     OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
193     OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
194     OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
195     gdim[0] = 8 * h->mb.i_mb_width;
196     gdim[1] = 8 * h->mb.i_mb_height;
197     OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
198 
199     for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
200     {
201         /* Workaround for AMD Southern Island:
202          *
203          * Alternate kernel instances.  No perf impact to this, so we do it for
204          * all GPUs.  It prevents the same kernel from being enqueued
205          * back-to-back, avoiding a dependency calculation bug in the driver.
206          */
207         cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;
208 
209         arg = 0;
210         OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
211         OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
212         gdim[0] >>= 1;
213         gdim[1] >>= 1;
214         if( gdim[0] < 16 || gdim[1] < 16 )
215             break;
216         OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
217     }
218 
219     size_t ldim[2];
220     gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
221     gdim[1] = 8*h->mb.i_mb_height;
222     ldim[0] = 32;
223     ldim[1] = 8;
224     arg = 0;
225 
226     /* For presets slow, slower, and placebo, check all 10 intra modes that the
227      * C lookahead supports.  For faster presets, only check the most frequent 8
228      * modes
229      */
230     int slow = h->param.analyse.i_subpel_refine > 7;
231     OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
232     OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
233     OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
234     OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
235     OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
236     OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
237     OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
238 
239     gdim[0] = 256;
240     gdim[1] = h->mb.i_mb_height;
241     ldim[0] = 256;
242     ldim[1] = 1;
243     arg = 0;
244     OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
245     OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
246     OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
247     OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
248     OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
249     OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
250 
251     if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
252         x264_opencl_flush( h );
253 
254     int size = h->mb.i_mb_count * sizeof(int16_t);
255     locked = opencl_alloc_locked( h, size );
256     OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
257     h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
258     h->opencl.copies[h->opencl.num_copies].src = locked;
259     h->opencl.copies[h->opencl.num_copies].bytes = size;
260     h->opencl.num_copies++;
261 
262     size = h->mb.i_mb_height * sizeof(int);
263     locked = opencl_alloc_locked( h, size );
264     OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
265     h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
266     h->opencl.copies[h->opencl.num_copies].src = locked;
267     h->opencl.copies[h->opencl.num_copies].bytes = size;
268     h->opencl.num_copies++;
269 
270     size = sizeof(int) * 4;
271     locked = opencl_alloc_locked( h, size );
272     OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
273     h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
274     h->opencl.copies[h->opencl.num_copies].src = locked;
275     h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
276     h->opencl.num_copies++;
277     h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
278     h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
279     h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
280     h->opencl.num_copies++;
281 
282     h->opencl.last_buf = !h->opencl.last_buf;
283     return 0;
284 }
285 
286 /* This function was tested emprically on a number of AMD and NV GPUs.  Making a
287  * function which returns perfect launch dimensions is impossible; some
288  * applications will have self-tuning code to try many possible variables and
289  * measure the runtime.  Here we simply make an educated guess based on what we
290  * know GPUs typically prefer.  */
optimal_launch_dims(x264_t * h,size_t * gdims,size_t * ldims,const cl_kernel kernel,const cl_device_id device)291 static void optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
292 {
293     x264_opencl_function_t *ocl = h->opencl.ocl;
294     size_t max_work_group = 256;    /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
295     size_t preferred_multiple = 64;
296     cl_uint num_cus = 6;
297 
298     ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
299     ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
300     ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );
301 
302     ldims[0] = preferred_multiple;
303     ldims[1] = 8;
304 
305     /* make ldims[1] an even divisor of gdims[1] */
306     while( gdims[1] & (ldims[1] - 1) )
307     {
308         ldims[0] <<= 1;
309         ldims[1] >>= 1;
310     }
311     /* make total ldims fit under the max work-group dimensions for the device */
312     while( ldims[0] * ldims[1] > max_work_group )
313     {
314         if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
315             ldims[1] >>= 1;
316         else
317             ldims[0] >>= 1;
318     }
319 
320     if( ldims[0] > gdims[0] )
321     {
322         /* remove preferred multiples until we're close to gdims[0] */
323         while( gdims[0] + preferred_multiple < ldims[0] )
324             ldims[0] -= preferred_multiple;
325         gdims[0] = ldims[0];
326     }
327     else
328     {
329         /* make gdims an even multiple of ldims */
330         gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
331         gdims[0] *= ldims[0];
332     }
333 
334     /* make ldims smaller to spread work across compute units */
335     while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
336     {
337         if( ldims[0] > preferred_multiple )
338             ldims[0] >>= 1;
339         else if( ldims[1] > 1 )
340             ldims[1] >>= 1;
341         else
342             break;
343     }
344     /* for smaller GPUs, try not to abuse their texture cache */
345     if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
346         ldims[0] = 32;
347 }
348 
x264_opencl_motionsearch(x264_t * h,x264_frame_t ** frames,int b,int ref,int b_islist1,int lambda,const x264_weight_t * w)349 int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
350 {
351     x264_opencl_function_t *ocl = h->opencl.ocl;
352     x264_frame_t *fenc = frames[b];
353     x264_frame_t *fref = frames[ref];
354 
355     cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
356     cl_mem ref_luma_hpel;
357     cl_int status;
358 
359     if( w && w->weightfn )
360     {
361         size_t gdims[2];
362 
363         gdims[0] = 8 * h->mb.i_mb_width;
364         gdims[1] = 8 * h->mb.i_mb_height;
365 
366         /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
367         for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
368         {
369             cl_uint arg = 0;
370             OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
371             OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
372             OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
373             OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
374             OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
375             OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
376 
377             gdims[0] >>= 1;
378             gdims[1] >>= 1;
379             if( gdims[0] < 16 || gdims[1] < 16 )
380                 break;
381         }
382 
383         cl_uint arg = 0;
384         gdims[0] = 8 * h->mb.i_mb_width;
385         gdims[1] = 8 * h->mb.i_mb_height;
386 
387         OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
388         OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
389         OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
390         OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
391         OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
392         OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
393 
394         /* Use weighted reference planes for motion search */
395         for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
396             ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
397         ref_luma_hpel = h->opencl.weighted_luma_hpel;
398     }
399     else
400     {
401         /* Use unweighted reference planes for motion search */
402         for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
403             ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
404         ref_luma_hpel = fref->opencl.luma_hpel;
405     }
406 
407     const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
408     int b_first_iteration = 1;
409     int b_reverse_references = 1;
410     int A = 1;
411 
412 
413     int mb_per_group = 0;
414     int cost_local_size = 0;
415     int mvc_local_size = 0;
416     int mb_width;
417 
418     size_t gdims[2];
419     size_t ldims[2];
420 
421     /* scale 0 is 8x8 */
422     for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
423     {
424         mb_width = h->mb.i_mb_width >> scale;
425         gdims[0] = mb_width;
426         gdims[1] = h->mb.i_mb_height >> scale;
427         if( gdims[0] < 2 || gdims[1] < 2 )
428             continue;
429         gdims[0] <<= 2;
430         optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
431 
432         mb_per_group = (ldims[0] >> 2) * ldims[1];
433         cost_local_size = 4 * mb_per_group * sizeof(int16_t);
434         mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
435         int scaled_me_range = h->param.analyse.i_me_range >> scale;
436         int b_shift_index = 1;
437 
438         cl_uint arg = 0;
439         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
440         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
441         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
442         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
443         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
444         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
445         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
446         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
447         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
448         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
449         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
450         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
451         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
452         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
453         OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
454 
455         for( int iter = 0; iter < num_iterations[scale]; iter++ )
456         {
457             OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
458 
459             b_shift_index = 0;
460             b_first_iteration = 0;
461 
462             /* alternate top-left vs bot-right MB references at lower scales, so
463              * motion field smooths more quickly.  */
464             if( scale > 2 )
465                 b_reverse_references ^= 1;
466             else
467                 b_reverse_references = 0;
468             A = !A;
469             OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
470             OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
471             OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
472             OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
473             OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
474         }
475     }
476 
477     int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
478     cl_uint arg = 0;
479     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
480     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
481     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
482     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
483     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
484     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
485     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
486 
487     if( b_islist1 )
488     {
489         OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
490         OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
491     }
492     else
493     {
494         OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
495         OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
496     }
497 
498     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
499     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
500     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
501     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
502     OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
503 
504     if( h->opencl.b_device_AMD_SI )
505     {
506         /* workaround for AMD Southern Island driver scheduling bug (fixed in
507          * July 2012), perform meaningless small copy to add a data dependency */
508         OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
509     }
510 
511     OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
512 
513     int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
514 
515     if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
516         x264_opencl_flush( h );
517 
518     char *locked = opencl_alloc_locked( h, mvlen );
519     h->opencl.copies[h->opencl.num_copies].src = locked;
520     h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
521 
522     if( b_islist1 )
523     {
524         int mvs_offset = mvlen * (ref - b - 1);
525         OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
526         h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
527     }
528     else
529     {
530         int mvs_offset = mvlen * (b - ref - 1);
531         OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
532         h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
533     }
534 
535     h->opencl.num_copies++;
536 
537     return 0;
538 }
539 
x264_opencl_finalize_cost(x264_t * h,int lambda,x264_frame_t ** frames,int p0,int p1,int b,int dist_scale_factor)540 int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
541 {
542     x264_opencl_function_t *ocl = h->opencl.ocl;
543     cl_int status;
544     x264_frame_t *fenc = frames[b];
545     x264_frame_t *fref0 = frames[p0];
546     x264_frame_t *fref1 = frames[p1];
547 
548     int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
549 
550     /* Tasks for this kernel:
551      * 1. Select least cost mode (intra, ref0, ref1)
552      *    list_used 0, 1, 2, or 3.  if B frame, do not allow intra
553      * 2. if B frame, try bidir predictions.
554      * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
555     size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
556     size_t ldim_bidir[2];
557     size_t *ldims = NULL;
558     int cost_local_size = 4;
559     int satd_local_size = 4;
560     if( b < p1 )
561     {
562         /* For B frames, use 4 threads per MB for BIDIR checks */
563         ldims = ldim_bidir;
564         gdims[0] <<= 2;
565         optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
566         int mb_per_group = (ldims[0] >> 2) * ldims[1];
567         cost_local_size = 4 * mb_per_group * sizeof(int16_t);
568         satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
569     }
570 
571     cl_uint arg = 0;
572     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
573     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
574     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
575     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
576     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
577     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
578     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
579     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
580     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
581     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
582     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
583     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
584     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
585     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
586     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
587     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
588     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
589     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
590     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
591     OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
592     OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
593 
594     /* Sum costs across rows, atomicAdd down frame */
595     size_t gdim[2] = { 256, h->mb.i_mb_height };
596     size_t ldim[2] = { 256, 1 };
597 
598     arg = 0;
599     OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
600     OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
601     OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
602     OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
603     OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
604     OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
605     OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
606     OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
607     OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
608     OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
609 
610     if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
611         x264_opencl_flush( h );
612 
613     int size =  h->mb.i_mb_count * sizeof(int16_t);
614     char *locked = opencl_alloc_locked( h, size );
615     h->opencl.copies[h->opencl.num_copies].src = locked;
616     h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
617     h->opencl.copies[h->opencl.num_copies].bytes = size;
618     OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
619     h->opencl.num_copies++;
620 
621     size =  h->mb.i_mb_height * sizeof(int);
622     locked = opencl_alloc_locked( h, size );
623     h->opencl.copies[h->opencl.num_copies].src = locked;
624     h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
625     h->opencl.copies[h->opencl.num_copies].bytes = size;
626     OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
627     h->opencl.num_copies++;
628 
629     size =  4 * sizeof(int);
630     locked = opencl_alloc_locked( h, size );
631     OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
632     h->opencl.last_buf = !h->opencl.last_buf;
633 
634     h->opencl.copies[h->opencl.num_copies].src = locked;
635     h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
636     h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
637     h->opencl.num_copies++;
638     h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
639     h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
640     h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
641     h->opencl.num_copies++;
642 
643     if( b == p1 ) // P frames only
644     {
645         h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
646         h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
647         h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
648         h->opencl.num_copies++;
649     }
650     return 0;
651 }
652 
x264_opencl_slicetype_prep(x264_t * h,x264_frame_t ** frames,int num_frames,int lambda)653 void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
654 {
655     if( h->param.b_opencl )
656     {
657 #ifdef _WIN32
658         /* Temporarily boost priority of this lookahead thread and the OpenCL
659          * driver's thread until the end of this function.  On AMD GPUs this
660          * greatly reduces the latency of enqueuing kernels and getting results
661          * on Windows. */
662         HANDLE id = GetCurrentThread();
663         h->opencl.lookahead_thread_pri = GetThreadPriority( id );
664         SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
665         x264_opencl_function_t *ocl = h->opencl.ocl;
666         cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
667         if( status == CL_SUCCESS )
668         {
669             h->opencl.opencl_thread_pri = GetThreadPriority( id );
670             SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
671         }
672 #endif
673 
674         /* precalculate intra and I frames */
675         for( int i = 0; i <= num_frames; i++ )
676             x264_opencl_lowres_init( h, frames[i], lambda );
677         x264_opencl_flush( h );
678 
679         if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe )
680         {
681             /* For trellis B-Adapt, precompute exhaustive motion searches */
682             for( int b = 0; b <= num_frames; b++ )
683             {
684                 for( int j = 1; j < h->param.i_bframe; j++ )
685                 {
686                     int p0 = b - j;
687                     if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF )
688                     {
689                         const x264_weight_t *w = x264_weight_none;
690 
691                         if( h->param.analyse.i_weighted_pred )
692                         {
693                             x264_emms();
694                             x264_weights_analyse( h, frames[b], frames[p0], 1 );
695                             w = frames[b]->weight[0];
696                         }
697                         frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
698                         x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
699                     }
700                     int p1 = b + j;
701                     if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF )
702                     {
703                         frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
704                         x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
705                     }
706                 }
707             }
708 
709             x264_opencl_flush( h );
710         }
711     }
712 }
713 
714 
x264_opencl_slicetype_end(x264_t * h)715 void x264_opencl_slicetype_end( x264_t *h )
716 {
717 #ifdef _WIN32
718     if( h->param.b_opencl )
719     {
720         HANDLE id = GetCurrentThread();
721         SetThreadPriority( id, h->opencl.lookahead_thread_pri );
722         x264_opencl_function_t *ocl = h->opencl.ocl;
723         cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
724         if( status == CL_SUCCESS )
725             SetThreadPriority( id, h->opencl.opencl_thread_pri );
726     }
727 #endif
728 }
729 
x264_opencl_precalculate_frame_cost(x264_t * h,x264_frame_t ** frames,int lambda,int p0,int p1,int b)730 int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
731 {
732     if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
733         return 0;
734     else
735     {
736         int do_search[2];
737         int dist_scale_factor = 128;
738         const x264_weight_t *w = x264_weight_none;
739 
740         // avoid duplicating work
741         frames[b]->i_cost_est[b-p0][p1-b] = 0;
742 
743         do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
744         do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
745         if( do_search[0] )
746         {
747             if( h->param.analyse.i_weighted_pred && b == p1 )
748             {
749                 x264_emms();
750                 x264_weights_analyse( h, frames[b], frames[p0], 1 );
751                 w = frames[b]->weight[0];
752             }
753             frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
754         }
755         if( do_search[1] )
756             frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
757         if( b == p1 )
758             frames[b]->i_intra_mbs[b-p0] = 0;
759         if( p1 != p0 )
760             dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
761 
762         frames[b]->i_cost_est[b-p0][p1-b] = 0;
763         frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
764 
765         x264_opencl_lowres_init( h, frames[b], lambda );
766 
767         if( do_search[0] )
768         {
769             x264_opencl_lowres_init( h, frames[p0], lambda );
770             x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
771         }
772         if( do_search[1] )
773         {
774             x264_opencl_lowres_init( h, frames[p1], lambda );
775             x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
776         }
777         x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
778         return 1;
779     }
780 }
781 
782 #endif
783