1/*
2 * This file is part of FFmpeg.
3 *
4 * FFmpeg is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * FFmpeg is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with FFmpeg; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19#define REFERENCE_WHITE 100.0f
20extern float3 lrgb2yuv(float3);
21extern float  lrgb2y(float3);
22extern float3 yuv2lrgb(float3);
23extern float3 lrgb2lrgb(float3);
24extern float  get_luma_src(float3);
25extern float  get_luma_dst(float3);
26extern float3 ootf(float3 c, float peak);
27extern float3 inverse_ootf(float3 c, float peak);
28extern float3 get_chroma_sample(float3, float3, float3, float3);
29
// Result of detect_peak_avg(): the peak and average signal levels that the
// tone mapping of the current frame should use.
struct detection_result {
    // Peak signal; passed as `peak` to map_one_pixel_rgb().
    float peak;
    // Average signal; passed as `average` to map_one_pixel_rgb().
    float average;
};
34
// Rational curve used by the hable() tone mapping operator.
// Evaluates ((in*(a*in + b*c) + d*e) / (in*(a*in + b) + d*f)) - e/f
// for a fixed set of curve constants.
float hable_f(float in) {
    const float a = 0.15f;
    const float b = 0.50f;
    const float c = 0.10f;
    const float d = 0.20f;
    const float e = 0.02f;
    const float f = 0.30f;

    const float numerator   = in * (in * a + b * c) + d * e;
    const float denominator = in * (in * a + b) + d * f;

    return numerator / denominator - e / f;
}
39
// Identity tone curve: the signal is returned unchanged.
// `peak` is accepted only so the signature matches the other tone functions.
float direct(float s, float peak) {
    (void)peak;
    return s;
}
43
44float linear(float s, float peak) {
45    return s * tone_param / peak;
46}
47
48float gamma(float s, float peak) {
49    float p = s > 0.05f ? s /peak : 0.05f / peak;
50    float v = powr(p, 1.0f / tone_param);
51    return s > 0.05f ? v : (s * v /0.05f);
52}
53
54float clip(float s, float peak) {
55    return clamp(s * tone_param, 0.0f, 1.0f);
56}
57
58float reinhard(float s, float peak) {
59    return s / (s + tone_param) * (peak + tone_param) / peak;
60}
61
// Hable tone curve: hable_f() of the signal, normalized by hable_f() of the
// peak so that s == peak maps to 1.0.
float hable(float s, float peak) {
    const float mapped_signal = hable_f(s);
    const float mapped_peak   = hable_f(peak);
    return mapped_signal / mapped_peak;
}
65
66float mobius(float s, float peak) {
67    float j = tone_param;
68    float a, b;
69
70    if (s <= j)
71        return s;
72
73    a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak);
74    b = (j * j - 2.0f * j * peak + peak) / max(peak - 1.0f, 1e-6f);
75
76    return (b * b + 2.0f * b * j + j * j) / (b - a) * (s + a) / (s + b);
77}
78
79// detect peak/average signal of a frame, the algorithm was ported from:
80// libplacebo (https://github.com/haasn/libplacebo)
81struct detection_result
82detect_peak_avg(global uint *util_buf, __local uint *sum_wg,
83            float signal, float peak) {
84// layout of the util buffer
85//
86// Name:             : Size (units of 4-bytes)
87// average buffer    : detection_frames + 1
88// peak buffer       : detection_frames + 1
89// workgroup counter : 1
90// total of peak     : 1
91// total of average  : 1
92// frame index       : 1
93// frame number      : 1
94    global uint *avg_buf = util_buf;
95    global uint *peak_buf = avg_buf + DETECTION_FRAMES + 1;
96    global uint *counter_wg_p = peak_buf + DETECTION_FRAMES + 1;
97    global uint *max_total_p = counter_wg_p + 1;
98    global uint *avg_total_p = max_total_p + 1;
99    global uint *frame_idx_p = avg_total_p + 1;
100    global uint *scene_frame_num_p = frame_idx_p + 1;
101
102    uint frame_idx = *frame_idx_p;
103    uint scene_frame_num = *scene_frame_num_p;
104
105    size_t lidx = get_local_id(0);
106    size_t lidy = get_local_id(1);
107    size_t lsizex = get_local_size(0);
108    size_t lsizey = get_local_size(1);
109    uint num_wg = get_num_groups(0) * get_num_groups(1);
110    size_t group_idx = get_group_id(0);
111    size_t group_idy = get_group_id(1);
112    struct detection_result r = {peak, sdr_avg};
113    if (lidx == 0 && lidy == 0)
114        *sum_wg = 0;
115    barrier(CLK_LOCAL_MEM_FENCE);
116
117    // update workgroup sum
118    atomic_add(sum_wg, (uint)(signal * REFERENCE_WHITE));
119    barrier(CLK_LOCAL_MEM_FENCE);
120
121    // update frame peak/avg using work-group-average.
122    if (lidx == 0 && lidy == 0) {
123        uint avg_wg = *sum_wg / (lsizex * lsizey);
124        atomic_max(&peak_buf[frame_idx], avg_wg);
125        atomic_add(&avg_buf[frame_idx], avg_wg);
126    }
127
128    if (scene_frame_num > 0) {
129        float peak = (float)*max_total_p / (REFERENCE_WHITE * scene_frame_num);
130        float avg = (float)*avg_total_p / (REFERENCE_WHITE * scene_frame_num);
131        r.peak = max(1.0f, peak);
132        r.average = max(0.25f, avg);
133    }
134
135    if (lidx == 0 && lidy == 0 && atomic_add(counter_wg_p, 1) == num_wg - 1) {
136        *counter_wg_p = 0;
137        avg_buf[frame_idx] /= num_wg;
138
139        if (scene_threshold > 0.0f) {
140            uint cur_max = peak_buf[frame_idx];
141            uint cur_avg = avg_buf[frame_idx];
142            int diff = (int)(scene_frame_num * cur_avg) - (int)*avg_total_p;
143
144            if (abs(diff) > scene_frame_num * scene_threshold * REFERENCE_WHITE) {
145                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
146                  avg_buf[i] = 0;
147                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
148                  peak_buf[i] = 0;
149                *avg_total_p = *max_total_p = 0;
150                *scene_frame_num_p = 0;
151                avg_buf[frame_idx] = cur_avg;
152                peak_buf[frame_idx] = cur_max;
153            }
154        }
155        uint next = (frame_idx + 1) % (DETECTION_FRAMES + 1);
156        // add current frame, subtract next frame
157        *max_total_p += peak_buf[frame_idx] - peak_buf[next];
158        *avg_total_p += avg_buf[frame_idx] - avg_buf[next];
159        // reset next frame
160        peak_buf[next] = avg_buf[next] = 0;
161        *frame_idx_p = next;
162        *scene_frame_num_p = min(*scene_frame_num_p + 1,
163                                 (uint)DETECTION_FRAMES);
164    }
165    return r;
166}
167
168float3 map_one_pixel_rgb(float3 rgb, float peak, float average) {
169    float sig = max(max(rgb.x, max(rgb.y, rgb.z)), 1e-6f);
170
171    // Rescale the variables in order to bring it into a representation where
172    // 1.0 represents the dst_peak. This is because all of the tone mapping
173    // algorithms are defined in such a way that they map to the range [0.0, 1.0].
174    if (target_peak > 1.0f) {
175        sig *= 1.0f / target_peak;
176        peak *= 1.0f / target_peak;
177    }
178
179    float sig_old = sig;
180
181    // Scale the signal to compensate for differences in the average brightness
182    float slope = min(1.0f, sdr_avg / average);
183    sig *= slope;
184    peak *= slope;
185
186    // Desaturate the color using a coefficient dependent on the signal level
187    if (desat_param > 0.0f) {
188        float luma = get_luma_dst(rgb);
189        float coeff = max(sig - 0.18f, 1e-6f) / max(sig, 1e-6f);
190        coeff = native_powr(coeff, 10.0f / desat_param);
191        rgb = mix(rgb, (float3)luma, (float3)coeff);
192        sig = mix(sig, luma * slope, coeff);
193    }
194
195    sig = TONE_FUNC(sig, peak);
196
197    sig = min(sig, 1.0f);
198    rgb *= (sig/sig_old);
199    return rgb;
200}
201// map from source space YUV to destination space RGB
202float3 map_to_dst_space_from_yuv(float3 yuv, float peak) {
203    float3 c = yuv2lrgb(yuv);
204    c = ootf(c, peak);
205    c = lrgb2lrgb(c);
206    return c;
207}
208
209__kernel void tonemap(__write_only image2d_t dst1,
210                      __read_only  image2d_t src1,
211                      __write_only image2d_t dst2,
212                      __read_only  image2d_t src2,
213                      global uint *util_buf,
214                      float peak
215                      )
216{
217    __local uint sum_wg;
218    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
219                               CLK_ADDRESS_CLAMP_TO_EDGE   |
220                               CLK_FILTER_NEAREST);
221    int xi = get_global_id(0);
222    int yi = get_global_id(1);
223    // each work item process four pixels
224    int x = 2 * xi;
225    int y = 2 * yi;
226
227    float y0 = read_imagef(src1, sampler, (int2)(x,     y)).x;
228    float y1 = read_imagef(src1, sampler, (int2)(x + 1, y)).x;
229    float y2 = read_imagef(src1, sampler, (int2)(x,     y + 1)).x;
230    float y3 = read_imagef(src1, sampler, (int2)(x + 1, y + 1)).x;
231    float2 uv = read_imagef(src2, sampler, (int2)(xi,     yi)).xy;
232
233    float3 c0 = map_to_dst_space_from_yuv((float3)(y0, uv.x, uv.y), peak);
234    float3 c1 = map_to_dst_space_from_yuv((float3)(y1, uv.x, uv.y), peak);
235    float3 c2 = map_to_dst_space_from_yuv((float3)(y2, uv.x, uv.y), peak);
236    float3 c3 = map_to_dst_space_from_yuv((float3)(y3, uv.x, uv.y), peak);
237
238    float sig0 = max(c0.x, max(c0.y, c0.z));
239    float sig1 = max(c1.x, max(c1.y, c1.z));
240    float sig2 = max(c2.x, max(c2.y, c2.z));
241    float sig3 = max(c3.x, max(c3.y, c3.z));
242    float sig = max(sig0, max(sig1, max(sig2, sig3)));
243
244    struct detection_result r = detect_peak_avg(util_buf, &sum_wg, sig, peak);
245
246    float3 c0_old = c0, c1_old = c1, c2_old = c2;
247    c0 = map_one_pixel_rgb(c0, r.peak, r.average);
248    c1 = map_one_pixel_rgb(c1, r.peak, r.average);
249    c2 = map_one_pixel_rgb(c2, r.peak, r.average);
250    c3 = map_one_pixel_rgb(c3, r.peak, r.average);
251
252    c0 = inverse_ootf(c0, target_peak);
253    c1 = inverse_ootf(c1, target_peak);
254    c2 = inverse_ootf(c2, target_peak);
255    c3 = inverse_ootf(c3, target_peak);
256
257    y0 = lrgb2y(c0);
258    y1 = lrgb2y(c1);
259    y2 = lrgb2y(c2);
260    y3 = lrgb2y(c3);
261    float3 chroma_c = get_chroma_sample(c0, c1, c2, c3);
262    float3 chroma = lrgb2yuv(chroma_c);
263
264    if (xi < get_image_width(dst2) && yi < get_image_height(dst2)) {
265        write_imagef(dst1, (int2)(x, y), (float4)(y0, 0.0f, 0.0f, 1.0f));
266        write_imagef(dst1, (int2)(x+1, y), (float4)(y1, 0.0f, 0.0f, 1.0f));
267        write_imagef(dst1, (int2)(x, y+1), (float4)(y2, 0.0f, 0.0f, 1.0f));
268        write_imagef(dst1, (int2)(x+1, y+1), (float4)(y3, 0.0f, 0.0f, 1.0f));
269        write_imagef(dst2, (int2)(xi, yi),
270                     (float4)(chroma.y, chroma.z, 0.0f, 1.0f));
271    }
272}
273