/*
 * This file is part of mpv.
 *
 * mpv is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * mpv is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include "error_diffusion.h"

#include "common/common.h"

#define GLSL(...) gl_sc_addf(sc, __VA_ARGS__)
#define GLSLH(...) gl_sc_haddf(sc, __VA_ARGS__)

// After the (y, x) -> (y, x + y * shift) mapping, find the rightmost column
// that will be affected by the current column.
static int compute_rightmost_shifted_column(const struct error_diffusion_kernel *k)
{
    int ret = 0;
    for (int y = 0; y <= EF_MAX_DELTA_Y; y++) {
        for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) {
            if (k->pattern[y][x - EF_MIN_DELTA_X] != 0) {
                int shifted_x = x + y * k->shift;

                // The shift mapping guarantees that the current column (and
                // anything left of it) won't be affected by error diffusion.
                assert(shifted_x > 0);

                ret = MPMAX(ret, shifted_x);
            }
        }
    }
    return ret;
}
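
// For example, with the floyd-steinberg kernel defined below (shift=2, and
// assuming EF_MIN_DELTA_X == -2 so a pattern row's center is column index 2):
//
//     weight 7 at (y=0, x=+1) -> shifted_x =  1 + 0*2 = 1
//     weight 3 at (y=1, x=-1) -> shifted_x = -1 + 1*2 = 1
//     weight 5 at (y=1, x= 0) -> shifted_x =  0 + 1*2 = 2
//     weight 1 at (y=1, x=+1) -> shifted_x =  1 + 1*2 = 3
//
// so compute_rightmost_shifted_column() returns 3.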

const struct error_diffusion_kernel *mp_find_error_diffusion_kernel(const char *name)
{
    if (!name)
        return NULL;
    for (const struct error_diffusion_kernel *k = mp_error_diffusion_kernels;
         k->name;
         k++) {
        if (strcmp(k->name, name) == 0)
            return k;
    }
    return NULL;
}
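
// An illustrative lookup (names come from the table at the bottom of this
// file); unknown names simply yield NULL:
//
//     const struct error_diffusion_kernel *k =
//         mp_find_error_diffusion_kernel("floyd-steinberg");  // found
//     assert(mp_find_error_diffusion_kernel("no-such-kernel") == NULL);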

int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k,
                                     int height)
{
    // We add EF_MAX_DELTA_Y empty lines at the bottom to handle errors
    // propagated out of the bottom edge.
    int rows = height + EF_MAX_DELTA_Y;
    int shifted_columns = compute_rightmost_shifted_column(k) + 1;

    // The shared memory is an array of size rows*shifted_columns. Each
    // element is a single uint holding all three RGB components.
    return rows * shifted_columns * 4;
}
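
// A worked example, assuming EF_MAX_DELTA_Y == 2 (the patterns below have
// three rows): for floyd-steinberg the rightmost shifted column is 3, so for
// an image 1080 pixels high this returns (1080 + 2) * (3 + 1) * 4 = 17312
// bytes of shared memory.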

void pass_error_diffusion(struct gl_shader_cache *sc,
                          const struct error_diffusion_kernel *k,
                          int tex, int width, int height, int depth, int block_size)
{
    assert(block_size <= height);

    // The parallel error diffusion works by applying the shift mapping first.
    // Take the Floyd-Steinberg algorithm as an example. After applying the
    // (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are
    // propagated into the next few columns, which makes parallel processing
    // of the same column possible.
    //
    //           X    7/16                X    7/16
    //    3/16  5/16  1/16   ==>    0     0    3/16  5/16  1/16
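    //
    // Concretely: the 3/16 weight at (y=1, x=-1) moves to column -1 + 1*2 = 1,
    // i.e. strictly to the right of the current column, so no pixel of a
    // column ever depends on another pixel of the same column.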

    // Figure out the size of the rectangle containing all shifted pixels.
    // The rectangle's height is unchanged.
    int shifted_width = width + (height - 1) * k->shift;

    // We process all pixels of the shifted rectangle column by column, with
    // a single global work group of size |block_size|.
    // Figure out how many blocks are required to process all pixels. We need
    // this explicitly to make the number of barrier() calls match.
    int blocks = (height * shifted_width + block_size - 1) / block_size;
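
    // (This is the usual round-up division: e.g. for a 1920x1080 image with
    // shift=2, shifted_width = 1920 + 1079*2 = 4078, and with block_size=256
    // that is ceil(1080 * 4078 / 256) = 17205 blocks.)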

    // If we know how many of the following columns will be affected while
    // the current column is being processed, we only have to store the
    // errors of those few columns in shared memory. Using a ring buffer
    // further saves cost while iterating to the next column.
    int ring_buffer_rows = height + EF_MAX_DELTA_Y;
    int ring_buffer_columns = compute_rightmost_shifted_column(k) + 1;
    int ring_buffer_size = ring_buffer_rows * ring_buffer_columns;
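
    // For floyd-steinberg at height=1080 (and assuming EF_MAX_DELTA_Y == 2),
    // the buffer spans 1082 rows by 4 columns, matching the size returned by
    // mp_ef_compute_shared_memory_size() above.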

    // Define the ring buffer in shared memory.
    GLSLH("shared uint err_rgb8[%d];\n", ring_buffer_size);

    // Initialize the ring buffer.
    GLSL("for (int i = int(gl_LocalInvocationIndex); i < %d; i += %d) ",
         ring_buffer_size, block_size);
    GLSL("err_rgb8[i] = 0;\n");

    GLSL("for (int block_id = 0; block_id < %d; ++block_id) {\n", blocks);

    // Add a barrier here so the previous block is fully processed before the
    // next one starts.
    GLSL("groupMemoryBarrier();\n");
    GLSL("barrier();\n");

    // Compute the coordinates of the pixel we are currently processing, both
    // before and after the shift mapping.
    GLSL("int id = int(gl_LocalInvocationIndex) + block_id * %d;\n", block_size);
    GLSL("int y = id %% %d, x_shifted = id / %d;\n", height, height);
    GLSL("int x = x_shifted - y * %d;\n", k->shift);

    // Proceed only if we are processing a valid pixel.
    GLSL("if (0 <= x && x < %d) {\n", width);

    // The index of the current pixel in the ring buffer.
    GLSL("int idx = (x_shifted * %d + y) %% %d;\n", ring_buffer_rows, ring_buffer_size);

    // Fetch the current pixel.
    GLSL("vec3 pix = texelFetch(texture%d, ivec2(x, y), 0).rgb;\n", tex);

    // Dithering will quantize pixel values into multiples of 1/dither_quant.
    int dither_quant = (1 << depth) - 1;
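
    // For example, with depth=8 this is 255, i.e. the usual 8-bit
    // quantization steps.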

    // We encode the errors of the RGB components into a single 32-bit
    // unsigned integer. The error we propagate from the current pixel is in
    // the range [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite
    // obvious, the sum of all errors propagated into a pixel is also within
    // that range. It's thus possible to map errors in this range into
    // [-127, 127] and store each in an 8-bit value (using standard two's
    // complement). The three 8-bit values can then be packed into a single
    // 32-bit unsigned integer, with two 4-bit paddings to prevent overflow
    // from additions spilling into another component. There are at most 12
    // addition operations on each pixel, so 4 bits of padding should be
    // enough. Overflow out of the R component is simply discarded.
    //
    // The following figure shows how the encoding looks:
    //
    //     +------------------------------------+
    //     |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB|
    //     +------------------------------------+
    //

    // The bitshift positions for the R and G components.
    int bitshift_r = 24, bitshift_g = 12;
    // The multiplier we use to map [-0.5, 0.5] to [-127, 127].
    int uint8_mul = 127 * 2;
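
    // A worked example of the encoding: an error of +0.3 (in units of
    // quantization steps) becomes round(0.3 * 254) = 76 and is stored
    // verbatim; -0.3 becomes -76, stored as its two's complement 180. The
    // decode below adds a bias of 128 per component and subtracts it again,
    // so ((180 + 128) & 255) - 128 = -76 round-trips correctly.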

    // Add the error previously propagated into the current pixel, then clear
    // its slot in the buffer.
    GLSL("uint err_u32 = err_rgb8[idx] + %uu;\n",
         (128u << bitshift_r) | (128u << bitshift_g) | 128u);
    GLSL("pix = pix * %d.0 + vec3("
         "int((err_u32 >> %d) & 255u) - 128,"
         "int((err_u32 >> %d) & 255u) - 128,"
         "int( err_u32        & 255u) - 128"
         ") / %d.0;\n", dither_quant, bitshift_r, bitshift_g, uint8_mul);
    GLSL("err_rgb8[idx] = 0;\n");

    // Write the dithered pixel.
    GLSL("vec3 dithered = round(pix);\n");
    GLSL("imageStore(out_image, ivec2(x, y), vec4(dithered / %d.0, 0.0));\n",
         dither_quant);

    GLSL("vec3 err_divided = (pix - dithered) * %d.0 / %d.0;\n",
         uint8_mul, k->divisor);
    GLSL("ivec3 tmp;\n");

    // Group error propagations with the same weight factor together, to
    // reduce the number of (annoyingly verbose) error encodings.
    for (int dividend = 1; dividend <= k->divisor; dividend++) {
        bool err_assigned = false;

        for (int y = 0; y <= EF_MAX_DELTA_Y; y++) {
            for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) {
                if (k->pattern[y][x - EF_MIN_DELTA_X] != dividend)
                    continue;

                if (!err_assigned) {
                    err_assigned = true;

                    GLSL("tmp = ivec3(round(err_divided * %d.0));\n", dividend);

                    GLSL("err_u32 = "
                         "(uint(tmp.r & 255) << %d)|"
                         "(uint(tmp.g & 255) << %d)|"
                         " uint(tmp.b & 255);\n",
                         bitshift_r, bitshift_g);
                }

                int shifted_x = x + y * k->shift;

                // Unlike at the right border, errors propagated out of the
                // left border would remain in the ring buffer and produce
                // visible artifacts near that border, especially for shift=3
                // kernels, so the guard below discards them.
                if (x < 0)
                    GLSL("if (x >= %d) ", -x);

                // Calculate the new position in the ring buffer to propagate
                // the error into.
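                // For floyd-steinberg, for example, the weight-5 tap at
                // (y=1, x=0) maps to shifted_x = 2, so its error lands
                // ring_buffer_delta = 2 * ring_buffer_rows + 1 slots ahead
                // of the current pixel's slot.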
                int ring_buffer_delta = shifted_x * ring_buffer_rows + y;
                GLSL("atomicAdd(err_rgb8[(idx + %d) %% %d], err_u32);\n",
                     ring_buffer_delta, ring_buffer_size);
            }
        }
    }

    GLSL("}\n"); // if (0 <= x && x < width)

    GLSL("}\n"); // block_id
}

// Different kernels for error diffusion.
// Patterns are from http://www.efg2.com/Lab/Library/ImageProcessing/DHALF.TXT
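//
// Reading a pattern (assuming EF_MIN_DELTA_X == -2, consistent with the
// indexing above): each row is one scanline at y offset 0, 1, 2 below the
// current pixel, and the five columns are x offsets -2..+2, with the current
// pixel at column index 2 of row 0. E.g. floyd-steinberg's {0, 0, 0, 7, 0}
// row puts weight 7/16 on the pixel directly to the right.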
const struct error_diffusion_kernel mp_error_diffusion_kernels[] = {
    {
        .name = "simple",
        .shift = 1,
        .pattern = {{0, 0, 0, 1, 0},
                    {0, 0, 1, 0, 0},
                    {0, 0, 0, 0, 0}},
        .divisor = 2
    },
    {
        // The "false" Floyd-Steinberg kernel
        .name = "false-fs",
        .shift = 1,
        .pattern = {{0, 0, 0, 3, 0},
                    {0, 0, 3, 2, 0},
                    {0, 0, 0, 0, 0}},
        .divisor = 8
    },
    {
        .name = "sierra-lite",
        .shift = 2,
        .pattern = {{0, 0, 0, 2, 0},
                    {0, 1, 1, 0, 0},
                    {0, 0, 0, 0, 0}},
        .divisor = 4
    },
    {
        .name = "floyd-steinberg",
        .shift = 2,
        .pattern = {{0, 0, 0, 7, 0},
                    {0, 3, 5, 1, 0},
                    {0, 0, 0, 0, 0}},
        .divisor = 16
    },
    {
        .name = "atkinson",
        .shift = 2,
        .pattern = {{0, 0, 0, 1, 1},
                    {0, 1, 1, 1, 0},
                    {0, 0, 1, 0, 0}},
        .divisor = 8
    },
    // All kernels below have a shift value of 3, and are probably too heavy
    // for low-end GPUs.
    {
        .name = "jarvis-judice-ninke",
        .shift = 3,
        .pattern = {{0, 0, 0, 7, 5},
                    {3, 5, 7, 5, 3},
                    {1, 3, 5, 3, 1}},
        .divisor = 48
    },
    {
        .name = "stucki",
        .shift = 3,
        .pattern = {{0, 0, 0, 8, 4},
                    {2, 4, 8, 4, 2},
                    {1, 2, 4, 2, 1}},
        .divisor = 42
    },
    {
        .name = "burkes",
        .shift = 3,
        .pattern = {{0, 0, 0, 8, 4},
                    {2, 4, 8, 4, 2},
                    {0, 0, 0, 0, 0}},
        .divisor = 32
    },
    {
        .name = "sierra-3",
        .shift = 3,
        .pattern = {{0, 0, 0, 5, 3},
                    {2, 4, 5, 4, 2},
                    {0, 2, 3, 2, 0}},
        .divisor = 32
    },
    {
        .name = "sierra-2",
        .shift = 3,
        .pattern = {{0, 0, 0, 4, 3},
                    {1, 2, 3, 2, 1},
                    {0, 0, 0, 0, 0}},
        .divisor = 16
    },
    {0}
};