1 /////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
2 
3 //  crt-royale: A full-featured CRT shader, with cheese.
4 //  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
5 //
6 //  This program is free software; you can redistribute it and/or modify it
7 //  under the terms of the GNU General Public License as published by the Free
8 //  Software Foundation; either version 2 of the License, or any later version.
9 //
10 //  This program is distributed in the hope that it will be useful, but WITHOUT
11 //  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 //  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 //  more details.
14 //
15 //  You should have received a copy of the GNU General Public License along with
16 //  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
17 //  Place, Suite 330, Boston, MA 02111-1307 USA
18 
//  Push-constant block for this pass, accessed through the instance `params`.
//  NOTE(review): the member order/types presumably have to match what the
//  frontend supplies for this pass -- confirm before reordering anything.
layout(push_constant) uniform Push
{
	//  .xy backs both ORIG_LINEARIZED*_size macros; .y also feeds
	//  bloom_approx_scale_x.
	vec4 SourceSize;
	//  Not referenced directly in this file.
	vec4 OriginalSize;
	//  .x feeds bloom_approx_scale_x.
	vec4 OutputSize;
	//  Not referenced directly in this file.
	uint FrameCount;
	//  Not referenced directly in this file (the ORIG_LINEARIZED*_size macros
	//  read SourceSize instead).
	vec4 ORIG_LINEARIZEDSize;
} params;
27 
//  Both ORIG_LINEARIZED size macros alias params.SourceSize.xy, so any
//  video_size/texture_size ratio built from the two of them evaluates to 1.
#define ORIG_LINEARIZEDvideo_size params.SourceSize.xy
#define ORIG_LINEARIZEDtexture_size params.SourceSize.xy

//  params.OutputSize.x divided by params.SourceSize.y.
//  NOTE(review): this divides an x component by a y component -- confirm that
//  mixing the two axes is intended.
float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y;
//  Not referenced directly in this file; presumably consumed by the headers
//  included below -- TODO confirm.
const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
33 
34 ///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
35 
36 #include "../../../../include/compat_macros.inc"
37 #include "../user-settings.h"
38 #include "bind-shader-params.h"
39 #include "../../../../include/gamma-management.h"
40 #include "derived-settings-and-constants.h"
41 #include "scanline-functions.h"
42 
//  Vertex-stage interface.  The outputs at locations 0-5 are declared again,
//  with the same names and locations, as inputs of the fragment stage.
#pragma stage vertex
layout(location = 0) in vec4 Position;
layout(location = 1) in vec2 TexCoord;
//  uv used by the fragment stage to sample ORIG_LINEARIZED:
layout(location = 0) out vec2 tex_uv;
//  uv spacing between blur/resize samples:
layout(location = 1) out vec2 blur_dxdy;
//  uv distance between (texels, same-field scanlines):
layout(location = 2) out vec2 uv_scanline_step;
//  Passed to get_bloom_approx_sigma() in the fragment stage:
layout(location = 3) out float estimated_viewport_size_x;
//  float2(1.0)/ORIG_LINEARIZEDtexture_size:
layout(location = 4) out vec2 texture_size_inv;
//  Scale from uv space to output pixels (used by tex2Dresize_gaussian4x4):
layout(location = 5) out vec2 tex_uv_to_pixel_scale;
52 
//  Vertex stage entry point: transforms the vertex and precomputes every
//  per-vertex quantity the fragment stage needs (sampling uv, blur sample
//  spacing, uv->pixel scale, scanline step and a viewport width estimate).
void main()
{
    gl_Position = global.MVP * Position;

    //  Convert the incoming coords to video uv space, then to uv coords for
    //  the ORIG_LINEARIZED texture:
    const float2 src_video_uv = TexCoord * IN.texture_size/IN.video_size;
    tex_uv = src_video_uv * ORIG_LINEARIZEDvideo_size /
        ORIG_LINEARIZEDtexture_size;

    //  Estimate the viewport width from the vertical size and the geometry
    //  aspect ratio; the fragment stage uses it to pick a runtime sigma:
    estimated_viewport_size_x =
        IN.video_size.y * geom_aspect_ratio_x/geom_aspect_ratio_y;

    //  Reciprocal texture size, needed both here and by the 4x4 resizer:
    texture_size_inv = float2(1.0)/ORIG_LINEARIZEDtexture_size;

    //  uv distance between output pixels.  This is a resize blur, so
    //  arbitrary upsizing is acceptable if filter_linearN = "true", and
    //  arbitrary downsizing is acceptable if mipmap_inputN = "true" too.
    //  The true 4x4 Gaussian resize (bloom_approx_filter > 1.5) snaps to
    //  texels and samples the nearest 4 when upsizing, so its spacing is
    //  clamped to at least one texel; the other filters use the raw ratio.
    float2 sample_spacing_scale = ORIG_LINEARIZEDvideo_size/IN.output_size;
    if(bloom_approx_filter > 1.5)
    {
        sample_spacing_scale = max(sample_spacing_scale, float2(1.0));
    }
    blur_dxdy = sample_spacing_scale * texture_size_inv;

    //  tex2Dresize_gaussian4x4 also needs a uv -> output pixel scale:
    tex_uv_to_pixel_scale = IN.output_size *
        ORIG_LINEARIZEDtexture_size / ORIG_LINEARIZEDvideo_size;

    //  Detect interlacing again so convergence offsets can be applied in this
    //  pass.  The step multiple is (1, 1) for progressive content and (1, 2)
    //  for interlaced content, giving the uv distance between
    //  (texels, same-field scanlines):
    const float2 source_video_size = ORIG_LINEARIZEDvideo_size;
    const float scanline_multiple =
        1.0 + float(is_interlaced(source_video_size.y));
    uv_scanline_step =
        float2(1.0, scanline_multiple) / ORIG_LINEARIZEDtexture_size;
}
98 
//  Fragment-stage interface.  Inputs 0-5 mirror the vertex-stage outputs at
//  the same locations.
#pragma stage fragment
//  NOTE(review): presumably requests an 8-bit sRGB render target for this
//  pass -- confirm against the slang shader spec.
#pragma format R8G8B8A8_SRGB
layout(location = 0) in vec2 tex_uv;
layout(location = 1) in vec2 blur_dxdy;
layout(location = 2) in vec2 uv_scanline_step;
layout(location = 3) in float estimated_viewport_size_x;
layout(location = 4) in vec2 texture_size_inv;
layout(location = 5) in vec2 tex_uv_to_pixel_scale;
layout(location = 0) out vec4 FragColor;
//  Only ORIG_LINEARIZED is sampled by the code in this file; Source and
//  Original are declared but not sampled here (included headers not checked).
layout(set = 0, binding = 2) uniform sampler2D Source;
layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED;
layout(set = 0, binding = 4) uniform sampler2D Original;
111 
112 //////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
113 
114 #include "../../../../include/blur-functions.h"
115 #include "bloom-functions.h"
116 #include "../../../../include/gamma-management.h"
117 
118 
119 ///////////////////////////////////  HELPERS  //////////////////////////////////
120 
float3 tex2Dresize_gaussian4x4(sampler2D tex, float2 tex_uv, float2 dxdy, float2 tex_size, float2 texture_size_inv, float2 tex_uv_to_pixel_scale, float sigma)
{
    //  Requires:   1.) All requirements of gamma-management.h must be satisfied!
    //              2.) filter_linearN must == "true" in your .cgp preset.
    //              3.) mipmap_inputN must == "true" in your .cgp preset if
    //                  IN.output_size << SRC.video_size.
    //              4.) dxdy should contain the uv pixel spacing:
    //                      dxdy = max(float2(1.0),
    //                          SRC.video_size/IN.output_size)/SRC.texture_size;
    //              5.) tex_size == SRC.texture_size
    //              6.) texture_size_inv == float2(1.0)/SRC.texture_size
    //              7.) tex_uv_to_pixel_scale == IN.output_size *
    //                      SRC.texture_size / SRC.video_size;
    //              8.) sigma is the desired Gaussian standard deviation, in
    //                  terms of output pixels.  It should be < ~0.66171875 to
    //                  ensure the first unused sample (outside the 4x4 box) has
    //                  a weight < 1.0/256.0.  It must be nonzero: it is used
    //                  as a divisor below.
    //  Returns:    A true 4x4 Gaussian resize of the input (rgb only).
    //  Description:
    //  Given correct inputs, this Gaussian resizer samples 4 pixel locations
    //  along each downsized dimension and/or 4 texel locations along each
    //  upsized dimension.  It computes dynamic weights based on the pixel-space
    //  distance of each sample from the destination pixel.  It is arbitrarily
    //  resizable and higher quality than tex2Dblur3x3_resize, but it's slower.
    //  Sample layout (row-major, rows separated by dxdy.y, columns by dxdy.x):
    //       0  1  2  3
    //       4  5  6  7
    //       8  9 10 11
    //      12 13 14 15
    //  TODO: Move this to a more suitable file once there are others like it.
    const float denom_inv = 0.5/(sigma*sigma);
    //  We're taking 4x4 samples, and we're snapping to texels for upsizing.
    //  Find texture coords for sample 5 (second row, second column):
    const float2 curr_texel = tex_uv * tex_size;
    const float2 prev_texel =
        floor(curr_texel - float2(under_half)) + float2(0.5);
    const float2 prev_texel_uv = prev_texel * texture_size_inv;
    //  Per axis: 1.0 when the sample spacing is at most one texel (snap to the
    //  texel center), 0.0 otherwise (use the downsizing position).
    const float2 snap = float2((dxdy.x <= texture_size_inv.x), (dxdy.y <= texture_size_inv.y));
    const float2 sample5_downsize_uv = tex_uv - 0.5 * dxdy;
    const float2 sample5_uv = lerp(sample5_downsize_uv, prev_texel_uv, snap);
    //  Compute texture coords for other samples:
    const float2 dx = float2(dxdy.x, 0.0);
    const float2 sample0_uv = sample5_uv - dxdy;
    const float2 sample10_uv = sample5_uv + dxdy;
    const float2 sample15_uv = sample5_uv + 2.0 * dxdy;
    const float2 sample1_uv = sample0_uv + dx;
    const float2 sample2_uv = sample0_uv + 2.0 * dx;
    const float2 sample3_uv = sample0_uv + 3.0 * dx;
    const float2 sample4_uv = sample5_uv - dx;
    const float2 sample6_uv = sample5_uv + dx;
    const float2 sample7_uv = sample5_uv + 2.0 * dx;
    const float2 sample8_uv = sample10_uv - 2.0 * dx;
    const float2 sample9_uv = sample10_uv - dx;
    const float2 sample11_uv = sample10_uv + dx;
    const float2 sample12_uv = sample15_uv - 3.0 * dx;
    const float2 sample13_uv = sample15_uv - 2.0 * dx;
    const float2 sample14_uv = sample15_uv - dx;
    //  Load each sample.  Every fetch must use its own sampleN_uv so the color
    //  matches the position its weight wN is computed from (dx is only a
    //  one-column step vector, never a sampling position):
    float3 sample0 = tex2D_linearize(tex, sample0_uv).rgb;
    float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
    float3 sample2 = tex2D_linearize(tex, sample2_uv).rgb;
    float3 sample3 = tex2D_linearize(tex, sample3_uv).rgb;
    float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
    float3 sample5 = tex2D_linearize(tex, sample5_uv).rgb;
    float3 sample6 = tex2D_linearize(tex, sample6_uv).rgb;
    float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
    float3 sample8 = tex2D_linearize(tex, sample8_uv).rgb;
    float3 sample9 = tex2D_linearize(tex, sample9_uv).rgb;
    float3 sample10 = tex2D_linearize(tex, sample10_uv).rgb;
    float3 sample11 = tex2D_linearize(tex, sample11_uv).rgb;
    float3 sample12 = tex2D_linearize(tex, sample12_uv).rgb;
    float3 sample13 = tex2D_linearize(tex, sample13_uv).rgb;
    float3 sample14 = tex2D_linearize(tex, sample14_uv).rgb;
    float3 sample15 = tex2D_linearize(tex, sample15_uv).rgb;
    //  Compute destination pixel offsets for each sample:
    const float2 dest_pixel = tex_uv * tex_uv_to_pixel_scale;
    const float2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel;
    const float2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel;
    //  Compute Gaussian sample weights, exp(-d^2 / (2*sigma^2)):
    const float w0 = exp(-LENGTH_SQ(sample0_offset) * denom_inv);
    const float w1 = exp(-LENGTH_SQ(sample1_offset) * denom_inv);
    const float w2 = exp(-LENGTH_SQ(sample2_offset) * denom_inv);
    const float w3 = exp(-LENGTH_SQ(sample3_offset) * denom_inv);
    const float w4 = exp(-LENGTH_SQ(sample4_offset) * denom_inv);
    const float w5 = exp(-LENGTH_SQ(sample5_offset) * denom_inv);
    const float w6 = exp(-LENGTH_SQ(sample6_offset) * denom_inv);
    const float w7 = exp(-LENGTH_SQ(sample7_offset) * denom_inv);
    const float w8 = exp(-LENGTH_SQ(sample8_offset) * denom_inv);
    const float w9 = exp(-LENGTH_SQ(sample9_offset) * denom_inv);
    const float w10 = exp(-LENGTH_SQ(sample10_offset) * denom_inv);
    const float w11 = exp(-LENGTH_SQ(sample11_offset) * denom_inv);
    const float w12 = exp(-LENGTH_SQ(sample12_offset) * denom_inv);
    const float w13 = exp(-LENGTH_SQ(sample13_offset) * denom_inv);
    const float w14 = exp(-LENGTH_SQ(sample14_offset) * denom_inv);
    const float w15 = exp(-LENGTH_SQ(sample15_offset) * denom_inv);
    //  Normalize so the 16 weights sum to 1:
    const float weight_sum_inv = 1.0/(
        w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 +
        w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15);
    //  Weight and sum the samples:
    const float3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15;
    return sum * weight_sum_inv;
}
235 
//  Fragment stage entry point: resizes/blurs ORIG_LINEARIZED with the filter
//  selected by bloom_approx_filter (> 1.5: 4x4 Gaussian resize, > 0.5: 3x3
//  resize blur, otherwise plain bilinear), optionally sampling once per color
//  beam with convergence offsets, and writes the encoded result to FragColor.
void main()
{
    //  Would a viewport-relative size work better for this pass?  (No.)
    //  PROS:
    //  1.) Instead of writing an absolute size to user-cgp-constants.h, we'd
    //      write a viewport scale.  That number could be used to directly scale
    //      the viewport-resolution bloom sigma and/or triad size to a smaller
    //      scale.  This way, we could calculate an optimal dynamic sigma no
    //      matter how the dot pitch is specified.
    //  CONS:
    //  1.) Texel smearing would be much worse at small viewport sizes, but
    //      performance would be much worse at large viewport sizes, so there
    //      would be no easy way to calculate a decent scale.
    //  2.) Worse, we could no longer get away with using a constant-size blur!
    //      Instead, we'd have to face all the same difficulties as the real
    //      phosphor bloom, which requires static #ifdefs to decide the blur
    //      size based on the expected triad size...a dynamic value.
    //  3.) Like the phosphor bloom, we'd have less control over making the blur
    //      size correct for an optical blur.  That said, we likely overblur (to
    //      maintain brightness) more than the eye would do by itself: 20/20
    //      human vision distinguishes ~1 arc minute, or 1/60 of a degree.  The
    //      highest viewing angle recommendation I know of is THX's 40.04 degree
    //      recommendation, at which 20/20 vision can distinguish about 2402.4
    //      lines.  Assuming the "TV lines" definition, that means 1201.2
    //      distinct light lines and 1201.2 distinct dark lines can be told
    //      apart, i.e. 1201.2 pairs of lines.  This would correspond to 1201.2
    //      pairs of alternating lit/unlit phosphors, so 2402.4 phosphors total
    //      (if they're alternately lit).  That's a max of 800.8 triads.  Using
    //      a more popular 30 degree viewing angle recommendation, 20/20 vision
    //      can distinguish 1800 lines, or 600 triads of alternately lit
    //      phosphors.  In contrast, we currently blur phosphors all the way
    //      down to 341.3 triads to ensure full brightness.
    //  4.) Realistically speaking, we're usually just going to use bilinear
    //      filtering in this pass anyway, but it only works well to limit
    //      bandwidth if it's done at a small constant scale.

    //  Get the constants we need to sample:
    const float2 texture_size_ = ORIG_LINEARIZEDtexture_size;

    //  Per-beam sampling coords.  These are only written and only read when
    //  beam_misconvergence is enabled.
    float2 tex_uv_r, tex_uv_g, tex_uv_b;
    if(beam_misconvergence)
    {
        //  Offsets are expressed in (texel, same-field scanline) steps, so
        //  scale them by the interpolated uv_scanline_step input:
        const float2 convergence_offsets_r = get_convergence_offsets_r_vector();
        const float2 convergence_offsets_g = get_convergence_offsets_g_vector();
        const float2 convergence_offsets_b = get_convergence_offsets_b_vector();
        tex_uv_r = tex_uv - convergence_offsets_r * uv_scanline_step;
        tex_uv_g = tex_uv - convergence_offsets_g * uv_scanline_step;
        tex_uv_b = tex_uv - convergence_offsets_b * uv_scanline_step;
    }
    //  Get the blur sigma:
    const float bloom_approx_sigma = get_bloom_approx_sigma(IN.output_size.x,
        estimated_viewport_size_x);

    //  Sample the resized and blurred texture, and apply convergence offsets if
    //  necessary.  Applying convergence offsets here triples our samples from
    //  16/9/1 to 48/27/3, but faster and easier than sampling BLOOM_APPROX and
    //  HALATION_BLUR 3 times at full resolution every time they're used.
    float3 color_r, color_g, color_b, color;
    if(bloom_approx_filter > 1.5)
    {
        //  Use a 4x4 Gaussian resize.  This is slower but technically correct.
        if(beam_misconvergence)
        {
            color_r = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_r,
                blur_dxdy, texture_size_, texture_size_inv,
                tex_uv_to_pixel_scale, bloom_approx_sigma);
            color_g = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_g,
                blur_dxdy, texture_size_, texture_size_inv,
                tex_uv_to_pixel_scale, bloom_approx_sigma);
            color_b = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_b,
                blur_dxdy, texture_size_, texture_size_inv,
                tex_uv_to_pixel_scale, bloom_approx_sigma);
        }
        else
        {
            color = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv,
                blur_dxdy, texture_size_, texture_size_inv,
                tex_uv_to_pixel_scale, bloom_approx_sigma);
        }
    }
    else if(bloom_approx_filter > 0.5)
    {
        //  Use a 3x3 resize blur.  This is the softest option, because we're
        //  blurring already blurry bilinear samples.  It doesn't play quite as
        //  nicely with convergence offsets, but it has its charms.
        if(beam_misconvergence)
        {
            color_r = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_r,
                blur_dxdy, bloom_approx_sigma);
            color_g = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_g,
                blur_dxdy, bloom_approx_sigma);
            color_b = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_b,
                blur_dxdy, bloom_approx_sigma);
        }
        else
        {
            //  Pass the runtime sigma here too, so the blur strength does not
            //  depend on whether misconvergence is enabled.
            color = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv,
                blur_dxdy, bloom_approx_sigma);
        }
    }
    else
    {
        //  Use bilinear sampling.  This approximates a 4x4 Gaussian resize MUCH
        //  better than tex2Dblur3x3_resize for the very small sigmas we're
        //  likely to use at small output resolutions.  (This estimate becomes
        //  too sharp above ~400x300, but the blurs break down above that
        //  resolution too, unless min_allowed_viewport_triads is high enough to
        //  keep bloom_approx_scale_x/min_allowed_viewport_triads < ~1.1658025.)
        if(beam_misconvergence)
        {
            color_r = tex2D_linearize(ORIG_LINEARIZED, tex_uv_r).rgb;
            color_g = tex2D_linearize(ORIG_LINEARIZED, tex_uv_g).rgb;
            color_b = tex2D_linearize(ORIG_LINEARIZED, tex_uv_b).rgb;
        }
        else
        {
            color = tex2D_linearize(ORIG_LINEARIZED, tex_uv).rgb;
        }
    }
    //  Pack the colors from the red/green/blue beams into a single vector:
    if(beam_misconvergence)
    {
        color = float3(color_r.r, color_g.g, color_b.b);
    }
    //  Encode and output the blurred image.  This must be the color computed
    //  above; re-sampling ORIG_LINEARIZED at tex_uv here would throw away the
    //  selected filter and the convergence offsets.
    FragColor = encode_output(float4(color, 1.0));
}
368