1#version 130
2
3/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
4
5//  crt-royale: A full-featured CRT shader, with cheese.
6//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
7//
8//  This program is free software; you can redistribute it and/or modify it
9//  under the terms of the GNU General Public License as published by the Free
10//  Software Foundation; either version 2 of the License, or any later version.
11//
12//  This program is distributed in the hope that it will be useful, but WITHOUT
13//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15//  more details.
16//
17//  You should have received a copy of the GNU General Public License along with
18//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19//  Place, Suite 330, Boston, MA 02111-1307 USA
20
21#pragma parameter crt_gamma "Simulated CRT Gamma" 2.5 1.0 5.0 0.025
22#pragma parameter lcd_gamma "Your Display Gamma" 2.2 1.0 5.0 0.025
23#pragma parameter levels_contrast "Contrast" 1.0 0.0 4.0 0.015625
24#pragma parameter halation_weight "Halation Weight" 0.0 0.0 1.0 0.005
25#pragma parameter diffusion_weight "Diffusion Weight" 0.075 0.0 1.0 0.005
26#pragma parameter bloom_underestimate_levels "Bloom - Underestimate Levels" 0.8 0.0 5.0 0.01
27#pragma parameter bloom_excess "Bloom - Excess" 0.0 0.0 1.0 0.005
28#pragma parameter beam_min_sigma "Beam - Min Sigma" 0.02 0.005 1.0 0.005
29#pragma parameter beam_max_sigma "Beam - Max Sigma" 0.3 0.005 1.0 0.005
30#pragma parameter beam_spot_power "Beam - Spot Power" 0.33 0.01 16.0 0.01
31#pragma parameter beam_min_shape "Beam - Min Shape" 2.0 2.0 32.0 0.1
32#pragma parameter beam_max_shape "Beam - Max Shape" 4.0 2.0 32.0 0.1
33#pragma parameter beam_shape_power "Beam - Shape Power" 0.25 0.01 16.0 0.01
34#pragma parameter beam_horiz_filter "Beam - Horiz Filter" 0.0 0.0 2.0 1.0
35#pragma parameter beam_horiz_sigma "Beam - Horiz Sigma" 0.35 0.0 0.67 0.005
36#pragma parameter beam_horiz_linear_rgb_weight "Beam - Horiz Linear RGB Weight" 1.0 0.0 1.0 0.01
37#pragma parameter convergence_offset_x_r "Convergence - Offset X Red" 0.0 -4.0 4.0 0.05
38#pragma parameter convergence_offset_x_g "Convergence - Offset X Green" 0.0 -4.0 4.0 0.05
39#pragma parameter convergence_offset_x_b "Convergence - Offset X Blue" 0.0 -4.0 4.0 0.05
40#pragma parameter convergence_offset_y_r "Convergence - Offset Y Red" 0.0 -2.0 2.0 0.05
41#pragma parameter convergence_offset_y_g "Convergence - Offset Y Green" 0.0 -2.0 2.0 0.05
42#pragma parameter convergence_offset_y_b "Convergence - Offset Y Blue" 0.0 -2.0 2.0 0.05
43#pragma parameter mask_type "Mask - Type" 1.0 0.0 2.0 1.0
44#pragma parameter mask_sample_mode_desired "Mask - Sample Mode" 0.0 0.0 2.0 1.0   //  Consider blocking mode 2.
45#pragma parameter mask_specify_num_triads "Mask - Specify Number of Triads" 0.0 0.0 1.0 1.0
46#pragma parameter mask_triad_size_desired "Mask - Triad Size Desired" 3.0 1.0 18.0 0.125
47#pragma parameter mask_num_triads_desired "Mask - Number of Triads Desired" 480.0 342.0 1920.0 1.0
48#pragma parameter aa_subpixel_r_offset_y_runtime "AA - Subpixel R Offset Y" 0.0 -0.333333333 0.333333333 0.333333333
49#pragma parameter aa_cubic_c "AA - Cubic Sharpness" 0.5 0.0 4.0 0.015625
50#pragma parameter aa_gauss_sigma "AA - Gaussian Sigma" 0.5 0.0625 1.0 0.015625
51#pragma parameter geom_mode_runtime "Geometry - Mode" 0.0 0.0 3.0 1.0
52#pragma parameter geom_radius "Geometry - Radius" 2.0 0.16 1024.0 0.1
53#pragma parameter geom_view_dist "Geometry - View Distance" 2.0 0.5 1024.0 0.25
54#pragma parameter geom_tilt_angle_x "Geometry - Tilt Angle X" 0.0 -3.14159265 3.14159265 0.017453292519943295
55#pragma parameter geom_tilt_angle_y "Geometry - Tilt Angle Y" 0.0 -3.14159265 3.14159265 0.017453292519943295
56#pragma parameter geom_aspect_ratio_x "Geometry - Aspect Ratio X" 432.0 1.0 512.0 1.0
57#pragma parameter geom_aspect_ratio_y "Geometry - Aspect Ratio Y" 329.0 1.0 512.0 1.0
58#pragma parameter geom_overscan_x "Geometry - Overscan X" 1.0 0.00390625 4.0 0.00390625
59#pragma parameter geom_overscan_y "Geometry - Overscan Y" 1.0 0.00390625 4.0 0.00390625
60#pragma parameter border_size "Border - Size" 0.015 0.0000001 0.5 0.005
61#pragma parameter border_darkness "Border - Darkness" 2.0 0.0 16.0 0.0625
62#pragma parameter border_compress "Border - Compression" 2.5 1.0 64.0 0.0625
63#pragma parameter interlace_bff "Interlacing - Bottom Field First" 0.0 0.0 1.0 1.0
64#pragma parameter interlace_1080i "Interlace - Detect 1080i" 0.0 0.0 1.0 1.0
65
66// compatibility macros for transparently converting HLSLisms into GLSLisms
67#define mul(a,b) (b*a)
68#define lerp(a,b,c) mix(a,b,c)
69#define saturate(c) clamp(c, 0.0, 1.0)
70#define frac(x) (fract(x))
71#define float2 vec2
72#define float3 vec3
73#define float4 vec4
74#define bool2 bvec2
75#define bool3 bvec3
76#define bool4 bvec4
77#define float2x2 mat2x2
78#define float3x3 mat3x3
79#define float4x4 mat4x4
80#define float4x3 mat4x3
81#define float2x4 mat2x4
82#define IN params
83#define texture_size TextureSize.xy
84#define video_size InputSize.xy
85#define output_size OutputSize.xy
86#define frame_count FrameCount
87#define static
88#define inline
89#define const
90#define fmod(x,y) mod(x,y)
91#define ddx(c) dFdx(c)
92#define ddy(c) dFdy(c)
93#define atan2(x,y) atan(x,y)
94#define rsqrt(c) inversesqrt(c)
95
96#if defined(GL_ES)
97	#define COMPAT_PRECISION mediump
98#else
99	#define COMPAT_PRECISION
100#endif
101
102#if __VERSION__ >= 130
103	#define COMPAT_TEXTURE texture
104#else
105	#define COMPAT_TEXTURE texture2D
106#endif
107
108/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
109
110#define LAST_PASS
111#define SIMULATE_CRT_ON_LCD
112
113//#include "../user-settings.h"
114
115/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
116
117#ifndef USER_SETTINGS_H
118#define USER_SETTINGS_H
119
120/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
121
122//  The Cg compiler uses different "profiles" with different capabilities.
123//  This shader requires a Cg compilation profile >= arbfp1, but a few options
124//  require higher profiles like fp30 or fp40.  The shader can't detect profile
125//  or driver capabilities, so instead you must comment or uncomment the lines
126//  below with "//" before "#define."  Disable an option if you get compilation
127//  errors resembling those listed.  Generally speaking, all of these options
128//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
129//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
130
131//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
132//  Among other things, derivatives help us fix anisotropic filtering artifacts
133//  with curved manually tiled phosphor mask coords.  Related errors:
134//  error C3004: function "float2 ddx(float2);" not supported in this profile
135//  error C3004: function "float2 ddy(float2);" not supported in this profile
136    //#define DRIVERS_ALLOW_DERIVATIVES
137
138//  Fine derivatives: Unsupported on older ATI cards.
139//  Fine derivatives enable 2x2 fragment block communication, letting us perform
140//  fast single-pass blur operations.  If your card uses coarse derivatives and
141//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
142    #ifdef DRIVERS_ALLOW_DERIVATIVES
143        #define DRIVERS_ALLOW_FINE_DERIVATIVES
144    #endif
145
146//  Dynamic looping: Requires an fp30 or newer profile.
147//  This makes phosphor mask resampling faster in some cases.  Related errors:
148//  error C5013: profile does not support "for" statements and "for" could not
149//  be unrolled
150    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
151
152//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
153//  Using one static loop avoids overhead if the user is right, but if the user
154//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
155//  binary search can potentially save some iterations.  However, it may fail:
156//  error C6001: Temporary register limit of 32 exceeded; 35 registers
157//  needed to compile program
158    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
159
160//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
161//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
162//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
163//  this profile
164    //#define DRIVERS_ALLOW_TEX2DLOD
165
166//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
167//  artifacts from anisotropic filtering and mipmapping.  Related errors:
168//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
169//  in this profile
170    //#define DRIVERS_ALLOW_TEX2DBIAS
171
172//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
173//  impose stricter limitations on register counts and instructions.  Enable
174//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
175//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
176//  to compile program.
177//  Enabling integrated graphics compatibility mode will automatically disable:
178//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
179//      (This may be reenabled in a later release.)
180//  2.) RUNTIME_GEOMETRY_MODE
181//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
182    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
183
184
185////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
186
187//  To disable a #define option, turn its line into a comment with "//."
188
189//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
190//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
191//  many of the options in this file and allow real-time tuning, but many of
192//  them are slower.  Disabling them and using this text file will boost FPS.
193#define RUNTIME_SHADER_PARAMS_ENABLE
194//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
195//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
196#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
197//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
198#define RUNTIME_ANTIALIAS_WEIGHTS
199//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
200//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
201//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
202//  parameters?  This will require more math or dynamic branching.
203#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
204//  Specify the tilt at runtime?  This makes things about 3% slower.
205#define RUNTIME_GEOMETRY_TILT
206//  Specify the geometry mode at runtime?
207#define RUNTIME_GEOMETRY_MODE
208//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
209//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
210//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
211#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
212
213//  PHOSPHOR MASK:
214//  Manually resize the phosphor mask for best results (slower)?  Disabling this
215//  removes the option to do so, but it may be faster without dynamic branches.
216    #define PHOSPHOR_MASK_MANUALLY_RESIZE
217//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
218    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
219//  Larger blurs are expensive, but we need them to blur larger triads.  We can
220//  detect the right blur if the triad size is static or our profile allows
221//  dynamic branches, but otherwise we use the largest blur the user indicates
222//  they might need:
223    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
224    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
225    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
226    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
227    //  Here's a helpful chart:
228    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
229    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
230    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
231    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
232    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
233    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
234
235
236///////////////////////////////  USER PARAMETERS  //////////////////////////////
237
238//  Note: Many of these static parameters are overridden by runtime shader
239//  parameters when those are enabled.  However, many others are static codepath
240//  options that were cleaner or more convert to code as static constants.
241
242//  GAMMA:
243    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
244    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
245
246//  LEVELS MANAGEMENT:
247    //  Control the final multiplicative image contrast:
248    static const float levels_contrast_static = 1.0;            //  range [0, 4)
249    //  We auto-dim to avoid clipping between passes and restore brightness
250    //  later.  Control the dim factor here: Lower values clip less but crush
251    //  blacks more (static only for now).
252    static const float levels_autodim_temp = 0.5;               //  range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0
253
254//  HALATION/DIFFUSION/BLOOM:
255    //  Halation weight: How much energy should be lost to electrons bounding
256    //  around under the CRT glass and exciting random phosphors?
257    static const float halation_weight_static = 0.0;            //  range [0, 1]
258    //  Refractive diffusion weight: How much light should spread/diffuse from
259    //  refracting through the CRT glass?
260    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
261    //  Underestimate brightness: Bright areas bloom more, but we can base the
262    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
263    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
264    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
265    //  Blur all colors more than necessary for a softer phosphor bloom?
266    static const float bloom_excess_static = 0.0;               //  range [0, 1]
267    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
268    //  blurred resize of the input (convergence offsets are applied as well).
269    //  There are three filter options (static option only for now):
270    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
271    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
272    //      and beam_max_sigma is low.
273    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
274    //      always uses a static sigma regardless of beam_max_sigma or
275    //      mask_num_triads_desired.
276    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
277    //  These options are more pronounced for the fast, unbloomed shader version.
278#ifndef RADEON_FIX
279    static const float bloom_approx_filter_static = 2.0;
280#else
281    static const float bloom_approx_filter_static = 1.0;
282#endif
283
284//  ELECTRON BEAM SCANLINE DISTRIBUTION:
285    //  How many scanlines should contribute light to each pixel?  Using more
286    //  scanlines is slower (especially for a generalized Gaussian) but less
287    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
288    //  max_beam_sigma at which the closest unused weight is guaranteed <
289    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
290    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
291    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
292    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
293    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
294    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
295    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
296    //  A generalized Gaussian beam varies shape with color too, now just width.
297    //  It's slower but more flexible (static option only for now).
298    static const bool beam_generalized_gaussian = true;
299    //  What kind of scanline antialiasing do you want?
300    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
301    //  Integrals are slow (especially for generalized Gaussians) and rarely any
302    //  better than 3x antialiasing (static option only for now).
303    static const float beam_antialias_level = 1.0;              //  range [0, 2]
304    //  Min/max standard deviations for scanline beams: Higher values widen and
305    //  soften scanlines.  Depending on other options, low min sigmas can alias.
306    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
307    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
308    //  Beam width varies as a function of color: A power function (0) is more
309    //  configurable, but a spherical function (1) gives the widest beam
310    //  variability without aliasing (static option only for now).
311    static const float beam_spot_shape_function = 0.0;
312    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
313    //  sharpness.  Powers >= 1.0 are awful unless mix/max sigmas are close.
314    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
315    //  Generalized Gaussian max shape parameters: Higher values give flatter
316    //  scanline plateaus and steeper dropoffs, simultaneously widening and
317    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
318    //  values > ~40.0 cause artifacts with integrals.
319    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
320    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
321    //  Generalized Gaussian shape power: Affects how quickly the distribution
322    //  changes shape from Gaussian to steep/plateaued as color increases from 0
323    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
324    //  appear sharper for most colors.
325    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
326    //  What filter should be used to sample scanlines horizontally?
327    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
328    static const float beam_horiz_filter_static = 0.0;
329    //  Standard deviation for horizontal Gaussian resampling:
330    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
331    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
332    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
333    //  limiting circuitry in some CRT's), or a weighted avg.?
334    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
335    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
336    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
337    //  later passes (static option only for now).
338    static const bool beam_misconvergence = true;
339    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
340    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
341    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
342    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
343    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
344    //  Detect interlacing (static option only for now)?
345    static const bool interlace_detect = true;
346    //  Assume 1080-line sources are interlaced?
347    static const bool interlace_1080i_static = false;
348    //  For interlaced sources, assume TFF (top-field first) or BFF order?
349    //  (Whether this matters depends on the nature of the interlaced input.)
350    static const bool interlace_bff_static = false;
351
352//  ANTIALIASING:
353    //  What AA level do you want for curvature/overscan/subpixels?  Options:
354    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
355    //  (Static option only for now)
356    static const float aa_level = 12.0;                     //  range [0, 24]
357    //  What antialiasing filter do you want (static option only)?  Options:
358    //  0: Box (separable), 1: Box (cylindrical),
359    //  2: Tent (separable), 3: Tent (cylindrical),
360    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
361    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
362    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
363    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
364    static const float aa_filter = 6.0;                     //  range [0, 9]
365    //  Flip the sample grid on odd/even frames (static option only for now)?
366    static const bool aa_temporal = false;
367    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
368    //  the blue offset is the negative r offset; range [0, 0.5]
369    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
370    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
371    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
372    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
373    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
374    //  4.) C = 0.0 is a soft spline filter.
375    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
376    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
377    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
378
379//  PHOSPHOR MASK:
380    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
381    static const float mask_type_static = 1.0;                  //  range [0, 2]
382    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
383    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
384    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
385    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
386    //      is halfway decent with LUT mipmapping but atrocious without it.
387    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
388    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
389    //      This mode reuses the same masks, so triads will be enormous unless
390    //      you change the mask LUT filenames in your .cgp file.
391    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
392    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
393    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
394    //  will always be used to calculate the full bloom sigma statically.
395    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
396    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
397    //  triads) will be rounded to the nearest integer tile size and clamped to
398    //  obey minimum size constraints (imposed to reduce downsize taps) and
399    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
400    //  To increase the size limit, double the viewport-relative scales for the
401    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
402    //      range [1, mask_texture_small_size/mask_triads_per_tile]
403    static const float mask_triad_size_desired_static = 24.0 / 8.0;
404    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
405    //  final size will be rounded and constrained as above); default 480.0
406    static const float mask_num_triads_desired_static = 480.0;
407    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
408    //  more samples and avoid moire a bit better, but some is unavoidable
409    //  depending on the destination size (static option for now).
410    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
411    //  The mask is resized using a variable number of taps in each dimension,
412    //  but some Cg profiles always fetch a constant number of taps no matter
413    //  what (no dynamic branching).  We can limit the maximum number of taps if
414    //  we statically limit the minimum phosphor triad size.  Larger values are
415    //  faster, but the limit IS enforced (static option only, forever);
416    //      range [1, mask_texture_small_size/mask_triads_per_tile]
417    //  TODO: Make this 1.0 and compensate with smarter sampling!
418    static const float mask_min_allowed_triad_size = 2.0;
419
420//  GEOMETRY:
421    //  Geometry mode:
422    //  0: Off (default), 1: Spherical mapping (like cgwg's),
423    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
424    static const float geom_mode_static = 0.0;      //  range [0, 3]
425    //  Radius of curvature: Measured in units of your viewport's diagonal size.
426    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
427    //  View dist is the distance from the player to their physical screen, in
428    //  units of the viewport's diagonal size.  It controls the field of view.
429    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
430    //  Tilt angle in radians (clockwise around up and right vectors):
431    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
432    //  Aspect ratio: When the true viewport size is unknown, this value is used
433    //  to help convert between the phosphor triad size and count, along with
434    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
435    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
436    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
437    //  default (256/224)*(54/47) = 1.313069909 (see below)
438    static const float geom_aspect_ratio_static = 1.313069909;
439    //  Before getting into overscan, here's some general aspect ratio info:
440    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
441    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
442    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
443    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
444    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
445    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
446    //  a.) Enable Retroarch's "Crop Overscan"
447    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
448    //  Real consoles use horizontal black padding in the signal, but emulators
449    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
450    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
451    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
452    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
453    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
454    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
455    //  without doing a. or b., but horizontal image borders will be tighter
456    //  than vertical ones, messing up curvature and overscan.  Fixing the
457    //  padding first corrects this.
458    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
459    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
460    //  above: Values < 1.0 zoom out; range (0, inf)
461    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
462    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
463    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
464    //  with strong curvature (static option only for now).
465    static const bool geom_force_correct_tangent_matrix = true;
466
467//  BORDERS:
468    //  Rounded border size in texture uv coords:
469    static const float border_size_static = 0.015;           //  range [0, 0.5]
470    //  Border darkness: Moderate values darken the border smoothly, and high
471    //  values make the image very dark just inside the border:
472    static const float border_darkness_static = 2.0;        //  range [0, inf)
473    //  Border compression: High numbers compress border transitions, narrowing
474    //  the dark border area.
475    static const float border_compress_static = 2.5;        //  range [1, inf)
476
477
478#endif  //  USER_SETTINGS_H
479
480////////////////////////////  END USER-SETTINGS  //////////////////////////
481
482//#include "derived-settings-and-constants.h"
483
484////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
485
486#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
487#define DERIVED_SETTINGS_AND_CONSTANTS_H
488
489/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
490
491//  crt-royale: A full-featured CRT shader, with cheese.
492//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
493//
494//  This program is free software; you can redistribute it and/or modify it
495//  under the terms of the GNU General Public License as published by the Free
496//  Software Foundation; either version 2 of the License, or any later version.
497//
498//  This program is distributed in the hope that it will be useful, but WITHOUT
499//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
500//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
501//  more details.
502//
503//  You should have received a copy of the GNU General Public License along with
504//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
505//  Place, Suite 330, Boston, MA 02111-1307 USA
506
507
508/////////////////////////////////  DESCRIPTION  ////////////////////////////////
509
510//  These macros and constants can be used across the whole codebase.
511//  Unlike the values in user-settings.cgh, end users shouldn't modify these.
512
513
514///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
515
516//#include "../user-settings.h"
517
518/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
519
520#ifndef USER_SETTINGS_H
521#define USER_SETTINGS_H
522
523/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
524
525//  The Cg compiler uses different "profiles" with different capabilities.
526//  This shader requires a Cg compilation profile >= arbfp1, but a few options
527//  require higher profiles like fp30 or fp40.  The shader can't detect profile
528//  or driver capabilities, so instead you must comment or uncomment the lines
529//  below with "//" before "#define."  Disable an option if you get compilation
530//  errors resembling those listed.  Generally speaking, all of these options
531//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
532//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
533
534//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
535//  Among other things, derivatives help us fix anisotropic filtering artifacts
536//  with curved manually tiled phosphor mask coords.  Related errors:
537//  error C3004: function "float2 ddx(float2);" not supported in this profile
538//  error C3004: function "float2 ddy(float2);" not supported in this profile
539    //#define DRIVERS_ALLOW_DERIVATIVES
540
541//  Fine derivatives: Unsupported on older ATI cards.
542//  Fine derivatives enable 2x2 fragment block communication, letting us perform
543//  fast single-pass blur operations.  If your card uses coarse derivatives and
544//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
545    #ifdef DRIVERS_ALLOW_DERIVATIVES
546        #define DRIVERS_ALLOW_FINE_DERIVATIVES
547    #endif
548
549//  Dynamic looping: Requires an fp30 or newer profile.
550//  This makes phosphor mask resampling faster in some cases.  Related errors:
551//  error C5013: profile does not support "for" statements and "for" could not
552//  be unrolled
553    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
554
555//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
556//  Using one static loop avoids overhead if the user is right, but if the user
557//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
558//  binary search can potentially save some iterations.  However, it may fail:
559//  error C6001: Temporary register limit of 32 exceeded; 35 registers
560//  needed to compile program
561    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
562
563//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
564//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
565//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
566//  this profile
567    //#define DRIVERS_ALLOW_TEX2DLOD
568
569//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
570//  artifacts from anisotropic filtering and mipmapping.  Related errors:
571//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
572//  in this profile
573    //#define DRIVERS_ALLOW_TEX2DBIAS
574
575//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
576//  impose stricter limitations on register counts and instructions.  Enable
577//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
578//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
579//  to compile program.
580//  Enabling integrated graphics compatibility mode will automatically disable:
581//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
582//      (This may be reenabled in a later release.)
583//  2.) RUNTIME_GEOMETRY_MODE
584//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
585    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
586
587
588////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
589
590//  To disable a #define option, turn its line into a comment with "//."
591
592//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
593//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
594//  many of the options in this file and allow real-time tuning, but many of
595//  them are slower.  Disabling them and using this text file will boost FPS.
596#define RUNTIME_SHADER_PARAMS_ENABLE
597//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
598//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
599#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
600//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
601#define RUNTIME_ANTIALIAS_WEIGHTS
602//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
603//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
604//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
605//  parameters?  This will require more math or dynamic branching.
606#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
607//  Specify the tilt at runtime?  This makes things about 3% slower.
608#define RUNTIME_GEOMETRY_TILT
609//  Specify the geometry mode at runtime?
610#define RUNTIME_GEOMETRY_MODE
611//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
612//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
613//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
614#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
615
616//  PHOSPHOR MASK:
617//  Manually resize the phosphor mask for best results (slower)?  Disabling this
618//  removes the option to do so, but it may be faster without dynamic branches.
619    #define PHOSPHOR_MASK_MANUALLY_RESIZE
620//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
621    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
622//  Larger blurs are expensive, but we need them to blur larger triads.  We can
623//  detect the right blur if the triad size is static or our profile allows
624//  dynamic branches, but otherwise we use the largest blur the user indicates
625//  they might need:
626    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
627    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
628    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
629    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
630    //  Here's a helpful chart:
631    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
632    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
633    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
634    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
635    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
636    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
637
638
639///////////////////////////////  USER PARAMETERS  //////////////////////////////
640
641//  Note: Many of these static parameters are overridden by runtime shader
642//  parameters when those are enabled.  However, many others are static codepath
643//  options that were cleaner or more convert to code as static constants.
644
645//  GAMMA:
646    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
647    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
648
649//  LEVELS MANAGEMENT:
650    //  Control the final multiplicative image contrast:
651    static const float levels_contrast_static = 1.0;            //  range [0, 4)
652    //  We auto-dim to avoid clipping between passes and restore brightness
653    //  later.  Control the dim factor here: Lower values clip less but crush
654    //  blacks more (static only for now).
655    static const float levels_autodim_temp = 0.5;               //  range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0
656
657//  HALATION/DIFFUSION/BLOOM:
658    //  Halation weight: How much energy should be lost to electrons bounding
659    //  around under the CRT glass and exciting random phosphors?
660    static const float halation_weight_static = 0.0;            //  range [0, 1]
661    //  Refractive diffusion weight: How much light should spread/diffuse from
662    //  refracting through the CRT glass?
663    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
664    //  Underestimate brightness: Bright areas bloom more, but we can base the
665    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
666    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
667    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
668    //  Blur all colors more than necessary for a softer phosphor bloom?
669    static const float bloom_excess_static = 0.0;               //  range [0, 1]
670    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
671    //  blurred resize of the input (convergence offsets are applied as well).
672    //  There are three filter options (static option only for now):
673    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
674    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
675    //      and beam_max_sigma is low.
676    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
677    //      always uses a static sigma regardless of beam_max_sigma or
678    //      mask_num_triads_desired.
679    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
680    //  These options are more pronounced for the fast, unbloomed shader version.
681#ifndef RADEON_FIX
682    static const float bloom_approx_filter_static = 2.0;
683#else
684    static const float bloom_approx_filter_static = 1.0;
685#endif
686
687//  ELECTRON BEAM SCANLINE DISTRIBUTION:
688    //  How many scanlines should contribute light to each pixel?  Using more
689    //  scanlines is slower (especially for a generalized Gaussian) but less
690    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
691    //  max_beam_sigma at which the closest unused weight is guaranteed <
692    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
693    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
694    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
695    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
696    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
697    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
698    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
699    //  A generalized Gaussian beam varies shape with color too, now just width.
700    //  It's slower but more flexible (static option only for now).
701    static const bool beam_generalized_gaussian = true;
702    //  What kind of scanline antialiasing do you want?
703    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
704    //  Integrals are slow (especially for generalized Gaussians) and rarely any
705    //  better than 3x antialiasing (static option only for now).
706    static const float beam_antialias_level = 1.0;              //  range [0, 2]
707    //  Min/max standard deviations for scanline beams: Higher values widen and
708    //  soften scanlines.  Depending on other options, low min sigmas can alias.
709    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
710    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
711    //  Beam width varies as a function of color: A power function (0) is more
712    //  configurable, but a spherical function (1) gives the widest beam
713    //  variability without aliasing (static option only for now).
714    static const float beam_spot_shape_function = 0.0;
715    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
716    //  sharpness.  Powers >= 1.0 are awful unless mix/max sigmas are close.
717    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
718    //  Generalized Gaussian max shape parameters: Higher values give flatter
719    //  scanline plateaus and steeper dropoffs, simultaneously widening and
720    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
721    //  values > ~40.0 cause artifacts with integrals.
722    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
723    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
724    //  Generalized Gaussian shape power: Affects how quickly the distribution
725    //  changes shape from Gaussian to steep/plateaued as color increases from 0
726    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
727    //  appear sharper for most colors.
728    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
729    //  What filter should be used to sample scanlines horizontally?
730    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
731    static const float beam_horiz_filter_static = 0.0;
732    //  Standard deviation for horizontal Gaussian resampling:
733    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
734    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
735    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
736    //  limiting circuitry in some CRT's), or a weighted avg.?
737    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
738    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
739    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
740    //  later passes (static option only for now).
741    static const bool beam_misconvergence = true;
742    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
743    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
744    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
745    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
746    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
747    //  Detect interlacing (static option only for now)?
748    static const bool interlace_detect = true;
749    //  Assume 1080-line sources are interlaced?
750    static const bool interlace_1080i_static = false;
751    //  For interlaced sources, assume TFF (top-field first) or BFF order?
752    //  (Whether this matters depends on the nature of the interlaced input.)
753    static const bool interlace_bff_static = false;
754
755//  ANTIALIASING:
756    //  What AA level do you want for curvature/overscan/subpixels?  Options:
757    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
758    //  (Static option only for now)
759    static const float aa_level = 12.0;                     //  range [0, 24]
760    //  What antialiasing filter do you want (static option only)?  Options:
761    //  0: Box (separable), 1: Box (cylindrical),
762    //  2: Tent (separable), 3: Tent (cylindrical),
763    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
764    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
765    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
766    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
767    static const float aa_filter = 6.0;                     //  range [0, 9]
768    //  Flip the sample grid on odd/even frames (static option only for now)?
769    static const bool aa_temporal = false;
770    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
771    //  the blue offset is the negative r offset; range [0, 0.5]
772    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
773    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
774    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
775    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
776    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
777    //  4.) C = 0.0 is a soft spline filter.
778    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
779    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
780    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
781
782//  PHOSPHOR MASK:
783    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
784    static const float mask_type_static = 1.0;                  //  range [0, 2]
785    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
786    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
787    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
788    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
789    //      is halfway decent with LUT mipmapping but atrocious without it.
790    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
791    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
792    //      This mode reuses the same masks, so triads will be enormous unless
793    //      you change the mask LUT filenames in your .cgp file.
794    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
795    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
796    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
797    //  will always be used to calculate the full bloom sigma statically.
798    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
799    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
800    //  triads) will be rounded to the nearest integer tile size and clamped to
801    //  obey minimum size constraints (imposed to reduce downsize taps) and
802    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
803    //  To increase the size limit, double the viewport-relative scales for the
804    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
805    //      range [1, mask_texture_small_size/mask_triads_per_tile]
806    static const float mask_triad_size_desired_static = 24.0 / 8.0;
807    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
808    //  final size will be rounded and constrained as above); default 480.0
809    static const float mask_num_triads_desired_static = 480.0;
810    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
811    //  more samples and avoid moire a bit better, but some is unavoidable
812    //  depending on the destination size (static option for now).
813    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
814    //  The mask is resized using a variable number of taps in each dimension,
815    //  but some Cg profiles always fetch a constant number of taps no matter
816    //  what (no dynamic branching).  We can limit the maximum number of taps if
817    //  we statically limit the minimum phosphor triad size.  Larger values are
818    //  faster, but the limit IS enforced (static option only, forever);
819    //      range [1, mask_texture_small_size/mask_triads_per_tile]
820    //  TODO: Make this 1.0 and compensate with smarter sampling!
821    static const float mask_min_allowed_triad_size = 2.0;
822
823//  GEOMETRY:
824    //  Geometry mode:
825    //  0: Off (default), 1: Spherical mapping (like cgwg's),
826    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
827    static const float geom_mode_static = 0.0;      //  range [0, 3]
828    //  Radius of curvature: Measured in units of your viewport's diagonal size.
829    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
830    //  View dist is the distance from the player to their physical screen, in
831    //  units of the viewport's diagonal size.  It controls the field of view.
832    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
833    //  Tilt angle in radians (clockwise around up and right vectors):
834    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
835    //  Aspect ratio: When the true viewport size is unknown, this value is used
836    //  to help convert between the phosphor triad size and count, along with
837    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
838    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
839    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
840    //  default (256/224)*(54/47) = 1.313069909 (see below)
841    static const float geom_aspect_ratio_static = 1.313069909;
842    //  Before getting into overscan, here's some general aspect ratio info:
843    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
844    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
845    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
846    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
847    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
848    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
849    //  a.) Enable Retroarch's "Crop Overscan"
850    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
851    //  Real consoles use horizontal black padding in the signal, but emulators
852    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
853    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
854    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
855    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
856    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
857    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
858    //  without doing a. or b., but horizontal image borders will be tighter
859    //  than vertical ones, messing up curvature and overscan.  Fixing the
860    //  padding first corrects this.
861    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
862    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
863    //  above: Values < 1.0 zoom out; range (0, inf)
864    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
865    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
866    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
867    //  with strong curvature (static option only for now).
868    static const bool geom_force_correct_tangent_matrix = true;
869
870//  BORDERS:
871    //  Rounded border size in texture uv coords:
872    static const float border_size_static = 0.015;           //  range [0, 0.5]
873    //  Border darkness: Moderate values darken the border smoothly, and high
874    //  values make the image very dark just inside the border:
875    static const float border_darkness_static = 2.0;        //  range [0, inf)
876    //  Border compression: High numbers compress border transitions, narrowing
877    //  the dark border area.
878    static const float border_compress_static = 2.5;        //  range [1, inf)
879
880
881#endif  //  USER_SETTINGS_H
882
883/////////////////////////////   END USER-SETTINGS   ////////////////////////////
884
885//#include "user-cgp-constants.h"
886
887/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
888
889#ifndef USER_CGP_CONSTANTS_H
890#define USER_CGP_CONSTANTS_H
891
892//  IMPORTANT:
893//  These constants MUST be set appropriately for the settings in crt-royale.cgp
894//  (or whatever related .cgp file you're using).  If they aren't, you're likely
895//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
896//  set directly in the .cgp file to make things easier, but...they can't.
897
898//  PASS SCALES AND RELATED CONSTANTS:
899//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
900//  this shader: One does a viewport-scale bloom, and the other skips it.  The
901//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
902static const float bloom_approx_size_x = 320.0;
903static const float bloom_approx_size_x_for_fake = 400.0;
904//  Copy the viewport-relative scales of the phosphor mask resize passes
905//  (MASK_RESIZE and the pass immediately preceding it):
906static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
907//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
908static const float geom_max_aspect_ratio = 4.0/3.0;
909
910//  PHOSPHOR MASK TEXTURE CONSTANTS:
911//  Set the following constants to reflect the properties of the phosphor mask
912//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
913//  based on user settings, then repeats a single tile until filling the screen.
914//  The shader must know the input texture size (default 64x64), and to manually
915//  resize, it must also know the horizontal triads per tile (default 8).
916static const float2 mask_texture_small_size = float2(64.0, 64.0);
917static const float2 mask_texture_large_size = float2(512.0, 512.0);
918static const float mask_triads_per_tile = 8.0;
919//  We need the average brightness of the phosphor mask to compensate for the
920//  dimming it causes.  The following four values are roughly correct for the
921//  masks included with the shader.  Update the value for any LUT texture you
922//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
923//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
924//#define PHOSPHOR_MASK_GRILLE14
925static const float mask_grille14_avg_color = 50.6666666/255.0;
926    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
927    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
928static const float mask_grille15_avg_color = 53.0/255.0;
929    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
930    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
931static const float mask_slot_avg_color = 46.0/255.0;
932    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
933    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
934static const float mask_shadow_avg_color = 41.0/255.0;
935    //  TileableLinearShadowMask*.png
936    //  TileableLinearShadowMaskEDP*.png
937
938#ifdef PHOSPHOR_MASK_GRILLE14
939    static const float mask_grille_avg_color = mask_grille14_avg_color;
940#else
941    static const float mask_grille_avg_color = mask_grille15_avg_color;
942#endif
943
944
945#endif  //  USER_CGP_CONSTANTS_H
946
947//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
948
949////////////////////////////////  END INCLUDES  ////////////////////////////////
950
951///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
952
953//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
954#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
955
956//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
957#ifndef SIMULATE_CRT_ON_LCD
958    #define SIMULATE_CRT_ON_LCD
959#endif
960
961//  Manually tiling a manually resized texture creates texture coord derivative
962//  discontinuities and confuses anisotropic filtering, causing discolored tile
963//  seams in the phosphor mask.  Workarounds:
964//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
965//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
966//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
967//  b.) "Tile flat twice" requires drawing two full tiles without border padding
968//      to the resized mask FBO, and it's incompatible with same-pass curvature.
969//      (Same-pass curvature isn't used but could be in the future...maybe.)
970//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
971//      border padding to the resized mask FBO, but it works with same-pass
972//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
973//  Precedence: a, then, b, then c (if multiple strategies are #defined).
974    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
975    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
976    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
977//  Also, manually resampling the phosphor mask is slightly blurrier with
978//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
979//  creates artifacts, but only with the fully bloomed shader.)  The difference
980//  is subtle with small triads, but you can fix it for a small cost.
981    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
982
983
984//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
985
986//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
987//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
988//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
989//  #defined by either user-settings.h or a wrapper .cg that #includes the
990//  current .cg pass.)
991#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
992    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
993        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
994    #endif
995    #ifdef RUNTIME_GEOMETRY_MODE
996        #undef RUNTIME_GEOMETRY_MODE
997    #endif
998    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
999    //  inferior in most cases, so replace 2.0 with 0.0:
1000    static const float bloom_approx_filter =
1001        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
1002#else
1003    static const float bloom_approx_filter = bloom_approx_filter_static;
1004#endif
1005
1006//  Disable slow runtime paths if static parameters are used.  Most of these
1007//  won't be a problem anyway once the params are disabled, but some will.
1008#ifndef RUNTIME_SHADER_PARAMS_ENABLE
1009    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
1010        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
1011    #endif
1012    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
1013        #undef RUNTIME_ANTIALIAS_WEIGHTS
1014    #endif
1015    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
1016        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
1017    #endif
1018    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
1019        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
1020    #endif
1021    #ifdef RUNTIME_GEOMETRY_TILT
1022        #undef RUNTIME_GEOMETRY_TILT
1023    #endif
1024    #ifdef RUNTIME_GEOMETRY_MODE
1025        #undef RUNTIME_GEOMETRY_MODE
1026    #endif
1027    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
1028        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
1029    #endif
1030#endif
1031
1032//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
1033#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
1034    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
1035#endif
1036#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
1037    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
1038#endif
1039//  Rule out unavailable anisotropic compatibility strategies:
1040#ifndef DRIVERS_ALLOW_DERIVATIVES
1041    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
1042        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
1043    #endif
1044#endif
1045#ifndef DRIVERS_ALLOW_TEX2DLOD
1046    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
1047        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
1048    #endif
1049    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
1050        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
1051    #endif
1052    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
1053        #undef ANTIALIAS_DISABLE_ANISOTROPIC
1054    #endif
1055#endif
1056#ifndef DRIVERS_ALLOW_TEX2DBIAS
1057    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
1058        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
1059    #endif
1060    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
1061        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
1062    #endif
1063#endif
1064//  Prioritize anisotropic tiling compatibility strategies by performance and
1065//  disable unused strategies.  This concentrates all the nesting in one place.
1066#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
1067    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
1068        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
1069    #endif
1070    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
1071        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
1072    #endif
1073    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
1074        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
1075    #endif
1076#else
1077    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
1078        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
1079            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
1080        #endif
1081        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
1082            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
1083        #endif
1084    #else
1085        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
1086        //  flat texture coords in the same pass, but that's all we use.
1087        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
1088            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
1089                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
1090            #endif
1091        #endif
1092    #endif
1093#endif
1094//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
1095//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
1096#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
1097    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
1098#endif
1099#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
1100    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
1101#endif
1102//  Prioritize anisotropic resampling compatibility strategies the same way:
1103#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
1104    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
1105        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
1106    #endif
1107#endif
1108
1109
1110///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
1111
1112//  If we can use the large mipmapped LUT without mipmapping artifacts, we
1113//  should: It gives us more options for using fewer samples.
1114#ifdef DRIVERS_ALLOW_TEX2DLOD
1115    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
1116        //  TODO: Take advantage of this!
1117        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
1118        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
1119    #else
1120        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
1121    #endif
1122#else
1123    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
1124#endif
1125
1126
1127//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
1128//  main_fragment, or a static alias of one of the above.  This makes it hard
1129//  to select the phosphor mask at runtime: We can't even assign to a uniform
1130//  global in the vertex shader or select a sampler2D in the vertex shader and
1131//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
1132//  because it just gives us the input texture or a black screen.  However, we
1133//  can get around these limitations by calling tex2D three times with different
1134//  uniform samplers (or resizing the phosphor mask three times altogether).
1135//  With dynamic branches, we can process only one of these branches on top of
1136//  quickly discarding fragments we don't need (cgc seems able to overcome
1137//  limigations around dependent texture fetches inside of branches).  Without
1138//  dynamic branches, we have to process every branch for every fragment...which
1139//  is slower.  Runtime sampling mode selection is slower without dynamic
1140//  branches as well.  Let the user's static #defines decide if it's worth it.
1141#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
1142    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
1143#else
1144    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
1145        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
1146    #endif
1147#endif
1148
1149//  We need to render some minimum number of tiles in the resize passes.
1150//  We need at least 1.0 just to repeat a single tile, and we need extra
1151//  padding beyond that for anisotropic filtering, discontinuitity fixing,
1152//  antialiasing, same-pass curvature (not currently used), etc.  First
1153//  determine how many border texels and tiles we need, based on how the result
1154//  will be sampled:
1155#ifdef GEOMETRY_EARLY
1156        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
1157        //  Most antialiasing filters have a base radius of 4.0 pixels:
1158        static const float max_aa_base_pixel_border = 4.0 +
1159            max_subpixel_offset;
1160#else
1161    static const float max_aa_base_pixel_border = 0.0;
1162#endif
1163//  Anisotropic filtering adds about 0.5 to the pixel border:
1164#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
1165    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
1166#else
1167    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
1168#endif
1169//  Fixing discontinuities adds 1.0 more to the pixel border:
1170#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
1171    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
1172#else
1173    static const float max_tiled_pixel_border = max_aniso_pixel_border;
1174#endif
1175//  Convert the pixel border to an integer texel border.  Assume same-pass
1176//  curvature about triples the texel frequency:
1177#ifdef GEOMETRY_EARLY
1178    static const float max_mask_texel_border =
1179        ceil(max_tiled_pixel_border * 3.0);
1180#else
1181    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
1182#endif
1183//  Convert the texel border to a tile border using worst-case assumptions:
1184static const float max_mask_tile_border = max_mask_texel_border/
1185    (mask_min_allowed_triad_size * mask_triads_per_tile);
1186
1187//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
1188//  the starting texel (inside borders) for sampling it.
1189#ifndef GEOMETRY_EARLY
1190    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
1191        //  Special case: Render two tiles without borders.  Anisotropic
1192        //  filtering doesn't seem to be a problem here.
1193        static const float mask_resize_num_tiles = 1.0 + 1.0;
1194        static const float mask_start_texels = 0.0;
1195    #else
1196        static const float mask_resize_num_tiles = 1.0 +
1197            2.0 * max_mask_tile_border;
1198        static const float mask_start_texels = max_mask_texel_border;
1199    #endif
1200#else
1201    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
1202    static const float mask_start_texels = max_mask_texel_border;
1203#endif
1204
1205//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
1206//  mask_resize_viewport_scale.  This limits the maximum final triad size.
1207//  Estimate the minimum number of triads we can split the screen into in each
1208//  dimension (we'll be as correct as mask_resize_viewport_scale is):
1209static const float mask_resize_num_triads =
1210    mask_resize_num_tiles * mask_triads_per_tile;
1211static const float2 min_allowed_viewport_triads =
1212    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
1213
1214
1215////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
1216
1217static const float pi = 3.141592653589;
1218//  We often want to find the location of the previous texel, e.g.:
1219//      const float2 curr_texel = uv * texture_size;
1220//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
1221//      const float2 prev_texel_uv = prev_texel / texture_size;
1222//  However, many GPU drivers round incorrectly around exact texel locations.
1223//  We need to subtract a little less than 0.5 before flooring, and some GPU's
1224//  require this value to be farther from 0.5 than others; define it here.
1225//      const float2 prev_texel =
1226//          floor(curr_texel - float2(under_half)) + float2(0.5);
1227static const float under_half = 0.4995;
1228
1229
1230#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
1231
1232/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
1233
1234//#include "bind-shader-h"
1235
1236/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
1237
1238#ifndef BIND_SHADER_PARAMS_H
1239#define BIND_SHADER_PARAMS_H
1240
1241/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
1242
1243//  crt-royale: A full-featured CRT shader, with cheese.
1244//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
1245//
1246//  This program is free software; you can redistribute it and/or modify it
1247//  under the terms of the GNU General Public License as published by the Free
1248//  Software Foundation; either version 2 of the License, or any later version.
1249//
1250//  This program is distributed in the hope that it will be useful, but WITHOUT
1251//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
1252//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
1253//  more details.
1254//
1255//  You should have received a copy of the GNU General Public License along with
1256//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
1257//  Place, Suite 330, Boston, MA 02111-1307 USA
1258
1259
1260/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
1261
1262///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
1263
1264//#include "../user-settings.h"
1265
1266/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
1267
1268#ifndef USER_SETTINGS_H
1269#define USER_SETTINGS_H
1270
1271/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
1272
1273//  The Cg compiler uses different "profiles" with different capabilities.
1274//  This shader requires a Cg compilation profile >= arbfp1, but a few options
1275//  require higher profiles like fp30 or fp40.  The shader can't detect profile
1276//  or driver capabilities, so instead you must comment or uncomment the lines
1277//  below with "//" before "#define."  Disable an option if you get compilation
1278//  errors resembling those listed.  Generally speaking, all of these options
1279//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
1280//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
1281
1282//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
1283//  Among other things, derivatives help us fix anisotropic filtering artifacts
1284//  with curved manually tiled phosphor mask coords.  Related errors:
1285//  error C3004: function "float2 ddx(float2);" not supported in this profile
1286//  error C3004: function "float2 ddy(float2);" not supported in this profile
1287    //#define DRIVERS_ALLOW_DERIVATIVES
1288
1289//  Fine derivatives: Unsupported on older ATI cards.
1290//  Fine derivatives enable 2x2 fragment block communication, letting us perform
1291//  fast single-pass blur operations.  If your card uses coarse derivatives and
1292//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
1293    #ifdef DRIVERS_ALLOW_DERIVATIVES
1294        #define DRIVERS_ALLOW_FINE_DERIVATIVES
1295    #endif
1296
1297//  Dynamic looping: Requires an fp30 or newer profile.
1298//  This makes phosphor mask resampling faster in some cases.  Related errors:
1299//  error C5013: profile does not support "for" statements and "for" could not
1300//  be unrolled
1301    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
1302
1303//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
1304//  Using one static loop avoids overhead if the user is right, but if the user
1305//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
1306//  binary search can potentially save some iterations.  However, it may fail:
1307//  error C6001: Temporary register limit of 32 exceeded; 35 registers
1308//  needed to compile program
1309    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
1310
1311//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
1312//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
1313//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
1314//  this profile
1315    //#define DRIVERS_ALLOW_TEX2DLOD
1316
1317//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
1318//  artifacts from anisotropic filtering and mipmapping.  Related errors:
1319//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
1320//  in this profile
1321    //#define DRIVERS_ALLOW_TEX2DBIAS
1322
1323//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
1324//  impose stricter limitations on register counts and instructions.  Enable
1325//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
1326//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
1327//  to compile program.
1328//  Enabling integrated graphics compatibility mode will automatically disable:
1329//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
1330//      (This may be reenabled in a later release.)
1331//  2.) RUNTIME_GEOMETRY_MODE
1332//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
1333    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
1334
1335
1336////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
1337
1338//  To disable a #define option, turn its line into a comment with "//."
1339
1340//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
1341//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
1342//  many of the options in this file and allow real-time tuning, but many of
1343//  them are slower.  Disabling them and using this text file will boost FPS.
1344#define RUNTIME_SHADER_PARAMS_ENABLE
1345//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
1346//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
1347#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
1348//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
1349#define RUNTIME_ANTIALIAS_WEIGHTS
1350//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
1351//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
1352//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
1353//  parameters?  This will require more math or dynamic branching.
1354#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
1355//  Specify the tilt at runtime?  This makes things about 3% slower.
1356#define RUNTIME_GEOMETRY_TILT
1357//  Specify the geometry mode at runtime?
1358#define RUNTIME_GEOMETRY_MODE
1359//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
1360//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
1361//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
1362#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
1363
1364//  PHOSPHOR MASK:
1365//  Manually resize the phosphor mask for best results (slower)?  Disabling this
1366//  removes the option to do so, but it may be faster without dynamic branches.
1367    #define PHOSPHOR_MASK_MANUALLY_RESIZE
1368//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
1369    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
1370//  Larger blurs are expensive, but we need them to blur larger triads.  We can
1371//  detect the right blur if the triad size is static or our profile allows
1372//  dynamic branches, but otherwise we use the largest blur the user indicates
1373//  they might need:
1374    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
1375    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
1376    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
1377    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
1378    //  Here's a helpful chart:
1379    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
1380    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
1381    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
1382    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
1383    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
1384    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
1385
1386
1387///////////////////////////////  USER PARAMETERS  //////////////////////////////
1388
1389//  Note: Many of these static parameters are overridden by runtime shader
1390//  parameters when those are enabled.  However, many others are static codepath
1391//  options that were cleaner or more convert to code as static constants.
1392
1393//  GAMMA:
1394    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
1395    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
1396
1397//  LEVELS MANAGEMENT:
1398    //  Control the final multiplicative image contrast:
1399    static const float levels_contrast_static = 1.0;            //  range [0, 4)
1400    //  We auto-dim to avoid clipping between passes and restore brightness
1401    //  later.  Control the dim factor here: Lower values clip less but crush
1402    //  blacks more (static only for now).
1403    static const float levels_autodim_temp = 0.5;               //  range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0
1404
1405//  HALATION/DIFFUSION/BLOOM:
1406    //  Halation weight: How much energy should be lost to electrons bounding
1407    //  around under the CRT glass and exciting random phosphors?
1408    static const float halation_weight_static = 0.0;            //  range [0, 1]
1409    //  Refractive diffusion weight: How much light should spread/diffuse from
1410    //  refracting through the CRT glass?
1411    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
1412    //  Underestimate brightness: Bright areas bloom more, but we can base the
1413    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
1414    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
1415    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
1416    //  Blur all colors more than necessary for a softer phosphor bloom?
1417    static const float bloom_excess_static = 0.0;               //  range [0, 1]
1418    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
1419    //  blurred resize of the input (convergence offsets are applied as well).
1420    //  There are three filter options (static option only for now):
1421    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
1422    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
1423    //      and beam_max_sigma is low.
1424    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
1425    //      always uses a static sigma regardless of beam_max_sigma or
1426    //      mask_num_triads_desired.
1427    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
1428    //  These options are more pronounced for the fast, unbloomed shader version.
1429#ifndef RADEON_FIX
1430    static const float bloom_approx_filter_static = 2.0;
1431#else
1432    static const float bloom_approx_filter_static = 1.0;
1433#endif
1434
1435//  ELECTRON BEAM SCANLINE DISTRIBUTION:
1436    //  How many scanlines should contribute light to each pixel?  Using more
1437    //  scanlines is slower (especially for a generalized Gaussian) but less
1438    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
1439    //  max_beam_sigma at which the closest unused weight is guaranteed <
1440    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
1441    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
1442    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
1443    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
1444    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
1445    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
1446    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
1447    //  A generalized Gaussian beam varies shape with color too, now just width.
1448    //  It's slower but more flexible (static option only for now).
1449    static const bool beam_generalized_gaussian = true;
1450    //  What kind of scanline antialiasing do you want?
1451    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
1452    //  Integrals are slow (especially for generalized Gaussians) and rarely any
1453    //  better than 3x antialiasing (static option only for now).
1454    static const float beam_antialias_level = 1.0;              //  range [0, 2]
1455    //  Min/max standard deviations for scanline beams: Higher values widen and
1456    //  soften scanlines.  Depending on other options, low min sigmas can alias.
1457    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
1458    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
1459    //  Beam width varies as a function of color: A power function (0) is more
1460    //  configurable, but a spherical function (1) gives the widest beam
1461    //  variability without aliasing (static option only for now).
1462    static const float beam_spot_shape_function = 0.0;
1463    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
1464    //  sharpness.  Powers >= 1.0 are awful unless mix/max sigmas are close.
1465    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
1466    //  Generalized Gaussian max shape parameters: Higher values give flatter
1467    //  scanline plateaus and steeper dropoffs, simultaneously widening and
1468    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
1469    //  values > ~40.0 cause artifacts with integrals.
1470    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
1471    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
1472    //  Generalized Gaussian shape power: Affects how quickly the distribution
1473    //  changes shape from Gaussian to steep/plateaued as color increases from 0
1474    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
1475    //  appear sharper for most colors.
1476    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
1477    //  What filter should be used to sample scanlines horizontally?
1478    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
1479    static const float beam_horiz_filter_static = 0.0;
1480    //  Standard deviation for horizontal Gaussian resampling:
1481    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
1482    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
1483    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
1484    //  limiting circuitry in some CRT's), or a weighted avg.?
1485    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
1486    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
1487    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
1488    //  later passes (static option only for now).
1489    static const bool beam_misconvergence = true;
1490    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
1491    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
1492    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
1493    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
1494    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
1495    //  Detect interlacing (static option only for now)?
1496    static const bool interlace_detect = true;
1497    //  Assume 1080-line sources are interlaced?
1498    static const bool interlace_1080i_static = false;
1499    //  For interlaced sources, assume TFF (top-field first) or BFF order?
1500    //  (Whether this matters depends on the nature of the interlaced input.)
1501    static const bool interlace_bff_static = false;
1502
1503//  ANTIALIASING:
1504    //  What AA level do you want for curvature/overscan/subpixels?  Options:
1505    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
1506    //  (Static option only for now)
1507    static const float aa_level = 12.0;                     //  range [0, 24]
1508    //  What antialiasing filter do you want (static option only)?  Options:
1509    //  0: Box (separable), 1: Box (cylindrical),
1510    //  2: Tent (separable), 3: Tent (cylindrical),
1511    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
1512    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
1513    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
1514    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
1515    static const float aa_filter = 6.0;                     //  range [0, 9]
1516    //  Flip the sample grid on odd/even frames (static option only for now)?
1517    static const bool aa_temporal = false;
1518    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
1519    //  the blue offset is the negative r offset; range [0, 0.5]
1520    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
1521    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
1522    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
1523    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
1524    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
1525    //  4.) C = 0.0 is a soft spline filter.
1526    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
1527    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
1528    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
1529
1530//  PHOSPHOR MASK:
1531    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
1532    static const float mask_type_static = 1.0;                  //  range [0, 2]
1533    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
1534    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
1535    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
1536    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
1537    //      is halfway decent with LUT mipmapping but atrocious without it.
1538    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
1539    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
1540    //      This mode reuses the same masks, so triads will be enormous unless
1541    //      you change the mask LUT filenames in your .cgp file.
1542    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
1543    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
1544    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
1545    //  will always be used to calculate the full bloom sigma statically.
1546    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
1547    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
1548    //  triads) will be rounded to the nearest integer tile size and clamped to
1549    //  obey minimum size constraints (imposed to reduce downsize taps) and
1550    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
1551    //  To increase the size limit, double the viewport-relative scales for the
1552    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
1553    //      range [1, mask_texture_small_size/mask_triads_per_tile]
1554    static const float mask_triad_size_desired_static = 24.0 / 8.0;
1555    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
1556    //  final size will be rounded and constrained as above); default 480.0
1557    static const float mask_num_triads_desired_static = 480.0;
1558    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
1559    //  more samples and avoid moire a bit better, but some is unavoidable
1560    //  depending on the destination size (static option for now).
1561    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
1562    //  The mask is resized using a variable number of taps in each dimension,
1563    //  but some Cg profiles always fetch a constant number of taps no matter
1564    //  what (no dynamic branching).  We can limit the maximum number of taps if
1565    //  we statically limit the minimum phosphor triad size.  Larger values are
1566    //  faster, but the limit IS enforced (static option only, forever);
1567    //      range [1, mask_texture_small_size/mask_triads_per_tile]
1568    //  TODO: Make this 1.0 and compensate with smarter sampling!
1569    static const float mask_min_allowed_triad_size = 2.0;
1570
1571//  GEOMETRY:
1572    //  Geometry mode:
1573    //  0: Off (default), 1: Spherical mapping (like cgwg's),
1574    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
1575    static const float geom_mode_static = 0.0;      //  range [0, 3]
1576    //  Radius of curvature: Measured in units of your viewport's diagonal size.
1577    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
1578    //  View dist is the distance from the player to their physical screen, in
1579    //  units of the viewport's diagonal size.  It controls the field of view.
1580    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
1581    //  Tilt angle in radians (clockwise around up and right vectors):
1582    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
1583    //  Aspect ratio: When the true viewport size is unknown, this value is used
1584    //  to help convert between the phosphor triad size and count, along with
1585    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
1586    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
1587    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
1588    //  default (256/224)*(54/47) = 1.313069909 (see below)
1589    static const float geom_aspect_ratio_static = 1.313069909;
1590    //  Before getting into overscan, here's some general aspect ratio info:
1591    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
1592    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
1593    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
1594    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
1595    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
1596    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
1597    //  a.) Enable Retroarch's "Crop Overscan"
1598    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
1599    //  Real consoles use horizontal black padding in the signal, but emulators
1600    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
1601    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
1602    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
1603    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
1604    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
1605    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
1606    //  without doing a. or b., but horizontal image borders will be tighter
1607    //  than vertical ones, messing up curvature and overscan.  Fixing the
1608    //  padding first corrects this.
1609    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
1610    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
1611    //  above: Values < 1.0 zoom out; range (0, inf)
1612    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
1613    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
1614    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
1615    //  with strong curvature (static option only for now).
1616    static const bool geom_force_correct_tangent_matrix = true;
1617
1618//  BORDERS:
1619    //  Rounded border size in texture uv coords:
1620    static const float border_size_static = 0.015;           //  range [0, 0.5]
1621    //  Border darkness: Moderate values darken the border smoothly, and high
1622    //  values make the image very dark just inside the border:
1623    static const float border_darkness_static = 2.0;        //  range [0, inf)
1624    //  Border compression: High numbers compress border transitions, narrowing
1625    //  the dark border area.
1626    static const float border_compress_static = 2.5;        //  range [1, inf)
1627
1628
1629#endif  //  USER_SETTINGS_H
1630
1631/////////////////////////////   END USER-SETTINGS   ////////////////////////////
1632
1633//#include "derived-settings-and-constants.h"
1634
1635/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
1636
1637#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
1638#define DERIVED_SETTINGS_AND_CONSTANTS_H
1639
1640/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
1641
1642//  crt-royale: A full-featured CRT shader, with cheese.
1643//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
1644//
1645//  This program is free software; you can redistribute it and/or modify it
1646//  under the terms of the GNU General Public License as published by the Free
1647//  Software Foundation; either version 2 of the License, or any later version.
1648//
1649//  This program is distributed in the hope that it will be useful, but WITHOUT
1650//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
1651//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
1652//  more details.
1653//
1654//  You should have received a copy of the GNU General Public License along with
1655//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
1656//  Place, Suite 330, Boston, MA 02111-1307 USA
1657
1658
1659/////////////////////////////////  DESCRIPTION  ////////////////////////////////
1660
1661//  These macros and constants can be used across the whole codebase.
1662//  Unlike the values in user-settings.cgh, end users shouldn't modify these.
1663
1664
1665///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
1666
1667//#include "../user-settings.h"
1668
1669/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
1670
1671#ifndef USER_SETTINGS_H
1672#define USER_SETTINGS_H
1673
1674/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
1675
1676//  The Cg compiler uses different "profiles" with different capabilities.
1677//  This shader requires a Cg compilation profile >= arbfp1, but a few options
1678//  require higher profiles like fp30 or fp40.  The shader can't detect profile
1679//  or driver capabilities, so instead you must comment or uncomment the lines
1680//  below with "//" before "#define."  Disable an option if you get compilation
1681//  errors resembling those listed.  Generally speaking, all of these options
1682//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
1683//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
1684
1685//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
1686//  Among other things, derivatives help us fix anisotropic filtering artifacts
1687//  with curved manually tiled phosphor mask coords.  Related errors:
1688//  error C3004: function "float2 ddx(float2);" not supported in this profile
1689//  error C3004: function "float2 ddy(float2);" not supported in this profile
1690    //#define DRIVERS_ALLOW_DERIVATIVES
1691
1692//  Fine derivatives: Unsupported on older ATI cards.
1693//  Fine derivatives enable 2x2 fragment block communication, letting us perform
1694//  fast single-pass blur operations.  If your card uses coarse derivatives and
1695//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
1696    #ifdef DRIVERS_ALLOW_DERIVATIVES
1697        #define DRIVERS_ALLOW_FINE_DERIVATIVES
1698    #endif
1699
1700//  Dynamic looping: Requires an fp30 or newer profile.
1701//  This makes phosphor mask resampling faster in some cases.  Related errors:
1702//  error C5013: profile does not support "for" statements and "for" could not
1703//  be unrolled
1704    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
1705
1706//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
1707//  Using one static loop avoids overhead if the user is right, but if the user
1708//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
1709//  binary search can potentially save some iterations.  However, it may fail:
1710//  error C6001: Temporary register limit of 32 exceeded; 35 registers
1711//  needed to compile program
1712    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
1713
1714//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
1715//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
1716//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
1717//  this profile
1718    //#define DRIVERS_ALLOW_TEX2DLOD
1719
1720//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
1721//  artifacts from anisotropic filtering and mipmapping.  Related errors:
1722//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
1723//  in this profile
1724    //#define DRIVERS_ALLOW_TEX2DBIAS
1725
1726//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
1727//  impose stricter limitations on register counts and instructions.  Enable
1728//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
1729//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
1730//  to compile program.
1731//  Enabling integrated graphics compatibility mode will automatically disable:
1732//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
1733//      (This may be reenabled in a later release.)
1734//  2.) RUNTIME_GEOMETRY_MODE
1735//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
1736    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
1737
1738
1739////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
1740
1741//  To disable a #define option, turn its line into a comment with "//."
1742
1743//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
1744//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
1745//  many of the options in this file and allow real-time tuning, but many of
1746//  them are slower.  Disabling them and using this text file will boost FPS.
1747#define RUNTIME_SHADER_PARAMS_ENABLE
1748//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
1749//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
1750#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
1751//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
1752#define RUNTIME_ANTIALIAS_WEIGHTS
1753//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
1754//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
1755//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
1756//  parameters?  This will require more math or dynamic branching.
1757#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
1758//  Specify the tilt at runtime?  This makes things about 3% slower.
1759#define RUNTIME_GEOMETRY_TILT
1760//  Specify the geometry mode at runtime?
1761#define RUNTIME_GEOMETRY_MODE
1762//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
1763//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
1764//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
1765#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
1766
1767//  PHOSPHOR MASK:
1768//  Manually resize the phosphor mask for best results (slower)?  Disabling this
1769//  removes the option to do so, but it may be faster without dynamic branches.
1770    #define PHOSPHOR_MASK_MANUALLY_RESIZE
1771//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
1772    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
1773//  Larger blurs are expensive, but we need them to blur larger triads.  We can
1774//  detect the right blur if the triad size is static or our profile allows
1775//  dynamic branches, but otherwise we use the largest blur the user indicates
1776//  they might need:
1777    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
1778    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
1779    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
1780    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
1781    //  Here's a helpful chart:
1782    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
1783    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
1784    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
1785    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
1786    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
1787    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
1788
1789
1790///////////////////////////////  USER PARAMETERS  //////////////////////////////
1791
1792//  Note: Many of these static parameters are overridden by runtime shader
1793//  parameters when those are enabled.  However, many others are static codepath
1794//  options that were cleaner or more convert to code as static constants.
1795
1796//  GAMMA:
1797    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
1798    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
1799
1800//  LEVELS MANAGEMENT:
1801    //  Control the final multiplicative image contrast:
1802    static const float levels_contrast_static = 1.0;            //  range [0, 4)
1803    //  We auto-dim to avoid clipping between passes and restore brightness
1804    //  later.  Control the dim factor here: Lower values clip less but crush
1805    //  blacks more (static only for now).
1806    static const float levels_autodim_temp = 0.5;               //  range (0, 1] default is 0.5 but that was unnecessarily dark for me, so I set it to 1.0
1807
1808//  HALATION/DIFFUSION/BLOOM:
1809    //  Halation weight: How much energy should be lost to electrons bounding
1810    //  around under the CRT glass and exciting random phosphors?
1811    static const float halation_weight_static = 0.0;            //  range [0, 1]
1812    //  Refractive diffusion weight: How much light should spread/diffuse from
1813    //  refracting through the CRT glass?
1814    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
1815    //  Underestimate brightness: Bright areas bloom more, but we can base the
1816    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
1817    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
1818    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
1819    //  Blur all colors more than necessary for a softer phosphor bloom?
1820    static const float bloom_excess_static = 0.0;               //  range [0, 1]
1821    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
1822    //  blurred resize of the input (convergence offsets are applied as well).
1823    //  There are three filter options (static option only for now):
1824    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
1825    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
1826    //      and beam_max_sigma is low.
1827    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
1828    //      always uses a static sigma regardless of beam_max_sigma or
1829    //      mask_num_triads_desired.
1830    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
1831    //  These options are more pronounced for the fast, unbloomed shader version.
1832#ifndef RADEON_FIX
1833    static const float bloom_approx_filter_static = 2.0;
1834#else
1835    static const float bloom_approx_filter_static = 1.0;
1836#endif
1837
1838//  ELECTRON BEAM SCANLINE DISTRIBUTION:
1839    //  How many scanlines should contribute light to each pixel?  Using more
1840    //  scanlines is slower (especially for a generalized Gaussian) but less
1841    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
1842    //  max_beam_sigma at which the closest unused weight is guaranteed <
1843    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
1844    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
1845    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
1846    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
1847    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
1848    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
1849    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
1850    //  A generalized Gaussian beam varies shape with color too, now just width.
1851    //  It's slower but more flexible (static option only for now).
1852    static const bool beam_generalized_gaussian = true;
1853    //  What kind of scanline antialiasing do you want?
1854    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
1855    //  Integrals are slow (especially for generalized Gaussians) and rarely any
1856    //  better than 3x antialiasing (static option only for now).
1857    static const float beam_antialias_level = 1.0;              //  range [0, 2]
1858    //  Min/max standard deviations for scanline beams: Higher values widen and
1859    //  soften scanlines.  Depending on other options, low min sigmas can alias.
1860    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
1861    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
1862    //  Beam width varies as a function of color: A power function (0) is more
1863    //  configurable, but a spherical function (1) gives the widest beam
1864    //  variability without aliasing (static option only for now).
1865    static const float beam_spot_shape_function = 0.0;
1866    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
1867    //  sharpness.  Powers >= 1.0 are awful unless mix/max sigmas are close.
1868    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
1869    //  Generalized Gaussian max shape parameters: Higher values give flatter
1870    //  scanline plateaus and steeper dropoffs, simultaneously widening and
1871    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
1872    //  values > ~40.0 cause artifacts with integrals.
1873    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
1874    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
1875    //  Generalized Gaussian shape power: Affects how quickly the distribution
1876    //  changes shape from Gaussian to steep/plateaued as color increases from 0
1877    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
1878    //  appear sharper for most colors.
1879    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
1880    //  What filter should be used to sample scanlines horizontally?
1881    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
1882    static const float beam_horiz_filter_static = 0.0;
1883    //  Standard deviation for horizontal Gaussian resampling:
1884    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
1885    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
1886    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
1887    //  limiting circuitry in some CRT's), or a weighted avg.?
1888    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
1889    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
1890    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
1891    //  later passes (static option only for now).
1892    static const bool beam_misconvergence = true;
1893    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
1894    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
1895    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
1896    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
1897    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
1898    //  Detect interlacing (static option only for now)?
1899    static const bool interlace_detect = true;
1900    //  Assume 1080-line sources are interlaced?
1901    static const bool interlace_1080i_static = false;
1902    //  For interlaced sources, assume TFF (top-field first) or BFF order?
1903    //  (Whether this matters depends on the nature of the interlaced input.)
1904    static const bool interlace_bff_static = false;
1905
1906//  ANTIALIASING:
1907    //  What AA level do you want for curvature/overscan/subpixels?  Options:
1908    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
1909    //  (Static option only for now)
1910    static const float aa_level = 12.0;                     //  range [0, 24]
1911    //  What antialiasing filter do you want (static option only)?  Options:
1912    //  0: Box (separable), 1: Box (cylindrical),
1913    //  2: Tent (separable), 3: Tent (cylindrical),
1914    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
1915    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
1916    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
1917    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
1918    static const float aa_filter = 6.0;                     //  range [0, 9]
1919    //  Flip the sample grid on odd/even frames (static option only for now)?
1920    static const bool aa_temporal = false;
1921    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
1922    //  the blue offset is the negative r offset; range [0, 0.5]
1923    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
1924    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
1925    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
1926    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
1927    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
1928    //  4.) C = 0.0 is a soft spline filter.
1929    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
1930    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
1931    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
1932
1933//  PHOSPHOR MASK:
1934    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
1935    static const float mask_type_static = 1.0;                  //  range [0, 2]
1936    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
1937    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
1938    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
1939    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
1940    //      is halfway decent with LUT mipmapping but atrocious without it.
1941    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
1942    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
1943    //      This mode reuses the same masks, so triads will be enormous unless
1944    //      you change the mask LUT filenames in your .cgp file.
1945    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
1946    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
1947    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
1948    //  will always be used to calculate the full bloom sigma statically.
1949    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
1950    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
1951    //  triads) will be rounded to the nearest integer tile size and clamped to
1952    //  obey minimum size constraints (imposed to reduce downsize taps) and
1953    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
1954    //  To increase the size limit, double the viewport-relative scales for the
1955    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
1956    //      range [1, mask_texture_small_size/mask_triads_per_tile]
1957    static const float mask_triad_size_desired_static = 24.0 / 8.0;
1958    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
1959    //  final size will be rounded and constrained as above); default 480.0
1960    static const float mask_num_triads_desired_static = 480.0;
1961    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
1962    //  more samples and avoid moire a bit better, but some is unavoidable
1963    //  depending on the destination size (static option for now).
1964    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
1965    //  The mask is resized using a variable number of taps in each dimension,
1966    //  but some Cg profiles always fetch a constant number of taps no matter
1967    //  what (no dynamic branching).  We can limit the maximum number of taps if
1968    //  we statically limit the minimum phosphor triad size.  Larger values are
1969    //  faster, but the limit IS enforced (static option only, forever);
1970    //      range [1, mask_texture_small_size/mask_triads_per_tile]
1971    //  TODO: Make this 1.0 and compensate with smarter sampling!
1972    static const float mask_min_allowed_triad_size = 2.0;
1973
1974//  GEOMETRY:
1975    //  Geometry mode:
1976    //  0: Off (default), 1: Spherical mapping (like cgwg's),
1977    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
1978    static const float geom_mode_static = 0.0;      //  range [0, 3]
1979    //  Radius of curvature: Measured in units of your viewport's diagonal size.
1980    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
1981    //  View dist is the distance from the player to their physical screen, in
1982    //  units of the viewport's diagonal size.  It controls the field of view.
1983    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
1984    //  Tilt angle in radians (clockwise around up and right vectors):
1985    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
1986    //  Aspect ratio: When the true viewport size is unknown, this value is used
1987    //  to help convert between the phosphor triad size and count, along with
1988    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
1989    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
1990    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
1991    //  default (256/224)*(54/47) = 1.313069909 (see below)
1992    static const float geom_aspect_ratio_static = 1.313069909;
1993    //  Before getting into overscan, here's some general aspect ratio info:
1994    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
1995    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
1996    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
1997    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
1998    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
1999    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
2000    //  a.) Enable Retroarch's "Crop Overscan"
2001    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
2002    //  Real consoles use horizontal black padding in the signal, but emulators
2003    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
2004    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
2005    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
2006    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
2007    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
2008    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
2009    //  without doing a. or b., but horizontal image borders will be tighter
2010    //  than vertical ones, messing up curvature and overscan.  Fixing the
2011    //  padding first corrects this.
2012    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
2013    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
2014    //  above: Values < 1.0 zoom out; range (0, inf)
2015    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
2016    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
2017    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
2018    //  with strong curvature (static option only for now).
2019    static const bool geom_force_correct_tangent_matrix = true;
2020
2021//  BORDERS:
2022    //  Rounded border size in texture uv coords:
2023    static const float border_size_static = 0.015;           //  range [0, 0.5]
2024    //  Border darkness: Moderate values darken the border smoothly, and high
2025    //  values make the image very dark just inside the border:
2026    static const float border_darkness_static = 2.0;        //  range [0, inf)
2027    //  Border compression: High numbers compress border transitions, narrowing
2028    //  the dark border area.
2029    static const float border_compress_static = 2.5;        //  range [1, inf)
2030
2031
2032#endif  //  USER_SETTINGS_H
2033
2034/////////////////////////////   END USER-SETTINGS   ////////////////////////////
2035
2036//#include "user-cgp-constants.h"
2037
2038/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
2039
2040#ifndef USER_CGP_CONSTANTS_H
2041#define USER_CGP_CONSTANTS_H
2042
2043//  IMPORTANT:
2044//  These constants MUST be set appropriately for the settings in crt-royale.cgp
2045//  (or whatever related .cgp file you're using).  If they aren't, you're likely
2046//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
2047//  set directly in the .cgp file to make things easier, but...they can't.
2048
2049//  PASS SCALES AND RELATED CONSTANTS:
2050//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
2051//  this shader: One does a viewport-scale bloom, and the other skips it.  The
2052//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
2053static const float bloom_approx_size_x = 320.0;
2054static const float bloom_approx_size_x_for_fake = 400.0;
2055//  Copy the viewport-relative scales of the phosphor mask resize passes
2056//  (MASK_RESIZE and the pass immediately preceding it):
2057static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
2058//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
2059static const float geom_max_aspect_ratio = 4.0/3.0;
2060
2061//  PHOSPHOR MASK TEXTURE CONSTANTS:
2062//  Set the following constants to reflect the properties of the phosphor mask
2063//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
2064//  based on user settings, then repeats a single tile until filling the screen.
2065//  The shader must know the input texture size (default 64x64), and to manually
2066//  resize, it must also know the horizontal triads per tile (default 8).
2067static const float2 mask_texture_small_size = float2(64.0, 64.0);
2068static const float2 mask_texture_large_size = float2(512.0, 512.0);
2069static const float mask_triads_per_tile = 8.0;
2070//  We need the average brightness of the phosphor mask to compensate for the
2071//  dimming it causes.  The following four values are roughly correct for the
2072//  masks included with the shader.  Update the value for any LUT texture you
2073//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
2074//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
2075//#define PHOSPHOR_MASK_GRILLE14
2076static const float mask_grille14_avg_color = 50.6666666/255.0;
2077    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
2078    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
2079static const float mask_grille15_avg_color = 53.0/255.0;
2080    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
2081    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
2082static const float mask_slot_avg_color = 46.0/255.0;
2083    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
2084    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
2085static const float mask_shadow_avg_color = 41.0/255.0;
2086    //  TileableLinearShadowMask*.png
2087    //  TileableLinearShadowMaskEDP*.png
2088
2089#ifdef PHOSPHOR_MASK_GRILLE14
2090    static const float mask_grille_avg_color = mask_grille14_avg_color;
2091#else
2092    static const float mask_grille_avg_color = mask_grille15_avg_color;
2093#endif
2094
2095
2096#endif  //  USER_CGP_CONSTANTS_H
2097
2098//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
2099
2100////////////////////////////////  END INCLUDES  ////////////////////////////////
2101
2102///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
2103
2104//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
2105#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
2106
2107//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
2108#ifndef SIMULATE_CRT_ON_LCD
2109    #define SIMULATE_CRT_ON_LCD
2110#endif
2111
2112//  Manually tiling a manually resized texture creates texture coord derivative
2113//  discontinuities and confuses anisotropic filtering, causing discolored tile
2114//  seams in the phosphor mask.  Workarounds:
2115//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
2116//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
2117//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
2118//  b.) "Tile flat twice" requires drawing two full tiles without border padding
2119//      to the resized mask FBO, and it's incompatible with same-pass curvature.
2120//      (Same-pass curvature isn't used but could be in the future...maybe.)
2121//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
2122//      border padding to the resized mask FBO, but it works with same-pass
2123//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
2124//  Precedence: a, then, b, then c (if multiple strategies are #defined).
2125    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
2126    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
2127    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
2128//  Also, manually resampling the phosphor mask is slightly blurrier with
2129//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
2130//  creates artifacts, but only with the fully bloomed shader.)  The difference
2131//  is subtle with small triads, but you can fix it for a small cost.
2132    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
2133
2134
2135//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
2136
2137//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
2138//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
2139//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
2140//  #defined by either user-settings.h or a wrapper .cg that #includes the
2141//  current .cg pass.)
2142#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
2143    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
2144        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
2145    #endif
2146    #ifdef RUNTIME_GEOMETRY_MODE
2147        #undef RUNTIME_GEOMETRY_MODE
2148    #endif
2149    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
2150    //  inferior in most cases, so replace 2.0 with 0.0:
2151    static const float bloom_approx_filter =
2152        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
2153#else
2154    static const float bloom_approx_filter = bloom_approx_filter_static;
2155#endif
2156
2157//  Disable slow runtime paths if static parameters are used.  Most of these
2158//  won't be a problem anyway once the params are disabled, but some will.
2159#ifndef RUNTIME_SHADER_PARAMS_ENABLE
2160    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
2161        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
2162    #endif
2163    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
2164        #undef RUNTIME_ANTIALIAS_WEIGHTS
2165    #endif
2166    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
2167        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
2168    #endif
2169    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
2170        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
2171    #endif
2172    #ifdef RUNTIME_GEOMETRY_TILT
2173        #undef RUNTIME_GEOMETRY_TILT
2174    #endif
2175    #ifdef RUNTIME_GEOMETRY_MODE
2176        #undef RUNTIME_GEOMETRY_MODE
2177    #endif
2178    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
2179        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
2180    #endif
2181#endif
2182
2183//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
2184#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
2185    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
2186#endif
2187#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
2188    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
2189#endif
2190//  Rule out unavailable anisotropic compatibility strategies:
2191#ifndef DRIVERS_ALLOW_DERIVATIVES
2192    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
2193        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
2194    #endif
2195#endif
2196#ifndef DRIVERS_ALLOW_TEX2DLOD
2197    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
2198        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
2199    #endif
2200    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
2201        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
2202    #endif
2203    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
2204        #undef ANTIALIAS_DISABLE_ANISOTROPIC
2205    #endif
2206#endif
2207#ifndef DRIVERS_ALLOW_TEX2DBIAS
2208    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
2209        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
2210    #endif
2211    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
2212        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
2213    #endif
2214#endif
2215//  Prioritize anisotropic tiling compatibility strategies by performance and
2216//  disable unused strategies.  This concentrates all the nesting in one place.
2217#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
2218    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
2219        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
2220    #endif
2221    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
2222        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
2223    #endif
2224    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
2225        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
2226    #endif
2227#else
2228    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
2229        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
2230            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
2231        #endif
2232        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
2233            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
2234        #endif
2235    #else
2236        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
2237        //  flat texture coords in the same pass, but that's all we use.
2238        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
2239            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
2240                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
2241            #endif
2242        #endif
2243    #endif
2244#endif
2245//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
2246//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
2247#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
2248    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
2249#endif
2250#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
2251    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
2252#endif
2253//  Prioritize anisotropic resampling compatibility strategies the same way:
2254#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
2255    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
2256        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
2257    #endif
2258#endif
2259
2260
2261///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
2262
2263//  If we can use the large mipmapped LUT without mipmapping artifacts, we
2264//  should: It gives us more options for using fewer samples.
2265#ifdef DRIVERS_ALLOW_TEX2DLOD
2266    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
2267        //  TODO: Take advantage of this!
2268        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
2269        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
2270    #else
2271        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
2272    #endif
2273#else
2274    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
2275#endif
2276
2277
2278//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
2279//  main_fragment, or a static alias of one of the above.  This makes it hard
2280//  to select the phosphor mask at runtime: We can't even assign to a uniform
2281//  global in the vertex shader or select a sampler2D in the vertex shader and
2282//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
2283//  because it just gives us the input texture or a black screen.  However, we
2284//  can get around these limitations by calling tex2D three times with different
2285//  uniform samplers (or resizing the phosphor mask three times altogether).
2286//  With dynamic branches, we can process only one of these branches on top of
2287//  quickly discarding fragments we don't need (cgc seems able to overcome
2288//  limigations around dependent texture fetches inside of branches).  Without
2289//  dynamic branches, we have to process every branch for every fragment...which
2290//  is slower.  Runtime sampling mode selection is slower without dynamic
2291//  branches as well.  Let the user's static #defines decide if it's worth it.
2292#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
2293    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
2294#else
2295    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
2296        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
2297    #endif
2298#endif
2299
2300//  We need to render some minimum number of tiles in the resize passes.
2301//  We need at least 1.0 just to repeat a single tile, and we need extra
2302//  padding beyond that for anisotropic filtering, discontinuitity fixing,
2303//  antialiasing, same-pass curvature (not currently used), etc.  First
2304//  determine how many border texels and tiles we need, based on how the result
2305//  will be sampled:
2306#ifdef GEOMETRY_EARLY
2307        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
2308        //  Most antialiasing filters have a base radius of 4.0 pixels:
2309        static const float max_aa_base_pixel_border = 4.0 +
2310            max_subpixel_offset;
2311#else
2312    static const float max_aa_base_pixel_border = 0.0;
2313#endif
2314//  Anisotropic filtering adds about 0.5 to the pixel border:
2315#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
2316    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
2317#else
2318    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
2319#endif
2320//  Fixing discontinuities adds 1.0 more to the pixel border:
2321#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
2322    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
2323#else
2324    static const float max_tiled_pixel_border = max_aniso_pixel_border;
2325#endif
2326//  Convert the pixel border to an integer texel border.  Assume same-pass
2327//  curvature about triples the texel frequency:
2328#ifdef GEOMETRY_EARLY
2329    static const float max_mask_texel_border =
2330        ceil(max_tiled_pixel_border * 3.0);
2331#else
2332    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
2333#endif
2334//  Convert the texel border to a tile border using worst-case assumptions:
2335static const float max_mask_tile_border = max_mask_texel_border/
2336    (mask_min_allowed_triad_size * mask_triads_per_tile);
2337
2338//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
2339//  the starting texel (inside borders) for sampling it.
2340#ifndef GEOMETRY_EARLY
2341    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
2342        //  Special case: Render two tiles without borders.  Anisotropic
2343        //  filtering doesn't seem to be a problem here.
2344        static const float mask_resize_num_tiles = 1.0 + 1.0;
2345        static const float mask_start_texels = 0.0;
2346    #else
2347        static const float mask_resize_num_tiles = 1.0 +
2348            2.0 * max_mask_tile_border;
2349        static const float mask_start_texels = max_mask_texel_border;
2350    #endif
2351#else
2352    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
2353    static const float mask_start_texels = max_mask_texel_border;
2354#endif
2355
2356//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
2357//  mask_resize_viewport_scale.  This limits the maximum final triad size.
2358//  Estimate the minimum number of triads we can split the screen into in each
2359//  dimension (we'll be as correct as mask_resize_viewport_scale is):
2360static const float mask_resize_num_triads =
2361    mask_resize_num_tiles * mask_triads_per_tile;
2362static const float2 min_allowed_viewport_triads =
2363    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
2364
2365
2366////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
2367
2368static const float pi = 3.141592653589;
2369//  We often want to find the location of the previous texel, e.g.:
2370//      const float2 curr_texel = uv * texture_size;
2371//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
2372//      const float2 prev_texel_uv = prev_texel / texture_size;
2373//  However, many GPU drivers round incorrectly around exact texel locations.
2374//  We need to subtract a little less than 0.5 before flooring, and some GPU's
2375//  require this value to be farther from 0.5 than others; define it here.
2376//      const float2 prev_texel =
2377//          floor(curr_texel - float2(under_half)) + float2(0.5);
2378static const float under_half = 0.4995;
2379
2380
2381#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
2382
2383////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
2384
2385////////////////////////////////  END INCLUDES  ////////////////////////////////
2386
2387//  Override some parameters for gamma-management.h and tex2Dantialias.h:
2388#define OVERRIDE_DEVICE_GAMMA
2389static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
2390#define ANTIALIAS_OVERRIDE_BASICS
2391#define ANTIALIAS_OVERRIDE_PARAMETERS
2392
2393//  Disable runtime shader params if the user doesn't explicitly want them.
2394//  Static constants will be defined in place of uniforms of the same name.
2395#ifndef RUNTIME_SHADER_PARAMS_ENABLE
2396    #undef PARAMETER_UNIFORM
2397#endif
2398
2399#ifdef PARAMETER_UNIFORM
2400	uniform COMPAT_PRECISION float crt_gamma;
2401	uniform COMPAT_PRECISION float lcd_gamma;
2402	uniform COMPAT_PRECISION float levels_contrast;
2403	uniform COMPAT_PRECISION float halation_weight;
2404	uniform COMPAT_PRECISION float diffusion_weight;
2405	uniform COMPAT_PRECISION float bloom_underestimate_levels;
2406	uniform COMPAT_PRECISION float bloom_excess;
2407	uniform COMPAT_PRECISION float beam_min_sigma;
2408	uniform COMPAT_PRECISION float beam_max_sigma;
2409	uniform COMPAT_PRECISION float beam_spot_power;
2410	uniform COMPAT_PRECISION float beam_min_shape;
2411	uniform COMPAT_PRECISION float beam_max_shape;
2412	uniform COMPAT_PRECISION float beam_shape_power;
2413	uniform COMPAT_PRECISION float beam_horiz_sigma;
2414	#ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
2415		uniform COMPAT_PRECISION float beam_horiz_filter;
2416		uniform COMPAT_PRECISION float beam_horiz_linear_rgb_weight;
2417	#else
2418        COMPAT_PRECISION float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
2419        COMPAT_PRECISION float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
2420    #endif
2421	uniform COMPAT_PRECISION float convergence_offset_x_r;
2422	uniform COMPAT_PRECISION float convergence_offset_x_g;
2423	uniform COMPAT_PRECISION float convergence_offset_x_b;
2424	uniform COMPAT_PRECISION float convergence_offset_y_r;
2425	uniform COMPAT_PRECISION float convergence_offset_y_g;
2426	uniform COMPAT_PRECISION float convergence_offset_y_b;
2427	#ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
2428        uniform COMPAT_PRECISION float mask_type;
2429    #else
2430        COMPAT_PRECISION float mask_type = clamp(mask_type_static, 0.0, 2.0);
2431    #endif
2432	uniform COMPAT_PRECISION float mask_specify_num_triads;
2433    uniform COMPAT_PRECISION float mask_triad_size_desired;
2434	uniform COMPAT_PRECISION float mask_sample_mode_desired;
2435	uniform COMPAT_PRECISION float mask_num_triads_desired;
2436	uniform COMPAT_PRECISION float aa_subpixel_r_offset_x_runtime;
2437	uniform COMPAT_PRECISION float aa_subpixel_r_offset_y_runtime;
2438	#ifdef RUNTIME_ANTIALIAS_WEIGHTS
2439		uniform COMPAT_PRECISION float aa_cubic_c;
2440		uniform COMPAT_PRECISION float aa_gauss_sigma;
2441    #else
2442        COMPAT_PRECISION float aa_cubic_c = aa_cubic_c_static;                              //  Clamp to [0, 4]?
2443        COMPAT_PRECISION float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static);  //  Clamp to [FIXZERO(0), 1]?
2444    #endif
2445	uniform COMPAT_PRECISION float geom_mode_runtime;
2446	uniform COMPAT_PRECISION float geom_radius;
2447	uniform COMPAT_PRECISION float geom_view_dist;
2448	uniform COMPAT_PRECISION float geom_tilt_angle_x;
2449	uniform COMPAT_PRECISION float geom_tilt_angle_y;
2450	uniform COMPAT_PRECISION float geom_aspect_ratio_x;
2451	uniform COMPAT_PRECISION float geom_aspect_ratio_y;
2452	uniform COMPAT_PRECISION float geom_overscan_x;
2453	uniform COMPAT_PRECISION float geom_overscan_y;
2454	uniform COMPAT_PRECISION float border_size;
2455	uniform COMPAT_PRECISION float border_darkness;
2456	uniform COMPAT_PRECISION float border_compress;
2457	uniform COMPAT_PRECISION float interlace_bff;
2458	uniform COMPAT_PRECISION float interlace_1080i;
2459#else
2460    //  Use constants from user-settings.h, and limit ranges appropriately:
2461    COMPAT_PRECISION float crt_gamma = max(0.0, crt_gamma_static);
2462    COMPAT_PRECISION float lcd_gamma = max(0.0, lcd_gamma_static);
2463    COMPAT_PRECISION float levels_contrast = clamp(levels_contrast_static, 0.0, 4.0);
2464    COMPAT_PRECISION float halation_weight = clamp(halation_weight_static, 0.0, 1.0);
2465    COMPAT_PRECISION float diffusion_weight = clamp(diffusion_weight_static, 0.0, 1.0);
2466    COMPAT_PRECISION float bloom_underestimate_levels = max(FIX_ZERO(0.0), bloom_underestimate_levels_static);
2467    COMPAT_PRECISION float bloom_excess = clamp(bloom_excess_static, 0.0, 1.0);
2468    COMPAT_PRECISION float beam_min_sigma = max(FIX_ZERO(0.0), beam_min_sigma_static);
2469    COMPAT_PRECISION float beam_max_sigma = max(beam_min_sigma, beam_max_sigma_static);
2470    COMPAT_PRECISION float beam_spot_power = max(beam_spot_power_static, 0.0);
2471    COMPAT_PRECISION float beam_min_shape = max(2.0, beam_min_shape_static);
2472    COMPAT_PRECISION float beam_max_shape = max(beam_min_shape, beam_max_shape_static);
2473    COMPAT_PRECISION float beam_shape_power = max(0.0, beam_shape_power_static);
2474    COMPAT_PRECISION float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
2475    COMPAT_PRECISION float beam_horiz_sigma = max(FIX_ZERO(0.0), beam_horiz_sigma_static);
2476    COMPAT_PRECISION float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
2477    //  Unpack static vector elements to match scalar uniforms:
2478    COMPAT_PRECISION float convergence_offset_x_r = clamp(convergence_offsets_r_static.x, -4.0, 4.0);
2479    COMPAT_PRECISION float convergence_offset_x_g = clamp(convergence_offsets_g_static.x, -4.0, 4.0);
2480    COMPAT_PRECISION float convergence_offset_x_b = clamp(convergence_offsets_b_static.x, -4.0, 4.0);
2481    COMPAT_PRECISION float convergence_offset_y_r = clamp(convergence_offsets_r_static.y, -4.0, 4.0);
2482    COMPAT_PRECISION float convergence_offset_y_g = clamp(convergence_offsets_g_static.y, -4.0, 4.0);
2483    COMPAT_PRECISION float convergence_offset_y_b = clamp(convergence_offsets_b_static.y, -4.0, 4.0);
2484    COMPAT_PRECISION float mask_type = clamp(mask_type_static, 0.0, 2.0);
2485    COMPAT_PRECISION float mask_sample_mode_desired = clamp(mask_sample_mode_static, 0.0, 2.0);
2486    COMPAT_PRECISION float mask_specify_num_triads = clamp(mask_specify_num_triads_static, 0.0, 1.0);
2487    COMPAT_PRECISION float mask_triad_size_desired = clamp(mask_triad_size_desired_static, 1.0, 18.0);
2488    COMPAT_PRECISION float mask_num_triads_desired = clamp(mask_num_triads_desired_static, 342.0, 1920.0);
2489    COMPAT_PRECISION float aa_subpixel_r_offset_x_runtime = clamp(aa_subpixel_r_offset_static.x, -0.5, 0.5);
2490    COMPAT_PRECISION float aa_subpixel_r_offset_y_runtime = clamp(aa_subpixel_r_offset_static.y, -0.5, 0.5);
2491    COMPAT_PRECISION float aa_cubic_c = aa_cubic_c_static;                              //  Clamp to [0, 4]?
2492    COMPAT_PRECISION float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static);  //  Clamp to [FIXZERO(0), 1]?
2493    COMPAT_PRECISION float geom_mode_runtime = clamp(geom_mode_static, 0.0, 3.0);
2494    COMPAT_PRECISION float geom_radius = max(1.0/(2.0*pi), geom_radius_static);         //  Clamp to [1/(2*pi), 1024]?
2495    COMPAT_PRECISION float geom_view_dist = max(0.5, geom_view_dist_static);            //  Clamp to [0.5, 1024]?
2496    COMPAT_PRECISION float geom_tilt_angle_x = clamp(geom_tilt_angle_static.x, -pi, pi);
2497    COMPAT_PRECISION float geom_tilt_angle_y = clamp(geom_tilt_angle_static.y, -pi, pi);
2498    COMPAT_PRECISION float geom_aspect_ratio_x = geom_aspect_ratio_static;              //  Force >= 1?
2499    COMPAT_PRECISION float geom_aspect_ratio_y = 1.0;
2500    COMPAT_PRECISION float geom_overscan_x = max(FIX_ZERO(0.0), geom_overscan_static.x);
2501    COMPAT_PRECISION float geom_overscan_y = max(FIX_ZERO(0.0), geom_overscan_static.y);
2502    COMPAT_PRECISION float border_size = clamp(border_size_static, 0.0, 0.5);           //  0.5 reaches to image center
2503    COMPAT_PRECISION float border_darkness = max(0.0, border_darkness_static);
2504    COMPAT_PRECISION float border_compress = max(1.0, border_compress_static);          //  < 1.0 darkens whole image
2505    COMPAT_PRECISION float interlace_bff = float(interlace_bff_static);
2506    COMPAT_PRECISION float interlace_1080i = float(interlace_1080i_static);
2507#endif
2508
2509//  Provide accessors for vector constants that pack scalar uniforms:
2510inline float2 get_aspect_vector(const float geom_aspect_ratio)
2511{
2512    //  Get an aspect ratio vector.  Enforce geom_max_aspect_ratio, and prevent
2513    //  the absolute scale from affecting the uv-mapping for curvature:
2514    const float geom_clamped_aspect_ratio =
2515        min(geom_aspect_ratio, geom_max_aspect_ratio);
2516    const float2 geom_aspect =
2517        normalize(float2(geom_clamped_aspect_ratio, 1.0));
2518    return geom_aspect;
2519}
2520
2521inline float2 get_geom_overscan_vector()
2522{
2523    return float2(geom_overscan_x, geom_overscan_y);
2524}
2525
2526inline float2 get_geom_tilt_angle_vector()
2527{
2528    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
2529}
2530
2531inline float3 get_convergence_offsets_x_vector()
2532{
2533    return float3(convergence_offset_x_r, convergence_offset_x_g,
2534        convergence_offset_x_b);
2535}
2536
2537inline float3 get_convergence_offsets_y_vector()
2538{
2539    return float3(convergence_offset_y_r, convergence_offset_y_g,
2540        convergence_offset_y_b);
2541}
2542
2543inline float2 get_convergence_offsets_r_vector()
2544{
2545    return float2(convergence_offset_x_r, convergence_offset_y_r);
2546}
2547
2548inline float2 get_convergence_offsets_g_vector()
2549{
2550    return float2(convergence_offset_x_g, convergence_offset_y_g);
2551}
2552
2553inline float2 get_convergence_offsets_b_vector()
2554{
2555    return float2(convergence_offset_x_b, convergence_offset_y_b);
2556}
2557
2558inline float2 get_aa_subpixel_r_offset()
2559{
2560    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
2561        #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
2562            //  WARNING: THIS IS EXTREMELY EXPENSIVE.
2563            return float2(aa_subpixel_r_offset_x_runtime,
2564                aa_subpixel_r_offset_y_runtime);
2565        #else
2566            return aa_subpixel_r_offset_static;
2567        #endif
2568    #else
2569        return aa_subpixel_r_offset_static;
2570    #endif
2571}
2572
2573//  Provide accessors settings which still need "cooking:"
2574inline float get_mask_amplify()
2575{
2576    static const float mask_grille_amplify = 1.0/mask_grille_avg_color;
2577    static const float mask_slot_amplify = 1.0/mask_slot_avg_color;
2578    static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color;
2579    return mask_type < 0.5 ? mask_grille_amplify :
2580        mask_type < 1.5 ? mask_slot_amplify :
2581        mask_shadow_amplify;
2582}
2583
2584inline float get_mask_sample_mode()
2585{
2586    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
2587        #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
2588            return mask_sample_mode_desired;
2589        #else
2590            return clamp(mask_sample_mode_desired, 1.0, 2.0);
2591        #endif
2592    #else
2593        #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
2594            return mask_sample_mode_static;
2595        #else
2596            return clamp(mask_sample_mode_static, 1.0, 2.0);
2597        #endif
2598    #endif
2599}
2600
2601#endif  //  BIND_SHADER_PARAMS_H
2602
2603////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
2604
2605#ifndef RUNTIME_GEOMETRY_TILT
2606    //  Create a local-to-global rotation matrix for the CRT's coordinate frame
2607    //  and its global-to-local inverse.  See the vertex shader for details.
2608    //  It's faster to compute these statically if possible.
2609    static const float2 sin_tilt = sin(geom_tilt_angle_static);
2610    static const float2 cos_tilt = cos(geom_tilt_angle_static);
2611    static const float3x3 geom_local_to_global_static = float3x3(
2612        cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x,
2613        0.0, cos_tilt.y, -sin_tilt.y,
2614        -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x);
2615    static const float3x3 geom_global_to_local_static = float3x3(
2616        cos_tilt.x, 0.0, -sin_tilt.x,
2617        sin_tilt.y*sin_tilt.x, cos_tilt.y, sin_tilt.y*cos_tilt.x,
2618        cos_tilt.y*sin_tilt.x, -sin_tilt.y, cos_tilt.y*cos_tilt.x);
2619#endif
2620
2621//////////////////////////////////  INCLUDES  //////////////////////////////////
2622
2623//#include "../../../../include/gamma-management.h"
2624
2625////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
2626
2627#ifndef GAMMA_MANAGEMENT_H
2628#define GAMMA_MANAGEMENT_H
2629
2630/////////////////////////////////  MIT LICENSE  ////////////////////////////////
2631
2632//  Copyright (C) 2014 TroggleMonkey
2633//
2634//  Permission is hereby granted, free of charge, to any person obtaining a copy
2635//  of this software and associated documentation files (the "Software"), to
2636//  deal in the Software without restriction, including without limitation the
2637//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
2638//  sell copies of the Software, and to permit persons to whom the Software is
2639//  furnished to do so, subject to the following conditions:
2640//
2641//  The above copyright notice and this permission notice shall be included in
2642//  all copies or substantial portions of the Software.
2643//
2644//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2645//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2646//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2647//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2648//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
2649//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
2650//  IN THE SOFTWARE.
2651
2652/////////////////////////////////  DESCRIPTION  ////////////////////////////////
2653
2654//  This file provides gamma-aware tex*D*() and encode_output() functions.
2655//  Requires:   Before #include-ing this file, the including file must #define
2656//              the following macros when applicable and follow their rules:
2657//              1.) #define FIRST_PASS if this is the first pass.
2658//              2.) #define LAST_PASS if this is the last pass.
2659//              3.) If sRGB is available, set srgb_framebufferN = "true" for
2660//                  every pass except the last in your .cgp preset.
2661//              4.) If sRGB isn't available but you want gamma-correctness with
2662//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
2663//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 5-7)
2664//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 6-7)
2665//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 7)
2666//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
2667//              If an option in [5, 8] is #defined in the first or last pass, it
2668//              should be #defined for both.  It shouldn't make a difference
2669//              whether it's #defined for intermediate passes or not.
2670//  Optional:   The including file (or an earlier included file) may optionally
2671//              #define a number of macros indicating it will override certain
2672//              macros and associated constants are as follows:
2673//              static constants with either static or uniform constants.  The
2674//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
2675//                  static const float ntsc_gamma
2676//                  static const float pal_gamma
2677//                  static const float crt_reference_gamma_high
2678//                  static const float crt_reference_gamma_low
2679//                  static const float lcd_reference_gamma
2680//                  static const float crt_office_gamma
2681//                  static const float lcd_office_gamma
2682//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
2683//                  static const float crt_gamma
2684//                  static const float gba_gamma
2685//                  static const float lcd_gamma
2686//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
2687//                  static const float input_gamma
2688//                  static const float intermediate_gamma
2689//                  static const float output_gamma
2690//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
2691//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
2692//                  static const bool assume_opaque_alpha
2693//              The gamma constant overrides must be used in every pass or none,
2694//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
2695//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
2696//  Usage:      After setting macros appropriately, ignore gamma correction and
2697//              replace all tex*D*() calls with equivalent gamma-aware
2698//              tex*D*_linearize calls, except:
2699//              1.) When you read an LUT, use regular tex*D or a gamma-specified
2700//                  function, depending on its gamma encoding:
2701//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
2702//              2.) If you must read pass0's original input in a later pass, use
2703//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
2704//                  input with gamma-corrected bilinear filtering, consider
2705//                  creating a first linearizing pass and reading from the input
2706//                  of pass1 later.
2707//              Then, return encode_output(color) from every fragment shader.
2708//              Finally, use the global gamma_aware_bilinear boolean if you want
2709//              to statically branch based on whether bilinear filtering is
2710//              gamma-correct or not (e.g. for placing Gaussian blur samples).
2711//
2712//  Detailed Policy:
2713//  tex*D*_linearize() functions enforce a consistent gamma-management policy
2714//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
2715//  their input texture has the same encoding characteristics as the input for
2716//  the current pass (which doesn't apply to the exceptions listed above).
2717//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
2718//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
2719//  following two pipelines.
2720//  Typical pipeline with intermediate sRGB framebuffers:
2721//      linear_color = pow(pass0_encoded_color, input_gamma);
2722//      intermediate_output = linear_color;     //  Automatic sRGB encoding
2723//      linear_color = intermediate_output;     //  Automatic sRGB decoding
2724//      final_output = pow(intermediate_output, 1.0/output_gamma);
2725//  Typical pipeline without intermediate sRGB framebuffers:
2726//      linear_color = pow(pass0_encoded_color, input_gamma);
2727//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
2728//      linear_color = pow(intermediate_output, intermediate_gamma);
2729//      final_output = pow(intermediate_output, 1.0/output_gamma);
2730//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
2731//  easily get gamma-correctness without banding on devices where sRGB isn't
2732//  supported.
2733//
2734//  Use This Header to Maximize Code Reuse:
2735//  The purpose of this header is to provide a consistent interface for texture
2736//  reads and output gamma-encoding that localizes and abstracts away all the
2737//  annoying details.  This greatly reduces the amount of code in each shader
2738//  pass that depends on the pass number in the .cgp preset or whether sRGB
2739//  FBO's are being used: You can trivially change the gamma behavior of your
2740//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
2741//  code in your first, Nth, and last passes, you can even put it all in another
2742//  header file and #include it from skeleton .cg files that #define the
2743//  appropriate pass-specific settings.
2744//
2745//  Rationale for Using Three Macros:
2746//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
2747//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
2748//  a lower maintenance burden on each pass.  At first glance it seems we could
2749//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
2750//  This works for simple use cases where input_gamma == output_gamma, but it
2751//  breaks down for more complex scenarios like CRT simulation, where the pass
2752//  number determines the gamma encoding of the input and output.
2753
2754
2755///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
2756
2757//  Set standard gamma constants, but allow users to override them:
2758#ifndef OVERRIDE_STANDARD_GAMMA
2759    //  Standard encoding gammas:
2760    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
2761    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
2762    //  Typical device decoding gammas (only use for emulating devices):
2763    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
2764    //  gammas: The standards purposely undercorrected for an analog CRT's
2765    //  assumed 2.5 reference display gamma to maintain contrast in assumed
2766    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
2767    //  These unstated assumptions about display gamma and perceptual rendering
2768    //  intent caused a lot of confusion, and more modern CRT's seemed to target
2769    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
2770    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
2771    //  displays designed to view sRGB in bright environments.  (Standards are
2772    //  also in flux again with BT.1886, but it's underspecified for displays.)
2773    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
2774    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
2775    static const float lcd_reference_gamma = 2.5;       //  To match CRT
2776    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
2777    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
2778#endif  //  OVERRIDE_STANDARD_GAMMA
2779
2780//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
2781//  but only if they're aware of it.
2782#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
2783    static const bool assume_opaque_alpha = false;
2784#endif
2785
2786
2787///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
2788
2789//  gamma-management.h should be compatible with overriding gamma values with
2790//  runtime user parameters, but we can only define other global constants in
2791//  terms of static constants, not uniform user parameters.  To get around this
2792//  limitation, we need to define derived constants using functions.
2793
2794//  Set device gamma constants, but allow users to override them:
2795#ifdef OVERRIDE_DEVICE_GAMMA
2796    //  The user promises to globally define the appropriate constants:
2797    inline float get_crt_gamma()    {   return crt_gamma;   }
2798    inline float get_gba_gamma()    {   return gba_gamma;   }
2799    inline float get_lcd_gamma()    {   return lcd_gamma;   }
2800#else
2801    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
2802    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
2803    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
2804#endif  //  OVERRIDE_DEVICE_GAMMA
2805
2806//  Set decoding/encoding gammas for the first/lass passes, but allow overrides:
2807#ifdef OVERRIDE_FINAL_GAMMA
2808    //  The user promises to globally define the appropriate constants:
2809    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
2810    inline float get_input_gamma()          {   return input_gamma;         }
2811    inline float get_output_gamma()         {   return output_gamma;        }
2812#else
2813    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
2814    //  ensure middle passes don't need to care if anything is being simulated:
2815    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
2816    #ifdef SIMULATE_CRT_ON_LCD
2817        inline float get_input_gamma()      {   return get_crt_gamma();     }
2818        inline float get_output_gamma()     {   return get_lcd_gamma();     }
2819    #else
2820    #ifdef SIMULATE_GBA_ON_LCD
2821        inline float get_input_gamma()      {   return get_gba_gamma();     }
2822        inline float get_output_gamma()     {   return get_lcd_gamma();     }
2823    #else
2824    #ifdef SIMULATE_LCD_ON_CRT
2825        inline float get_input_gamma()      {   return get_lcd_gamma();     }
2826        inline float get_output_gamma()     {   return get_crt_gamma();     }
2827    #else
2828    #ifdef SIMULATE_GBA_ON_CRT
2829        inline float get_input_gamma()      {   return get_gba_gamma();     }
2830        inline float get_output_gamma()     {   return get_crt_gamma();     }
2831    #else   //  Don't simulate anything:
2832        inline float get_input_gamma()      {   return ntsc_gamma;          }
2833        inline float get_output_gamma()     {   return ntsc_gamma;          }
2834    #endif  //  SIMULATE_GBA_ON_CRT
2835    #endif  //  SIMULATE_LCD_ON_CRT
2836    #endif  //  SIMULATE_GBA_ON_LCD
2837    #endif  //  SIMULATE_CRT_ON_LCD
2838#endif  //  OVERRIDE_FINAL_GAMMA
2839
2840//  Set decoding/encoding gammas for the current pass.  Use static constants for
2841//  linearize_input and gamma_encode_output, because they aren't derived, and
2842//  they let the compiler do dead-code elimination.
2843#ifndef GAMMA_ENCODE_EVERY_FBO
2844    #ifdef FIRST_PASS
2845        static const bool linearize_input = true;
2846        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
2847    #else
2848        static const bool linearize_input = false;
2849        inline float get_pass_input_gamma()     {   return 1.0;                 }
2850    #endif
2851    #ifdef LAST_PASS
2852        static const bool gamma_encode_output = true;
2853        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
2854    #else
2855        static const bool gamma_encode_output = false;
2856        inline float get_pass_output_gamma()    {   return 1.0;                 }
2857    #endif
2858#else
2859    static const bool linearize_input = true;
2860    static const bool gamma_encode_output = true;
2861    #ifdef FIRST_PASS
2862        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
2863    #else
2864        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
2865    #endif
2866    #ifdef LAST_PASS
2867        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
2868    #else
2869        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
2870    #endif
2871#endif
2872
2873//  Users might want to know if bilinear filtering will be gamma-correct:
2874static const bool gamma_aware_bilinear = !linearize_input;
2875
2876
2877//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
2878
2879inline float4 encode_output(const float4 color)
2880{
2881    if(gamma_encode_output)
2882    {
2883        if(assume_opaque_alpha)
2884        {
2885            return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0);
2886        }
2887        else
2888        {
2889            return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a);
2890        }
2891    }
2892    else
2893    {
2894        return color;
2895    }
2896}
2897
2898inline float4 decode_input(const float4 color)
2899{
2900    if(linearize_input)
2901    {
2902        if(assume_opaque_alpha)
2903        {
2904            return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0);
2905        }
2906        else
2907        {
2908            return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a);
2909        }
2910    }
2911    else
2912    {
2913        return color;
2914    }
2915}
2916
2917inline float4 decode_gamma_input(const float4 color, const float3 gamma)
2918{
2919    if(assume_opaque_alpha)
2920    {
2921        return float4(pow(color.rgb, gamma), 1.0);
2922    }
2923    else
2924    {
2925        return float4(pow(color.rgb, gamma), color.a);
2926    }
2927}
2928
2929//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
2930//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
2931// EDIT: it's the 'const' in front of the coords that's doing it
2932
2933///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
2934
2935//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
2936//  Provide a wide array of linearizing texture lookup wrapper functions.  The
2937//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
2938//  lookups are provided for completeness in case that changes someday.  Nobody
2939//  is likely to use the *fetch and *proj functions, but they're included just
2940//  in case.  The only tex*D texture sampling functions omitted are:
2941//      - tex*Dcmpbias
2942//      - tex*Dcmplod
2943//      - tex*DARRAY*
2944//      - tex*DMS*
2945//      - Variants returning integers
2946//  Standard line length restrictions are ignored below for vertical brevity.
2947/*
2948//  tex1D:
2949inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
2950{   return decode_input(tex1D(tex, tex_coords));   }
2951
2952inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
2953{   return decode_input(tex1D(tex, tex_coords));   }
2954
2955inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
2956{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
2957
2958inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
2959{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
2960
2961inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
2962{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
2963
2964inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
2965{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
2966
2967inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
2968{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
2969
2970inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
2971{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
2972
2973//  tex1Dbias:
2974inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
2975{   return decode_input(tex1Dbias(tex, tex_coords));   }
2976
2977inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
2978{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
2979
2980//  tex1Dfetch:
2981inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
2982{   return decode_input(tex1Dfetch(tex, tex_coords));  }
2983
2984inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
2985{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
2986
2987//  tex1Dlod:
2988inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
2989{   return decode_input(tex1Dlod(tex, tex_coords));    }
2990
2991inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
2992{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
2993
2994//  tex1Dproj:
2995inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
2996{   return decode_input(tex1Dproj(tex, tex_coords));   }
2997
2998inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
2999{   return decode_input(tex1Dproj(tex, tex_coords));   }
3000
3001inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
3002{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
3003
3004inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
3005{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
3006*/
3007//  tex2D:
3008inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
3009{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
3010
3011inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
3012{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
3013
3014inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
3015{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
3016
3017inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
3018{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
3019
3020//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
3021//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
3022
3023//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
3024//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
3025
3026//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
3027//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
3028
3029//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
3030//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
3031
3032//  tex2Dbias:
3033//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
3034//{   return decode_input(tex2Dbias(tex, tex_coords));   }
3035
3036//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
3037//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
3038
3039//  tex2Dfetch:
3040//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
3041//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
3042
3043//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
3044//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
3045
3046//  tex2Dlod:
3047inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
3048{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
3049
3050inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
3051{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
3052/*
3053//  tex2Dproj:
3054inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
3055{   return decode_input(tex2Dproj(tex, tex_coords));   }
3056
3057inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
3058{   return decode_input(tex2Dproj(tex, tex_coords));   }
3059
3060inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
3061{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
3062
3063inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
3064{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
3065*/
3066/*
3067//  tex3D:
3068inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
3069{   return decode_input(tex3D(tex, tex_coords));   }
3070
3071inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
3072{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
3073
3074inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
3075{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
3076
3077inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
3078{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
3079
3080//  tex3Dbias:
3081inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
3082{   return decode_input(tex3Dbias(tex, tex_coords));   }
3083
3084inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
3085{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
3086
3087//  tex3Dfetch:
3088inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
3089{   return decode_input(tex3Dfetch(tex, tex_coords));  }
3090
3091inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
3092{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
3093
3094//  tex3Dlod:
3095inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
3096{   return decode_input(tex3Dlod(tex, tex_coords));    }
3097
3098inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
3099{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
3100
3101//  tex3Dproj:
3102inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
3103{   return decode_input(tex3Dproj(tex, tex_coords));   }
3104
3105inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
3106{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
3107/////////*
3108
3109//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
3110//  This narrow selection of nonstandard tex2D* functions can be useful:
3111
3112//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
3113//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
3114//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
3115
3116//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
3117//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
3118
3119
3120//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
3121//  Provide a narrower selection of tex2D* wrapper functions that decode an
3122//  input sample with a specified gamma value.  These are useful for reading
3123//  LUT's and for reading the input of pass0 in a later pass.
3124
3125//  tex2D:
3126inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
3127{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
3128
3129inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
3130{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
3131
3132//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
3133//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
3134
3135//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
3136//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
3137
3138//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
3139//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
3140
3141//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
3142//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
3143
3144//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
3145//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
3146
3147//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
3148//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
3149/*
3150//  tex2Dbias:
3151inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
3152{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
3153
3154inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
3155{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
3156
3157//  tex2Dfetch:
3158inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
3159{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
3160
3161inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
3162{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
3163*/
3164//  tex2Dlod:
3165inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
3166{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
3167
3168inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
3169{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
3170
3171
3172#endif  //  GAMMA_MANAGEMENT_H
3173
3174////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
3175
3176//#include "tex2Dantialias.h"
3177
3178/////////////////////////  BEGIN TEX2DANTIALIAS  /////////////////////////
3179
3180#ifndef TEX2DANTIALIAS_H
3181#define TEX2DANTIALIAS_H
3182
3183/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
3184
3185//  crt-royale: A full-featured CRT shader, with cheese.
3186//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
3187//
3188//  This program is free software; you can redistribute it and/or modify it
3189//  under the terms of the GNU General Public License as published by the Free
3190//  Software Foundation; either version 2 of the License, or any later version.
3191//
3192//  This program is distributed in the hope that it will be useful, but WITHOUT
3193//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
3194//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
3195//  more details.
3196//
3197//  You should have received a copy of the GNU General Public License along with
3198//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
3199//  Place, Suite 330, Boston, MA 02111-1307 USA
3200
3201
3202/////////////////////////////////  DESCRIPTION  ////////////////////////////////
3203
3204//  This file provides antialiased and subpixel-aware tex2D lookups.
3205//  Requires:   All functions share these requirements:
3206//              1.) All requirements of gamma-management.h must be satisfied!
3207//              2.) pixel_to_tex_uv must be a 2x2 matrix that transforms pixe-
3208//                  space offsets to texture uv offsets.  You can get this with:
3209//                      const float2 duv_dx = ddx(tex_uv);
3210//                      const float2 duv_dy = ddy(tex_uv);
3211//                      const float2x2 pixel_to_tex_uv = float2x2(
3212//                          duv_dx.x, duv_dy.x,
3213//                          duv_dx.y, duv_dy.y);
3214//                  This is left to the user in case the current Cg profile
3215//                  doesn't support ddx()/ddy().  Ideally, the user could find
3216//                  calculate a distorted tangent-space mapping analytically.
3217//                  If not, a simple flat mapping can be obtained with:
3218//                      const float2 xy_to_uv_scale = output_size *
3219//                          video_size/texture_size;
3220//                      const float2x2 pixel_to_tex_uv = float2x2(
3221//                          xy_to_uv_scale.x, 0.0,
3222//                          0.0, xy_to_uv_scale.y);
3223//  Optional:   To set basic AA settings, #define ANTIALIAS_OVERRIDE_BASICS and:
3224//              1.) Set an antialiasing level:
3225//                      static const float aa_level = {0 (none),
3226//                          1 (sample subpixels), 4, 5, 6, 7, 8, 12, 16, 20, 24}
3227//              2.) Set a filter type:
3228//                      static const float aa_filter = {
3229//                          0 (Box, Separable), 1 (Box, Cylindrical),
3230//                          2 (Tent, Separable), 3 (Tent, Cylindrical)
3231//                          4 (Gaussian, Separable), 5 (Gaussian, Cylindrical)
3232//                          6 (Cubic, Separable), 7 (Cubic, Cylindrical)
3233//                          8 (Lanczos Sinc, Separable),
3234//                          9 (Lanczos Jinc, Cylindrical)}
3235//                  If the input is unknown, a separable box filter is used.
3236//                  Note: Lanczos Jinc is terrible for sparse sampling, and
3237//                  using aa_axis_importance (see below) defeats the purpose.
3238//              3.) Mirror the sample pattern on odd frames?
3239//                      static const bool aa_temporal = {true, false]
3240//                  This helps rotational invariance but can look "fluttery."
3241//              The user may #define ANTIALIAS_OVERRIDE_PARAMETERS to override
3242//              (all of) the following default parameters with static or uniform
3243//              constants (or an accessor function for subpixel offsets):
3244//              1.) Cubic parameters:
3245//                      static const float aa_cubic_c = 0.5;
3246//                  See http://www.imagemagick.org/Usage/filter/#mitchell
3247//              2.) Gaussian parameters:
3248//                      static const float aa_gauss_sigma =
3249//                          0.5/aa_pixel_diameter;
3250//              3.) Set subpixel offsets.  This requires an accessor function
3251//                  for compatibility with scalar runtime shader   Return
3252//                  a float2 pixel offset in [-0.5, 0.5] for the red subpixel:
3253//                      float2 get_aa_subpixel_r_offset()
3254//              The user may also #define ANTIALIAS_OVERRIDE_STATIC_CONSTANTS to
3255//              override (all of) the following default static values.  However,
3256//              the file's structure requires them to be declared static const:
3257//              1.) static const float aa_lanczos_lobes = 3.0;
3258//              2.) static const float aa_gauss_support = 1.0/aa_pixel_diameter;
3259//                  Note the default tent/Gaussian support radii may appear
3260//                  arbitrary, but extensive testing found them nearly optimal
3261//                  for tough cases like strong distortion at low AA levels.
3262//                  (The Gaussian default is only best for practical gauss_sigma
3263//                  values; much larger gauss_sigmas ironically prefer slightly
3264//                  smaller support given sparse sampling, and vice versa.)
3265//              3.) static const float aa_tent_support = 1.0 / aa_pixel_diameter;
3266//              4.) static const float2 aa_xy_axis_importance:
3267//                  The sparse N-queens sampling grid interacts poorly with
3268//                  negative-lobed 2D filters.  However, if aliasing is much
3269//                  stronger in one direction (e.g. horizontally with a phosphor
3270//                  mask), it can be useful to downplay sample offsets along the
3271//                  other axis.  The support radius in each direction scales with
3272//                  aa_xy_axis_importance down to a minimum of 0.5 (box support),
3273//                  after which point only the offsets used for calculating
3274//                  weights continue to scale downward.  This works as follows:
3275//                  If aa_xy_axis_importance = float2(1.0, 1.0/support_radius),
3276//                  the vertical support radius will drop to 1.0, and we'll just
3277//                  filter vertical offsets with the first filter lobe, while
3278//                  horizontal offsets go through the full multi-lobe filter.
3279//                  If aa_xy_axis_importance = float2(1.0, 0.0), the vertical
3280//                  support radius will drop to box support, and the vertical
3281//                  offsets will be ignored entirely (essentially giving us a
3282//                  box filter vertically).  The former is potentially smoother
3283//                  (but less predictable) and the default behavior of Lanczos
3284//                  jinc, whereas the latter is sharper and the default behavior
3285//                  of cubics and Lanczos sinc.
3286//              5.) static const float aa_pixel_diameter: You can expand the
3287//                  pixel diameter to e.g. sqrt(2.0), which may be a better
3288//                  support range for cylindrical filters (they don't
3289//                  currently discard out-of-circle samples though).
3290//              Finally, there are two miscellaneous options:
3291//              1.) If you want to antialias a manually tiled texture, you can
3292//                  #define ANTIALIAS_DISABLE_ANISOTROPIC to use tex2Dlod() to
3293//                  fix incompatibilities with anisotropic filtering.  This is
3294//                  slower, and the Cg profile must support tex2Dlod().
3295//              2.) If aa_cubic_c is a runtime uniform, you can #define
3296//                  RUNTIME_ANTIALIAS_WEIGHTS to evaluate cubic weights once per
3297//                  fragment instead of at the usage site (which is used by
3298//                  default, because it enables static evaluation).
3299//  Description:
3300//  Each antialiased lookup follows these steps:
3301//  1.) Define a sample pattern of pixel offsets in the range of [-0.5, 0.5]
3302//      pixels, spanning the diameter of a rectangular box filter.
3303//  2.) Scale these offsets by the support diameter of the user's chosen filter.
3304//  3.) Using these pixel offsets from the pixel center, compute the offsets to
3305//      predefined subpixel locations.
3306//  4.) Compute filter weights based on subpixel offsets.
3307//  Much of that can often be done at compile-time.  At runtime:
3308//  1.) Project pixel-space offsets into uv-space with a matrix multiplication
3309//      to get the uv offsets for each sample.  Rectangular pixels have a
3310//      diameter of 1.0.  Circular pixels are not currently supported, but they
3311//      might be better with a diameter of sqrt(2.0) to ensure there are no gaps
3312//      between them.
3313//  2.) Load, weight, and sum samples.
3314//  We use a sparse bilinear sampling grid, so there are two major implications:
3315//  1.) We can directly project the pixel-space support box into uv-space even
3316//      if we're upsizing.  This wouldn't be the case for nearest neighbor,
3317//      where we'd have to expand the uv-space diameter to at least the support
3318//      size to ensure sufficient filter support.  In our case, this allows us
3319//      to treat upsizing the same as downsizing and use static weighting. :)
3320//  2.) For decent results, negative-lobed filters must be computed based on
3321//      separable weights, not radial distances, because the sparse sampling
3322//      makes no guarantees about radial distributions.  Even then, it's much
3323//      better to set aa_xy_axis_importance to e.g. float2(1.0, 0.0) to use e.g.
3324//      Lanczos2 horizontally and a box filter vertically.  This is mainly due
3325//      to the sparse N-queens sampling and a statistically enormous positive or
3326//      negative covariance between horizontal and vertical weights.
3327//
3328//  Design Decision Comments:
3329//  "aa_temporal" mirrors the sample pattern on odd frames along the axis that
3330//  keeps subpixel weights constant.  This helps with rotational invariance, but
3331//  it can cause distracting fluctuations, and horizontal and vertical edges
3332//  will look the same.  Using a different pattern on a shifted grid would
3333//  exploit temporal AA better, but it would require a dynamic branch or a lot
3334//  of conditional moves, so it's prohibitively slow for the minor benefit.
3335
3336
3337/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
3338
3339#ifndef ANTIALIAS_OVERRIDE_BASICS
3340    //  The following settings must be static constants:
3341    static const float aa_level = 12.0;
3342    static const float aa_filter = 0.0;
3343    static const bool aa_temporal = false;
3344#endif
3345
3346#ifndef ANTIALIAS_OVERRIDE_STATIC_CONSTANTS
3347    //  Users may override these parameters, but the file structure requires
3348    //  them to be static constants; see the descriptions above.
3349    static const float aa_pixel_diameter = 1.0;
3350    static const float aa_lanczos_lobes = 3.0;
3351    static const float aa_gauss_support = 1.0 / aa_pixel_diameter;
3352    static const float aa_tent_support = 1.0 / aa_pixel_diameter;
3353
3354    //  If we're using a negative-lobed filter, default to using it horizontally
3355    //  only, and use only the first lobe vertically or a box filter, over a
3356    //  correspondingly smaller range.  This compensates for the sparse sampling
3357    //  grid's typically large positive/negative x/y covariance.
3358    static const float2 aa_xy_axis_importance =
3359        aa_filter < 5.5 ? float2(1.0) :         //  Box, tent, Gaussian
3360        aa_filter < 8.5 ? float2(1.0, 0.0) :    //  Cubic and Lanczos sinc
3361        aa_filter < 9.5 ? float2(1.0, 1.0/aa_lanczos_lobes) :   //  Lanczos jinc
3362        float2(1.0);                            //  Default to box
3363#endif
3364
3365#ifndef ANTIALIAS_OVERRIDE_PARAMETERS
3366    //  Users may override these values with their own uniform or static consts.
3367    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
3368    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
3369    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
3370    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
3371    //  4.) C = 0.0 is a soft spline filter.
3372    static const float aa_cubic_c = 0.5;
3373    static const float aa_gauss_sigma = 0.5 / aa_pixel_diameter;
3374    //  Users may override the subpixel offset accessor function with their own.
3375    //  A function is used for compatibility with scalar runtime shader
3376    inline float2 get_aa_subpixel_r_offset()
3377    {
3378        return float2(0.0, 0.0);
3379    }
3380#endif
3381
3382
3383//////////////////////////////////  INCLUDES  //////////////////////////////////
3384
3385//#include "../../../../include/gamma-management.h"
3386
3387
3388//////////////////////////////////  CONSTANTS  /////////////////////////////////
3389
3390static const float aa_box_support = 0.5;
3391static const float aa_cubic_support = 2.0;
3392
3393
3394////////////////////////////  GLOBAL NON-CONSTANTS  ////////////////////////////
3395
3396//  We'll want to define these only once per fragment at most.
3397#ifdef RUNTIME_ANTIALIAS_WEIGHTS
3398    float aa_cubic_b;
3399    float cubic_branch1_x3_coeff;
3400    float cubic_branch1_x2_coeff;
3401    float cubic_branch1_x0_coeff;
3402    float cubic_branch2_x3_coeff;
3403    float cubic_branch2_x2_coeff;
3404    float cubic_branch2_x1_coeff;
3405    float cubic_branch2_x0_coeff;
3406#endif
3407
3408
3409///////////////////////////////////  HELPERS  //////////////////////////////////
3410
3411void assign_aa_cubic_constants()
3412{
3413    //  Compute cubic coefficients on demand at runtime, and save them to global
3414    //  uniforms.  The B parameter is computed from C, because "Keys cubics"
3415    //  with B = 1 - 2C are considered the highest quality.
3416    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
3417        if(aa_filter > 5.5 && aa_filter < 7.5)
3418        {
3419            aa_cubic_b = 1.0 - 2.0*aa_cubic_c;
3420            cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c;
3421            cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c;
3422            cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b;
3423            cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c;
3424            cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c;
3425            cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c;
3426            cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c;
3427        }
3428    #endif
3429}
3430
3431inline float4 get_subpixel_support_diam_and_final_axis_importance()
3432{
3433    //  Statically select the base support radius:
3434    static const float base_support_radius =
3435        aa_filter < 1.5 ? aa_box_support :
3436        aa_filter < 3.5 ? aa_tent_support :
3437        aa_filter < 5.5 ? aa_gauss_support :
3438        aa_filter < 7.5 ? aa_cubic_support :
3439        aa_filter < 9.5 ? aa_lanczos_lobes :
3440        aa_box_support; //  Default to box
3441    //  Expand the filter support for subpixel filtering.
3442    const float2 subpixel_support_radius_raw =
3443        float2(base_support_radius) + abs(get_aa_subpixel_r_offset());
3444    if(aa_filter < 1.5)
3445    {
3446        //  Ignore aa_xy_axis_importance for box filtering.
3447        const float2 subpixel_support_diam =
3448            2.0 * subpixel_support_radius_raw;
3449        const float2 final_axis_importance = float2(1.0);
3450        return float4(subpixel_support_diam, final_axis_importance);
3451    }
3452    else
3453    {
3454        //  Scale the support window by aa_xy_axis_importance, but don't narrow
3455        //  it further than box support.  This allows decent vertical AA without
3456        //  messing up horizontal weights or using something silly like Lanczos4
3457        //  horizontally with a huge vertical average over an 8-pixel radius.
3458        const float2 subpixel_support_radius = max(float2(aa_box_support, aa_box_support),
3459            subpixel_support_radius_raw * aa_xy_axis_importance);
3460        //  Adjust aa_xy_axis_importance to compensate for what's already done:
3461        const float2 final_axis_importance = aa_xy_axis_importance *
3462            subpixel_support_radius_raw/subpixel_support_radius;
3463        const float2 subpixel_support_diam = 2.0 * subpixel_support_radius;
3464        return float4(subpixel_support_diam, final_axis_importance);
3465    }
3466}
3467
3468
3469///////////////////////////  FILTER WEIGHT FUNCTIONS  //////////////////////////
3470
3471inline float eval_box_filter(const float dist)
3472{
3473    return float(abs(dist) <= aa_box_support);
3474}
3475
3476inline float eval_separable_box_filter(const float2 offset)
3477{
3478    return float(all(bool2((abs(offset.x) <= aa_box_support), (abs(offset.y) <= aa_box_support))));
3479}
3480
3481inline float eval_tent_filter(const float dist)
3482{
3483    return clamp((aa_tent_support - dist)/
3484        aa_tent_support, 0.0, 1.0);
3485}
3486
3487inline float eval_gaussian_filter(const float dist)
3488{
3489    return exp(-(dist*dist) / (2.0*aa_gauss_sigma*aa_gauss_sigma));
3490}
3491
3492inline float eval_cubic_filter(const float dist)
3493{
3494    //  Compute coefficients like assign_aa_cubic_constants(), but statically.
3495    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
3496        //  When runtime weights are used, these values are instead written to
3497        //  global uniforms at the beginning of each tex2Daa* call.
3498        const float aa_cubic_b = 1.0 - 2.0*aa_cubic_c;
3499        const float cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c;
3500        const float cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c;
3501        const float cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b;
3502        const float cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c;
3503        const float cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c;
3504        const float cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c;
3505        const float cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c;
3506    #endif
3507    const float abs_dist = abs(dist);
3508    //  Compute the cubic based on the Horner's method formula in:
3509    //  http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/Mitchell.pdf
3510    return (abs_dist < 1.0 ?
3511        (cubic_branch1_x3_coeff*abs_dist +
3512            cubic_branch1_x2_coeff)*abs_dist*abs_dist +
3513            cubic_branch1_x0_coeff :
3514        abs_dist < 2.0 ?
3515            ((cubic_branch2_x3_coeff*abs_dist +
3516                cubic_branch2_x2_coeff)*abs_dist +
3517                cubic_branch2_x1_coeff)*abs_dist + cubic_branch2_x0_coeff :
3518            0.0)/6.0;
3519}
3520
3521inline float eval_separable_cubic_filter(const float2 offset)
3522{
3523    //  This is faster than using a specific float2 version:
3524    return eval_cubic_filter(offset.x) *
3525        eval_cubic_filter(offset.y);
3526}
3527
3528inline float2 eval_sinc_filter(const float2 offset)
3529{
3530    //  It's faster to let the caller handle the zero case, or at least it
3531    //  was when I used macros and the shader preset took a full minute to load.
3532    const float2 pi_offset = pi * offset;
3533    return sin(pi_offset)/pi_offset;
3534}
3535
3536inline float eval_separable_lanczos_sinc_filter(const float2 offset_unsafe)
3537{
3538    //  Note: For sparse sampling, you really need to pick an axis to use
3539    //  Lanczos along (e.g. set aa_xy_axis_importance = float2(1.0, 0.0)).
3540    const float2 offset = FIX_ZERO(offset_unsafe);
3541    const float2 xy_weights = eval_sinc_filter(offset) *
3542        eval_sinc_filter(offset/aa_lanczos_lobes);
3543    return xy_weights.x * xy_weights.y;
3544}
3545
3546inline float eval_jinc_filter_unorm(const float x)
3547{
3548    //  This is a Jinc approximation for x in [0, 45).  We'll use x in range
3549    //  [0, 4*pi) or so.  There are faster/closer approximations based on
3550    //  piecewise cubics from [0, 45) and asymptotic approximations beyond that,
3551    //  but this has a maximum absolute error < 1/512, and it's simpler/faster
3552    //  for shaders...not that it's all that useful for sparse sampling anyway.
3553    const float point3845_x = 0.38448566093564*x;
3554    const float exp_term = exp(-(point3845_x*point3845_x));
3555    const float point8154_plus_x = 0.815362332840791 + x;
3556    const float cos_term = cos(point8154_plus_x);
3557    return (
3558        0.0264727330997042*min(x, 6.83134964622778) +
3559        0.680823557250528*exp_term +
3560        -0.0597255978950933*min(7.41043194481873, x)*cos_term /
3561            (point8154_plus_x + 0.0646074538634482*(x*x) +
3562            cos(x)*max(exp_term, cos(x) + cos_term)) -
3563        0.180837503591406);
3564}
3565
3566inline float eval_jinc_filter(const float dist)
3567{
3568    return eval_jinc_filter_unorm(pi * dist);
3569}
3570
3571inline float eval_lanczos_jinc_filter(const float dist)
3572{
3573    return eval_jinc_filter(dist) * eval_jinc_filter(dist/aa_lanczos_lobes);
3574}
3575
3576
3577inline float3 eval_unorm_rgb_weights(const float2 offset,
3578    const float2 final_axis_importance)
3579{
3580    //  Requires:   1.) final_axis_impportance must be computed according to
3581    //                  get_subpixel_support_diam_and_final_axis_importance().
3582    //              2.) aa_filter must be a global constant.
3583    //              3.) offset must be an xy pixel offset in the range:
3584    //                      ([-subpixel_support_diameter.x/2,
3585    //                      subpixel_support_diameter.x/2],
3586    //                      [-subpixel_support_diameter.y/2,
3587    //                      subpixel_support_diameter.y/2])
3588    //  Returns:    Sample weights at R/G/B destination subpixels for the
3589    //              given xy pixel offset.
3590    const float2 offset_g = offset * final_axis_importance;
3591    const float2 aa_r_offset = get_aa_subpixel_r_offset();
3592    const float2 offset_r = offset_g - aa_r_offset * final_axis_importance;
3593    const float2 offset_b = offset_g + aa_r_offset * final_axis_importance;
3594    //  Statically select a filter:
3595    if(aa_filter < 0.5)
3596    {
3597        return float3(eval_separable_box_filter(offset_r),
3598            eval_separable_box_filter(offset_g),
3599            eval_separable_box_filter(offset_b));
3600    }
3601    else if(aa_filter < 1.5)
3602    {
3603        return float3(eval_box_filter(length(offset_r)),
3604            eval_box_filter(length(offset_g)),
3605            eval_box_filter(length(offset_b)));
3606    }
3607    else if(aa_filter < 2.5)
3608    {
3609        return float3(
3610            eval_tent_filter(offset_r.x) * eval_tent_filter(offset_r.y),
3611            eval_tent_filter(offset_g.x) * eval_tent_filter(offset_g.y),
3612            eval_tent_filter(offset_b.x) * eval_tent_filter(offset_b.y));
3613    }
3614    else if(aa_filter < 3.5)
3615    {
3616        return float3(eval_tent_filter(length(offset_r)),
3617            eval_tent_filter(length(offset_g)),
3618            eval_tent_filter(length(offset_b)));
3619    }
3620    else if(aa_filter < 4.5)
3621    {
3622        return float3(
3623            eval_gaussian_filter(offset_r.x) * eval_gaussian_filter(offset_r.y),
3624            eval_gaussian_filter(offset_g.x) * eval_gaussian_filter(offset_g.y),
3625            eval_gaussian_filter(offset_b.x) * eval_gaussian_filter(offset_b.y));
3626    }
3627    else if(aa_filter < 5.5)
3628    {
3629        return float3(eval_gaussian_filter(length(offset_r)),
3630            eval_gaussian_filter(length(offset_g)),
3631            eval_gaussian_filter(length(offset_b)));
3632    }
3633    else if(aa_filter < 6.5)
3634    {
3635        return float3(
3636            eval_cubic_filter(offset_r.x) * eval_cubic_filter(offset_r.y),
3637            eval_cubic_filter(offset_g.x) * eval_cubic_filter(offset_g.y),
3638            eval_cubic_filter(offset_b.x) * eval_cubic_filter(offset_b.y));
3639    }
3640    else if(aa_filter < 7.5)
3641    {
3642        return float3(eval_cubic_filter(length(offset_r)),
3643            eval_cubic_filter(length(offset_g)),
3644            eval_cubic_filter(length(offset_b)));
3645    }
3646    else if(aa_filter < 8.5)
3647    {
3648        return float3(eval_separable_lanczos_sinc_filter(offset_r),
3649            eval_separable_lanczos_sinc_filter(offset_g),
3650            eval_separable_lanczos_sinc_filter(offset_b));
3651    }
3652    else if(aa_filter < 9.5)
3653    {
3654        return float3(eval_lanczos_jinc_filter(length(offset_r)),
3655            eval_lanczos_jinc_filter(length(offset_g)),
3656            eval_lanczos_jinc_filter(length(offset_b)));
3657    }
3658    else
3659    {
3660        //  Default to a box, because Lanczos Jinc is so bad. ;)
3661        return float3(eval_separable_box_filter(offset_r),
3662            eval_separable_box_filter(offset_g),
3663            eval_separable_box_filter(offset_b));
3664    }
3665}
3666
3667
3668//////////////////////////////  HELPER FUNCTIONS  //////////////////////////////
3669
3670inline float4 tex2Daa_tiled_linearize(const sampler2D samp, const float2 s)
3671{
3672    //  If we're manually tiling a texture, anisotropic filtering can get
3673    //  confused.  This is one workaround:
3674    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
3675        //  TODO: Use tex2Dlod_linearize with a calculated mip level.
3676        return tex2Dlod_linearize(samp, float4(s, 0.0, 0.0));
3677    #else
3678        return tex2D_linearize(samp, s);
3679    #endif
3680}
3681
3682inline float2 get_frame_sign(const float frame)
3683{
3684    if(aa_temporal)
3685    {
3686        //  Mirror the sampling pattern for odd frames in a direction that
3687        //  lets us keep the same subpixel sample weights:
3688        const float frame_odd = float(fmod(frame, 2.0) > 0.5);
3689        const float2 aa_r_offset = get_aa_subpixel_r_offset();
3690        const float2 mirror = -float2(abs(aa_r_offset.x) < (FIX_ZERO(0.0)), abs(aa_r_offset.y) < (FIX_ZERO(0.0)));
3691        return mirror;
3692    }
3693    else
3694    {
3695        return float2(1.0, 1.0);
3696    }
3697}
3698
3699
3700/////////////////////////  ANTIALIASED TEXTURE LOOKUPS  ////////////////////////
3701
3702float3 tex2Daa_subpixel_weights_only(const sampler2D tex,
3703    const float2 tex_uv, const float2x2 pixel_to_tex_uv)
3704{
3705    //  This function is unlike the others: Just perform a single independent
3706    //  lookup for each subpixel.  It may be very aliased.
3707    const float2 aa_r_offset = get_aa_subpixel_r_offset();
3708    const float2 aa_r_offset_uv_offset = mul(pixel_to_tex_uv, aa_r_offset);
3709    const float color_g = tex2D_linearize(tex, tex_uv).g;
3710    const float color_r = tex2D_linearize(tex, tex_uv + aa_r_offset_uv_offset).r;
3711    const float color_b = tex2D_linearize(tex, tex_uv - aa_r_offset_uv_offset).b;
3712    return float3(color_r, color_g, color_b);
3713}
3714
3715//  The tex2Daa* functions compile very slowly due to all the macros and
3716//  compile-time math, so only include the ones we'll actually use!
3717float3 tex2Daa4x(const sampler2D tex, const float2 tex_uv,
3718    const float2x2 pixel_to_tex_uv, const float frame)
3719{
3720    //  Use an RGMS4 pattern (4-queens):
3721    //  . . Q .  : off =(-1.5, -1.5)/4 + (2.0, 0.0)/4
3722    //  Q . . .  : off =(-1.5, -1.5)/4 + (0.0, 1.0)/4
3723    //  . . . Q  : off =(-1.5, -1.5)/4 + (3.0, 2.0)/4
3724    //  . Q . .  : off =(-1.5, -1.5)/4 + (1.0, 3.0)/4
3725    //  Static screenspace sample offsets (compute some implicitly):
3726    static const float grid_size = 4.0;
3727    assign_aa_cubic_constants();
3728    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
3729    const float2 subpixel_support_diameter = ssd_fai.xy;
3730    const float2 final_axis_importance = ssd_fai.zw;
3731    const float2 xy_step = float2(1.0,1.0)/grid_size * subpixel_support_diameter;
3732    const float2 xy_start_offset = float2(0.5 - grid_size*0.5,0.5 - grid_size*0.5) * xy_step;
3733    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
3734    const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step;
3735    const float2 xy_offset1 = xy_start_offset + float2(0.0, 1.0) * xy_step;
3736    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
3737    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
3738    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
3739    const float3 w2 = w1.bgr;
3740    const float3 w3 = w0.bgr;
3741    //  Get the weight sum to normalize the total to 1.0 later:
3742    const float3 half_sum = w0 + w1;
3743    const float3 w_sum = half_sum + half_sum.bgr;
3744    const float3 w_sum_inv = float3(1.0,1.0,1.0)/(w_sum);
3745    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
3746    const float2x2 true_pixel_to_tex_uv =
3747        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
3748    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
3749    //  diagonal symmetry:
3750    const float2 frame_sign = get_frame_sign(frame);
3751    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
3752    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
3753    //  Load samples, linearizing if necessary, etc.:
3754    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
3755    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
3756    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
3757    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
3758    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
3759    return w_sum_inv * (w0 * sample0 + w1 * sample1 +
3760        w2 * sample2 + w3 * sample3);
3761}
3762
3763float3 tex2Daa5x(const sampler2D tex, const float2 tex_uv,
3764    const float2x2 pixel_to_tex_uv, const float frame)
3765{
3766    //  Use a diagonally symmetric 5-queens pattern:
3767    //  . Q . . .  : off =(-2.0, -2.0)/5 + (1.0, 0.0)/5
3768    //  . . . . Q  : off =(-2.0, -2.0)/5 + (4.0, 1.0)/5
3769    //  . . Q . .  : off =(-2.0, -2.0)/5 + (2.0, 2.0)/5
3770    //  Q . . . .  : off =(-2.0, -2.0)/5 + (0.0, 3.0)/5
3771    //  . . . Q .  : off =(-2.0, -2.0)/5 + (3.0, 4.0)/5
3772    //  Static screenspace sample offsets (compute some implicitly):
3773    static const float grid_size = 5.0;
3774    assign_aa_cubic_constants();
3775    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
3776    const float2 subpixel_support_diameter = ssd_fai.xy;
3777    const float2 final_axis_importance = ssd_fai.zw;
3778    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
3779    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
3780    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
3781    const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step;
3782    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
3783    const float2 xy_offset2 = xy_start_offset + float2(2.0, 2.0) * xy_step;
3784    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
3785    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
3786    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
3787    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
3788    const float3 w3 = w1.bgr;
3789    const float3 w4 = w0.bgr;
3790    //  Get the weight sum to normalize the total to 1.0 later:
3791    const float3 w_sum_inv = float3(1.0)/(w0 + w1 + w2 + w3 + w4);
3792    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
3793    const float2x2 true_pixel_to_tex_uv =
3794        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
3795    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
3796    //  diagonal symmetry:
3797    const float2 frame_sign = get_frame_sign(frame);
3798    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
3799    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
3800    //  Load samples, linearizing if necessary, etc.:
3801    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
3802    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
3803    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv).rgb;
3804    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
3805    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
3806    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
3807    return w_sum_inv * (w0 * sample0 + w1 * sample1 +
3808        w2 * sample2 + w3 * sample3 + w4 * sample4);
3809}
3810
3811float3 tex2Daa6x(const sampler2D tex, const float2 tex_uv,
3812    const float2x2 pixel_to_tex_uv, const float frame)
3813{
3814    //  Use a diagonally symmetric 6-queens pattern with a stronger horizontal
3815    //  than vertical slant:
3816    //  . . . . Q .  : off =(-2.5, -2.5)/6 + (4.0, 0.0)/6
3817    //  . . Q . . .  : off =(-2.5, -2.5)/6 + (2.0, 1.0)/6
3818    //  Q . . . . .  : off =(-2.5, -2.5)/6 + (0.0, 2.0)/6
3819    //  . . . . . Q  : off =(-2.5, -2.5)/6 + (5.0, 3.0)/6
3820    //  . . . Q . .  : off =(-2.5, -2.5)/6 + (3.0, 4.0)/6
3821    //  . Q . . . .  : off =(-2.5, -2.5)/6 + (1.0, 5.0)/6
3822    //  Static screenspace sample offsets (compute some implicitly):
3823    static const float grid_size = 6.0;
3824    assign_aa_cubic_constants();
3825    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
3826    const float2 subpixel_support_diameter = ssd_fai.xy;
3827    const float2 final_axis_importance = ssd_fai.zw;
3828    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
3829    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
3830    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
3831    const float2 xy_offset0 = xy_start_offset + float2(4.0, 0.0) * xy_step;
3832    const float2 xy_offset1 = xy_start_offset + float2(2.0, 1.0) * xy_step;
3833    const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step;
3834    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
3835    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
3836    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
3837    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
3838    const float3 w3 = w2.bgr;
3839    const float3 w4 = w1.bgr;
3840    const float3 w5 = w0.bgr;
3841    //  Get the weight sum to normalize the total to 1.0 later:
3842    const float3 half_sum = w0 + w1 + w2;
3843    const float3 w_sum = half_sum + half_sum.bgr;
3844    const float3 w_sum_inv = float3(1.0)/(w_sum);
3845    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
3846    const float2x2 true_pixel_to_tex_uv =
3847        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
3848    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
3849    //  diagonal symmetry:
3850    const float2 frame_sign = get_frame_sign(frame);
3851    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
3852    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
3853    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
3854    //  Load samples, linearizing if necessary, etc.:
3855    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
3856    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
3857    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
3858    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
3859    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
3860    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
3861    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
3862    return w_sum_inv * (w0 * sample0 + w1 * sample1 + w2 * sample2 +
3863        w3 * sample3 + w4 * sample4 + w5 * sample5);
3864}
3865
3866float3 tex2Daa7x(const sampler2D tex, const float2 tex_uv,
3867    const float2x2 pixel_to_tex_uv, const float frame)
3868{
3869    //  Use a diagonally symmetric 7-queens pattern with a queen in the center:
3870    //  . Q . . . . .  : off =(-3.0, -3.0)/7 + (1.0, 0.0)/7
3871    //  . . . . Q . .  : off =(-3.0, -3.0)/7 + (4.0, 1.0)/7
3872    //  Q . . . . . .  : off =(-3.0, -3.0)/7 + (0.0, 2.0)/7
3873    //  . . . Q . . .  : off =(-3.0, -3.0)/7 + (3.0, 3.0)/7
3874    //  . . . . . . Q  : off =(-3.0, -3.0)/7 + (6.0, 4.0)/7
3875    //  . . Q . . . .  : off =(-3.0, -3.0)/7 + (2.0, 5.0)/7
3876    //  . . . . . Q .  : off =(-3.0, -3.0)/7 + (5.0, 6.0)/7
3877    static const float grid_size = 7.0;
3878    assign_aa_cubic_constants();
3879    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
3880    const float2 subpixel_support_diameter = ssd_fai.xy;
3881    const float2 final_axis_importance = ssd_fai.zw;
3882    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
3883    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
3884    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
3885    const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step;
3886    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
3887    const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step;
3888    const float2 xy_offset3 = xy_start_offset + float2(3.0, 3.0) * xy_step;
3889    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
3890    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
3891    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
3892    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
3893    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
3894    const float3 w4 = w2.bgr;
3895    const float3 w5 = w1.bgr;
3896    const float3 w6 = w0.bgr;
3897    //  Get the weight sum to normalize the total to 1.0 later:
3898    const float3 half_sum = w0 + w1 + w2;
3899    const float3 w_sum = half_sum + half_sum.bgr + w3;
3900    const float3 w_sum_inv = float3(1.0)/(w_sum);
3901    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
3902    const float2x2 true_pixel_to_tex_uv =
3903        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
3904    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
3905    //  diagonal symmetry:
3906    const float2 frame_sign = get_frame_sign(frame);
3907    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
3908    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
3909    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
3910    //  Load samples, linearizing if necessary, etc.:
3911    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
3912    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
3913    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
3914    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv).rgb;
3915    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
3916    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
3917    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
3918    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
3919    return w_sum_inv * (
3920        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
3921        w4 * sample4 + w5 * sample5 + w6 * sample6);
3922}
3923
3924float3 tex2Daa8x(const sampler2D tex, const float2 tex_uv,
3925    const float2x2 pixel_to_tex_uv, const float frame)
3926{
3927    //  Use a diagonally symmetric 8-queens pattern.
3928    //  . . Q . . . . .  : off =(-3.5, -3.5)/8 + (2.0, 0.0)/8
3929    //  . . . . Q . . .  : off =(-3.5, -3.5)/8 + (4.0, 1.0)/8
3930    //  . Q . . . . . .  : off =(-3.5, -3.5)/8 + (1.0, 2.0)/8
3931    //  . . . . . . . Q  : off =(-3.5, -3.5)/8 + (7.0, 3.0)/8
3932    //  Q . . . . . . .  : off =(-3.5, -3.5)/8 + (0.0, 4.0)/8
3933    //  . . . . . . Q .  : off =(-3.5, -3.5)/8 + (6.0, 5.0)/8
3934    //  . . . Q . . . .  : off =(-3.5, -3.5)/8 + (3.0, 6.0)/8
3935    //  . . . . . Q . .  : off =(-3.5, -3.5)/8 + (5.0, 7.0)/8
3936    static const float grid_size = 8.0;
3937    assign_aa_cubic_constants();
3938    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
3939    const float2 subpixel_support_diameter = ssd_fai.xy;
3940    const float2 final_axis_importance = ssd_fai.zw;
3941    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
3942    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
3943    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
3944    const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step;
3945    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
3946    const float2 xy_offset2 = xy_start_offset + float2(1.0, 2.0) * xy_step;
3947    const float2 xy_offset3 = xy_start_offset + float2(7.0, 3.0) * xy_step;
3948    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
3949    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
3950    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
3951    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
3952    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
3953    const float3 w4 = w3.bgr;
3954    const float3 w5 = w2.bgr;
3955    const float3 w6 = w1.bgr;
3956    const float3 w7 = w0.bgr;
3957    //  Get the weight sum to normalize the total to 1.0 later:
3958    const float3 half_sum = w0 + w1 + w2 + w3;
3959    const float3 w_sum = half_sum + half_sum.bgr;
3960    const float3 w_sum_inv = float3(1.0)/(w_sum);
3961    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
3962    const float2x2 true_pixel_to_tex_uv =
3963        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
3964    //  Get uv sample offsets, and mirror on odd frames if directed:
3965    const float2 frame_sign = get_frame_sign(frame);
3966    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
3967    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
3968    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
3969    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
3970    //  Load samples, linearizing if necessary, etc.:
3971    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
3972    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
3973    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
3974    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
3975    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
3976    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
3977    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
3978    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
3979    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
3980    return w_sum_inv * (
3981        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
3982        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7);
3983}
3984
3985float3 tex2Daa12x(const sampler2D tex, const float2 tex_uv,
3986    const float2x2 pixel_to_tex_uv, const float frame)
3987{
3988    //  Use a diagonally symmetric 12-superqueens pattern where no 3 points are
3989    //  exactly collinear.
3990    //  . . . Q . . . . . . . .  : off =(-5.5, -5.5)/12 + (3.0, 0.0)/12
3991    //  . . . . . . . . . Q . .  : off =(-5.5, -5.5)/12 + (9.0, 1.0)/12
3992    //  . . . . . . Q . . . . .  : off =(-5.5, -5.5)/12 + (6.0, 2.0)/12
3993    //  . Q . . . . . . . . . .  : off =(-5.5, -5.5)/12 + (1.0, 3.0)/12
3994    //  . . . . . . . . . . . Q  : off =(-5.5, -5.5)/12 + (11.0, 4.0)/12
3995    //  . . . . Q . . . . . . .  : off =(-5.5, -5.5)/12 + (4.0, 5.0)/12
3996    //  . . . . . . . Q . . . .  : off =(-5.5, -5.5)/12 + (7.0, 6.0)/12
3997    //  Q . . . . . . . . . . .  : off =(-5.5, -5.5)/12 + (0.0, 7.0)/12
3998    //  . . . . . . . . . . Q .  : off =(-5.5, -5.5)/12 + (10.0, 8.0)/12
3999    //  . . . . . Q . . . . . .  : off =(-5.5, -5.5)/12 + (5.0, 9.0)/12
4000    //  . . Q . . . . . . . . .  : off =(-5.5, -5.5)/12 + (2.0, 10.0)/12
4001    //  . . . . . . . . Q . . .  : off =(-5.5, -5.5)/12 + (8.0, 11.0)/12
4002    static const float grid_size = 12.0;
4003    assign_aa_cubic_constants();
4004    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
4005    const float2 subpixel_support_diameter = ssd_fai.xy;
4006    const float2 final_axis_importance = ssd_fai.zw;
4007    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
4008    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
4009    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
4010    const float2 xy_offset0 = xy_start_offset + float2(3.0, 0.0) * xy_step;
4011    const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step;
4012    const float2 xy_offset2 = xy_start_offset + float2(6.0, 2.0) * xy_step;
4013    const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step;
4014    const float2 xy_offset4 = xy_start_offset + float2(11.0, 4.0) * xy_step;
4015    const float2 xy_offset5 = xy_start_offset + float2(4.0, 5.0) * xy_step;
4016    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
4017    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
4018    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
4019    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
4020    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
4021    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
4022    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
4023    const float3 w6 = w5.bgr;
4024    const float3 w7 = w4.bgr;
4025    const float3 w8 = w3.bgr;
4026    const float3 w9 = w2.bgr;
4027    const float3 w10 = w1.bgr;
4028    const float3 w11 = w0.bgr;
4029    //  Get the weight sum to normalize the total to 1.0 later:
4030    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5;
4031    const float3 w_sum = half_sum + half_sum.bgr;
4032    const float3 w_sum_inv = float3(1.0)/w_sum;
4033    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
4034    const float2x2 true_pixel_to_tex_uv =
4035        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
4036    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
4037    //  diagonal symmetry:
4038    const float2 frame_sign = get_frame_sign(frame);
4039    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
4040    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
4041    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
4042    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
4043    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
4044    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
4045    //  Load samples, linearizing if necessary, etc.:
4046    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
4047    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
4048    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
4049    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
4050    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
4051    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
4052    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
4053    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
4054    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
4055    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
4056    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
4057    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
4058    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
4059    return w_sum_inv * (
4060        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
4061        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
4062        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11);
4063}
4064
4065float3 tex2Daa16x(const sampler2D tex, const float2 tex_uv,
4066    const float2x2 pixel_to_tex_uv, const float frame)
4067{
4068    //  Use a diagonally symmetric 16-superqueens pattern where no 3 points are
4069    //  exactly collinear.
4070    //  . . Q . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (2.0, 0.0)/16
4071    //  . . . . . . . . . Q . . . . . .  : off =(-7.5, -7.5)/16 + (9.0, 1.0)/16
4072    //  . . . . . . . . . . . . Q . . .  : off =(-7.5, -7.5)/16 + (12.0, 2.0)/16
4073    //  . . . . Q . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (4.0, 3.0)/16
4074    //  . . . . . . . . Q . . . . . . .  : off =(-7.5, -7.5)/16 + (8.0, 4.0)/16
4075    //  . . . . . . . . . . . . . . Q .  : off =(-7.5, -7.5)/16 + (14.0, 5.0)/16
4076    //  Q . . . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (0.0, 6.0)/16
4077    //  . . . . . . . . . . Q . . . . .  : off =(-7.5, -7.5)/16 + (10.0, 7.0)/16
4078    //  . . . . . Q . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (5.0, 8.0)/16
4079    //  . . . . . . . . . . . . . . . Q  : off =(-7.5, -7.5)/16 + (15.0, 9.0)/16
4080    //  . Q . . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (1.0, 10.0)/16
4081    //  . . . . . . . Q . . . . . . . .  : off =(-7.5, -7.5)/16 + (7.0, 11.0)/16
4082    //  . . . . . . . . . . . Q . . . .  : off =(-7.5, -7.5)/16 + (11.0, 12.0)/16
4083    //  . . . Q . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (3.0, 13.0)/16
4084    //  . . . . . . Q . . . . . . . . .  : off =(-7.5, -7.5)/16 + (6.0, 14.0)/16
4085    //  . . . . . . . . . . . . . Q . .  : off =(-7.5, -7.5)/16 + (13.0, 15.0)/16
4086    static const float grid_size = 16.0;
4087    assign_aa_cubic_constants();
4088    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
4089    const float2 subpixel_support_diameter = ssd_fai.xy;
4090    const float2 final_axis_importance = ssd_fai.zw;
4091    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
4092    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
4093    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
4094    const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step;
4095    const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step;
4096    const float2 xy_offset2 = xy_start_offset + float2(12.0, 2.0) * xy_step;
4097    const float2 xy_offset3 = xy_start_offset + float2(4.0, 3.0) * xy_step;
4098    const float2 xy_offset4 = xy_start_offset + float2(8.0, 4.0) * xy_step;
4099    const float2 xy_offset5 = xy_start_offset + float2(14.0, 5.0) * xy_step;
4100    const float2 xy_offset6 = xy_start_offset + float2(0.0, 6.0) * xy_step;
4101    const float2 xy_offset7 = xy_start_offset + float2(10.0, 7.0) * xy_step;
4102    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
4103    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
4104    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
4105    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
4106    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
4107    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
4108    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
4109    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
4110    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
4111    const float3 w8 = w7.bgr;
4112    const float3 w9 = w6.bgr;
4113    const float3 w10 = w5.bgr;
4114    const float3 w11 = w4.bgr;
4115    const float3 w12 = w3.bgr;
4116    const float3 w13 = w2.bgr;
4117    const float3 w14 = w1.bgr;
4118    const float3 w15 = w0.bgr;
4119    //  Get the weight sum to normalize the total to 1.0 later:
4120    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7;
4121    const float3 w_sum = half_sum + half_sum.bgr;
4122    const float3 w_sum_inv = float3(1.0)/(w_sum);
4123    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
4124    const float2x2 true_pixel_to_tex_uv =
4125        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
4126    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
4127    //  diagonal symmetry:
4128    const float2 frame_sign = get_frame_sign(frame);
4129    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
4130    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
4131    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
4132    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
4133    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
4134    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
4135    const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign);
4136    const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign);
4137    //  Load samples, linearizing if necessary, etc.:
4138    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
4139    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
4140    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
4141    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
4142    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
4143    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
4144    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
4145    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
4146    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
4147    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
4148    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
4149    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
4150    const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
4151    const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
4152    const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
4153    const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
4154    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
4155    return w_sum_inv * (
4156        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
4157        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
4158        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
4159        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15);
4160}
4161
4162float3 tex2Daa20x(const sampler2D tex, const float2 tex_uv,
4163    const float2x2 pixel_to_tex_uv, const float frame)
4164{
4165    //  Use a diagonally symmetric 20-superqueens pattern where no 3 points are
4166    //  exactly collinear and superqueens have a squared attack radius of 13.
4167    //  . . . . . . . Q . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (7.0, 0.0)/20
4168    //  . . . . . . . . . . . . . . . . Q . . .  : off =(-9.5, -9.5)/20 + (16.0, 1.0)/20
4169    //  . . . . . . . . . . . Q . . . . . . . .  : off =(-9.5, -9.5)/20 + (11.0, 2.0)/20
4170    //  . Q . . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (1.0, 3.0)/20
4171    //  . . . . . Q . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (5.0, 4.0)/20
4172    //  . . . . . . . . . . . . . . . Q . . . .  : off =(-9.5, -9.5)/20 + (15.0, 5.0)/20
4173    //  . . . . . . . . . . Q . . . . . . . . .  : off =(-9.5, -9.5)/20 + (10.0, 6.0)/20
4174    //  . . . . . . . . . . . . . . . . . . . Q  : off =(-9.5, -9.5)/20 + (19.0, 7.0)/20
4175    //  . . Q . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (2.0, 8.0)/20
4176    //  . . . . . . Q . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (6.0, 9.0)/20
4177    //  . . . . . . . . . . . . . Q . . . . . .  : off =(-9.5, -9.5)/20 + (13.0, 10.0)/20
4178    //  . . . . . . . . . . . . . . . . . Q . .  : off =(-9.5, -9.5)/20 + (17.0, 11.0)/20
4179    //  Q . . . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (0.0, 12.0)/20
4180    //  . . . . . . . . . Q . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (9.0, 13.0)/20
4181    //  . . . . Q . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (4.0, 14.0)/20
4182    //  . . . . . . . . . . . . . . Q . . . . .  : off =(-9.5, -9.5)/20 + (14.0, 15.0)/20
4183    //  . . . . . . . . . . . . . . . . . . Q .  : off =(-9.5, -9.5)/20 + (18.0, 16.0)/20
4184    //  . . . . . . . . Q . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (8.0, 17.0)/20
4185    //  . . . Q . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (3.0, 18.0)/20
4186    //  . . . . . . . . . . . . Q . . . . . . .  : off =(-9.5, -9.5)/20 + (12.0, 19.0)/20
4187    static const float grid_size = 20.0;
4188    assign_aa_cubic_constants();
4189    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
4190    const float2 subpixel_support_diameter = ssd_fai.xy;
4191    const float2 final_axis_importance = ssd_fai.zw;
4192    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
4193    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
4194    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
4195    const float2 xy_offset0 = xy_start_offset + float2(7.0, 0.0) * xy_step;
4196    const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step;
4197    const float2 xy_offset2 = xy_start_offset + float2(11.0, 2.0) * xy_step;
4198    const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step;
4199    const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step;
4200    const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) * xy_step;
4201    const float2 xy_offset6 = xy_start_offset + float2(10.0, 6.0) * xy_step;
4202    const float2 xy_offset7 = xy_start_offset + float2(19.0, 7.0) * xy_step;
4203    const float2 xy_offset8 = xy_start_offset + float2(2.0, 8.0) * xy_step;
4204    const float2 xy_offset9 = xy_start_offset + float2(6.0, 9.0) * xy_step;
4205    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
4206    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
4207    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
4208    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
4209    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
4210    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
4211    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
4212    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
4213    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
4214    const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance);
4215    const float3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance);
4216    const float3 w10 = w9.bgr;
4217    const float3 w11 = w8.bgr;
4218    const float3 w12 = w7.bgr;
4219    const float3 w13 = w6.bgr;
4220    const float3 w14 = w5.bgr;
4221    const float3 w15 = w4.bgr;
4222    const float3 w16 = w3.bgr;
4223    const float3 w17 = w2.bgr;
4224    const float3 w18 = w1.bgr;
4225    const float3 w19 = w0.bgr;
4226    //  Get the weight sum to normalize the total to 1.0 later:
4227    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9;
4228    const float3 w_sum = half_sum + half_sum.bgr;
4229    const float3 w_sum_inv = float3(1.0)/(w_sum);
4230    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
4231    const float2x2 true_pixel_to_tex_uv =
4232        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
4233    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
4234    //  diagonal symmetry:
4235    const float2 frame_sign = get_frame_sign(frame);
4236    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
4237    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
4238    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
4239    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
4240    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
4241    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
4242    const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign);
4243    const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign);
4244    const float2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign);
4245    const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign);
4246    //  Load samples, linearizing if necessary, etc.:
4247    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
4248    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
4249    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
4250    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
4251    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
4252    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
4253    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
4254    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
4255    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb;
4256    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb;
4257    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb;
4258    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb;
4259    const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
4260    const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
4261    const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
4262    const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
4263    const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
4264    const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
4265    const float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
4266    const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
4267    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
4268    return w_sum_inv * (
4269        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
4270        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
4271        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
4272        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 +
4273        w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19);
4274}
4275
4276float3 tex2Daa24x(const sampler2D tex, const float2 tex_uv,
4277    const float2x2 pixel_to_tex_uv, const float frame)
4278{
4279    //  Use a diagonally symmetric 24-superqueens pattern where no 3 points are
4280    //  exactly collinear and superqueens have a squared attack radius of 13.
4281    //  . . . . . . Q . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (6.0, 0.0)/24
4282    //  . . . . . . . . . . . . . . . . Q . . . . . . .  : off =(-11.5, -11.5)/24 + (16.0, 1.0)/24
4283    //  . . . . . . . . . . Q . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (10.0, 2.0)/24
4284    //  . . . . . . . . . . . . . . . . . . . . . Q . .  : off =(-11.5, -11.5)/24 + (21.0, 3.0)/24
4285    //  . . . . . Q . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (5.0, 4.0)/24
4286    //  . . . . . . . . . . . . . . . Q . . . . . . . .  : off =(-11.5, -11.5)/24 + (15.0, 5.0)/24
4287    //  . Q . . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (1.0, 6.0)/24
4288    //  . . . . . . . . . . . Q . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (11.0, 7.0)/24
4289    //  . . . . . . . . . . . . . . . . . . . Q . . . .  : off =(-11.5, -11.5)/24 + (19.0, 8.0)/24
4290    //  . . . . . . . . . . . . . . . . . . . . . . . Q  : off =(-11.5, -11.5)/24 + (23.0, 9.0)/24
4291    //  . . . Q . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (3.0, 10.0)/24
4292    //  . . . . . . . . . . . . . . Q . . . . . . . . .  : off =(-11.5, -11.5)/24 + (14.0, 11.0)/24
4293    //  . . . . . . . . . Q . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (9.0, 12.0)/24
4294    //  . . . . . . . . . . . . . . . . . . . . Q . . .  : off =(-11.5, -11.5)/24 + (20.0, 13.0)/24
4295    //  Q . . . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (0.0, 14.0)/24
4296    //  . . . . Q . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (4.0, 15.0)/24
4297    //  . . . . . . . . . . . . Q . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (12.0, 16.0)/24
4298    //  . . . . . . . . . . . . . . . . . . . . . . Q .  : off =(-11.5, -11.5)/24 + (22.0, 17.0)/24
4299    //  . . . . . . . . Q . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (8.0, 18.0)/24
4300    //  . . . . . . . . . . . . . . . . . . Q . . . . .  : off =(-11.5, -11.5)/24 + (18.0, 19.0)/24
4301    //  . . Q . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (2.0, 20.0)/24
4302    //  . . . . . . . . . . . . . Q . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (13.0, 21.0)/24
4303    //  . . . . . . . Q . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (7.0, 22.0)/24
4304    //  . . . . . . . . . . . . . . . . . Q . . . . . .  : off =(-11.5, -11.5)/24 + (17.0, 23.0)/24
4305    static const float grid_size = 24.0;
4306    assign_aa_cubic_constants();
4307    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
4308    const float2 subpixel_support_diameter = ssd_fai.xy;
4309    const float2 final_axis_importance = ssd_fai.zw;
4310    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
4311    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
4312    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
4313    const float2 xy_offset0 = xy_start_offset + float2(6.0, 0.0) * xy_step;
4314    const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step;
4315    const float2 xy_offset2 = xy_start_offset + float2(10.0, 2.0) * xy_step;
4316    const float2 xy_offset3 = xy_start_offset + float2(21.0, 3.0) * xy_step;
4317    const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step;
4318    const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) * xy_step;
4319    const float2 xy_offset6 = xy_start_offset + float2(1.0, 6.0) * xy_step;
4320    const float2 xy_offset7 = xy_start_offset + float2(11.0, 7.0) * xy_step;
4321    const float2 xy_offset8 = xy_start_offset + float2(19.0, 8.0) * xy_step;
4322    const float2 xy_offset9 = xy_start_offset + float2(23.0, 9.0) * xy_step;
4323    const float2 xy_offset10 = xy_start_offset + float2(3.0, 10.0) * xy_step;
4324    const float2 xy_offset11 = xy_start_offset + float2(14.0, 11.0) * xy_step;
4325    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
4326    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
4327    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
4328    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
4329    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
4330    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
4331    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
4332    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
4333    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
4334    const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance);
4335    const float3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance);
4336    const float3 w10 = eval_unorm_rgb_weights(xy_offset10, final_axis_importance);
4337    const float3 w11 = eval_unorm_rgb_weights(xy_offset11, final_axis_importance);
4338    const float3 w12 = w11.bgr;
4339    const float3 w13 = w10.bgr;
4340    const float3 w14 = w9.bgr;
4341    const float3 w15 = w8.bgr;
4342    const float3 w16 = w7.bgr;
4343    const float3 w17 = w6.bgr;
4344    const float3 w18 = w5.bgr;
4345    const float3 w19 = w4.bgr;
4346    const float3 w20 = w3.bgr;
4347    const float3 w21 = w2.bgr;
4348    const float3 w22 = w1.bgr;
4349    const float3 w23 = w0.bgr;
4350    //  Get the weight sum to normalize the total to 1.0 later:
4351    const float3 half_sum = w0 + w1 + w2 + w3 + w4 +
4352        w5 + w6 + w7 + w8 + w9 + w10 + w11;
4353    const float3 w_sum = half_sum + half_sum.bgr;
4354    const float3 w_sum_inv = float3(1.0)/(w_sum);
4355    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
4356    const float2x2 true_pixel_to_tex_uv =
4357        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
4358    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
4359    //  diagonal symmetry:
4360    const float2 frame_sign = get_frame_sign(frame);
4361    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
4362    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
4363    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
4364    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
4365    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
4366    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
4367    const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign);
4368    const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign);
4369    const float2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign);
4370    const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign);
4371    const float2 uv_offset10 = mul(true_pixel_to_tex_uv, xy_offset10 * frame_sign);
4372    const float2 uv_offset11 = mul(true_pixel_to_tex_uv, xy_offset11 * frame_sign);
4373    //  Load samples, linearizing if necessary, etc.:
4374    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
4375    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
4376    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
4377    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
4378    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
4379    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
4380    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
4381    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
4382    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb;
4383    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb;
4384    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset10).rgb;
4385    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset11).rgb;
4386    const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset11).rgb;
4387    const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset10).rgb;
4388    const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb;
4389    const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb;
4390    const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
4391    const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
4392    const float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
4393    const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
4394    const float3 sample20 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
4395    const float3 sample21 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
4396    const float3 sample22 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
4397    const float3 sample23 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
4398    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
4399    return w_sum_inv * (
4400        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
4401        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
4402        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
4403        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 +
4404        w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19 +
4405        w20 * sample20 + w21 * sample21 + w22 * sample22 + w23 * sample23);
4406}
4407
4408float3 tex2Daa_debug_16x_regular(const sampler2D tex, const float2 tex_uv,
4409    const float2x2 pixel_to_tex_uv, const float frame)
4410{
4411    //  Sample on a regular 4x4 grid.  This is mainly for testing.
4412    static const float grid_size = 4.0;
4413    assign_aa_cubic_constants();
4414    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
4415    const float2 subpixel_support_diameter = ssd_fai.xy;
4416    const float2 final_axis_importance = ssd_fai.zw;
4417    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
4418    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
4419    //  Get the xy offset of each sample:
4420    const float2 xy_offset0 = xy_start_offset + float2(0.0, 0.0) * xy_step;
4421    const float2 xy_offset1 = xy_start_offset + float2(1.0, 0.0) * xy_step;
4422    const float2 xy_offset2 = xy_start_offset + float2(2.0, 0.0) * xy_step;
4423    const float2 xy_offset3 = xy_start_offset + float2(3.0, 0.0) * xy_step;
4424    const float2 xy_offset4 = xy_start_offset + float2(0.0, 1.0) * xy_step;
4425    const float2 xy_offset5 = xy_start_offset + float2(1.0, 1.0) * xy_step;
4426    const float2 xy_offset6 = xy_start_offset + float2(2.0, 1.0) * xy_step;
4427    const float2 xy_offset7 = xy_start_offset + float2(3.0, 1.0) * xy_step;
4428    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
4429    //  (We can't exploit vertical or horizontal symmetry due to uncertain
4430    //  subpixel offsets.  We could fix that by rotating xy offsets with the
4431    //  subpixel structure, but...no.)
4432    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
4433    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
4434    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
4435    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
4436    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
4437    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
4438    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
4439    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
4440    const float3 w8 = w7.bgr;
4441    const float3 w9 = w6.bgr;
4442    const float3 w10 = w5.bgr;
4443    const float3 w11 = w4.bgr;
4444    const float3 w12 = w3.bgr;
4445    const float3 w13 = w2.bgr;
4446    const float3 w14 = w1.bgr;
4447    const float3 w15 = w0.bgr;
4448    //  Get the weight sum to normalize the total to 1.0 later:
4449    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7;
4450    const float3 w_sum = half_sum + half_sum.bgr;
4451    const float3 w_sum_inv = float3(1.0)/(w_sum);
4452    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
4453    const float2x2 true_pixel_to_tex_uv =
4454        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
4455    //  Get uv sample offsets, taking advantage of row alignment:
4456    const float2 uv_step_x = mul(true_pixel_to_tex_uv, float2(xy_step.x, 0.0));
4457    const float2 uv_step_y = mul(true_pixel_to_tex_uv, float2(0.0, xy_step.y));
4458    const float2 uv_offset0 = -1.5 * (uv_step_x + uv_step_y);
4459    const float2 sample0_uv = tex_uv + uv_offset0;
4460    const float2 sample4_uv = sample0_uv + uv_step_y;
4461    const float2 sample8_uv = sample0_uv + uv_step_y * 2.0;
4462    const float2 sample12_uv = sample0_uv + uv_step_y * 3.0;
4463    //  Load samples, linearizing if necessary, etc.:
4464    const float3 sample0 = tex2Daa_tiled_linearize(tex, sample0_uv).rgb;
4465    const float3 sample1 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x).rgb;
4466    const float3 sample2 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 2.0).rgb;
4467    const float3 sample3 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 3.0).rgb;
4468    const float3 sample4 = tex2Daa_tiled_linearize(tex, sample4_uv).rgb;
4469    const float3 sample5 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x).rgb;
4470    const float3 sample6 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 2.0).rgb;
4471    const float3 sample7 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 3.0).rgb;
4472    const float3 sample8 = tex2Daa_tiled_linearize(tex, sample8_uv).rgb;
4473    const float3 sample9 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x).rgb;
4474    const float3 sample10 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 2.0).rgb;
4475    const float3 sample11 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 3.0).rgb;
4476    const float3 sample12 = tex2Daa_tiled_linearize(tex, sample12_uv).rgb;
4477    const float3 sample13 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x).rgb;
4478    const float3 sample14 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 2.0).rgb;
4479    const float3 sample15 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 3.0).rgb;
4480    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
4481    return w_sum_inv * (
4482        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
4483        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
4484        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
4485        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15);
4486}
4487
4488float3 tex2Daa_debug_dynamic(const sampler2D tex, const float2 tex_uv,
4489    const float2x2 pixel_to_tex_uv, const float frame)
4490{
4491    //  This function is for testing only: Use an NxN grid with dynamic weights.
4492    static const int grid_size = 8;
4493    assign_aa_cubic_constants();
4494    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
4495    const float2 subpixel_support_diameter = ssd_fai.xy;
4496    const float2 final_axis_importance = ssd_fai.zw;
4497    const float grid_radius_in_samples = (float(grid_size) - 1.0)/2.0;
4498    const float2 filter_space_offset_step =
4499        subpixel_support_diameter/float2(grid_size);
4500    const float2 sample0_filter_space_offset =
4501        -grid_radius_in_samples * filter_space_offset_step;
4502    //  Compute xy sample offsets and subpixel weights:
4503    float3 weights[64]; //originally grid_size * grid_size
4504    float3 weight_sum = float3(0.0, 0.0, 0.0);
4505    for(int i = 0; i < grid_size; ++i)
4506    {
4507        for(int j = 0; j < grid_size; ++j)
4508        {
4509            //  Weights based on xy distances:
4510            const float2 offset = sample0_filter_space_offset +
4511                float2(j, i) * filter_space_offset_step;
4512            const float3 weight = eval_unorm_rgb_weights(offset, final_axis_importance);
4513            weights[i*grid_size + j] = weight;
4514            weight_sum += weight;
4515        }
4516    }
4517    //  Get uv offset vectors along x and y directions:
4518    const float2x2 true_pixel_to_tex_uv =
4519        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
4520    const float2 uv_offset_step_x = mul(true_pixel_to_tex_uv,
4521        float2(filter_space_offset_step.x, 0.0));
4522    const float2 uv_offset_step_y = mul(true_pixel_to_tex_uv,
4523        float2(0.0, filter_space_offset_step.y));
4524    //  Get a starting sample location:
4525    const float2 sample0_uv_offset = -grid_radius_in_samples *
4526        (uv_offset_step_x + uv_offset_step_y);
4527    const float2 sample0_uv = tex_uv + sample0_uv_offset;
4528    //  Load, weight, and sum [linearized] samples:
4529    float3 sum = float3(0.0, 0.0, 0.0);
4530    const float3 weight_sum_inv = float3(1.0)/weight_sum;
4531    for(int i = 0; i < grid_size; ++i)
4532    {
4533        const float2 row_i_first_sample_uv =
4534            sample0_uv + i * uv_offset_step_y;
4535        for(int j = 0; j < grid_size; ++j)
4536        {
4537            const float2 sample_uv =
4538                row_i_first_sample_uv + j * uv_offset_step_x;
4539            sum += weights[i*grid_size + j] *
4540                tex2Daa_tiled_linearize(tex, sample_uv).rgb;
4541        }
4542    }
4543    return sum * weight_sum_inv;
4544}
4545
4546
4547///////////////////////  ANTIALIASING CODEPATH SELECTION  //////////////////////
4548
4549inline float3 tex2Daa(const sampler2D tex, const float2 tex_uv,
4550    const float2x2 pixel_to_tex_uv, const float frame)
4551{
4552//#define DEBUG
4553#ifdef DEBUG
4554	return tex2Daa_subpixel_weights_only(
4555            tex, tex_uv, pixel_to_tex_uv);
4556#else
4557	//  Statically switch between antialiasing modes/levels:
4558    return (aa_level < 0.5) ? tex2D_linearize(tex, tex_uv).rgb :
4559        (aa_level < 3.5) ? tex2Daa_subpixel_weights_only(
4560            tex, tex_uv, pixel_to_tex_uv) :
4561        (aa_level < 4.5) ? tex2Daa4x(tex, tex_uv, pixel_to_tex_uv, frame) :
4562        (aa_level < 5.5) ? tex2Daa5x(tex, tex_uv, pixel_to_tex_uv, frame) :
4563        (aa_level < 6.5) ? tex2Daa6x(tex, tex_uv, pixel_to_tex_uv, frame) :
4564        (aa_level < 7.5) ? tex2Daa7x(tex, tex_uv, pixel_to_tex_uv, frame) :
4565        (aa_level < 11.5) ? tex2Daa8x(tex, tex_uv, pixel_to_tex_uv, frame) :
4566        (aa_level < 15.5) ? tex2Daa12x(tex, tex_uv, pixel_to_tex_uv, frame) :
4567        (aa_level < 19.5) ? tex2Daa16x(tex, tex_uv, pixel_to_tex_uv, frame) :
4568        (aa_level < 23.5) ? tex2Daa20x(tex, tex_uv, pixel_to_tex_uv, frame) :
4569        (aa_level < 253.5) ? tex2Daa24x(tex, tex_uv, pixel_to_tex_uv, frame) :
4570        (aa_level < 254.5) ? tex2Daa_debug_16x_regular(
4571            tex, tex_uv, pixel_to_tex_uv, frame) :
4572        tex2Daa_debug_dynamic(tex, tex_uv, pixel_to_tex_uv, frame);
4573#endif
4574}
4575
4576
4577#endif  //  TEX2DANTIALIAS_H
4578
4579/////////////////////////  END TEX2DANTIALIAS  /////////////////////////
4580
4581//#include "geometry-functions.h"
4582
4583/////////////////////////  BEGIN GEOMETRY-FUNCTIONS  /////////////////////////
4584
4585#ifndef GEOMETRY_FUNCTIONS_H
4586#define GEOMETRY_FUNCTIONS_H
4587
4588/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
4589
4590//  crt-royale: A full-featured CRT shader, with cheese.
4591//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
4592//
4593//  This program is free software; you can redistribute it and/or modify it
4594//  under the terms of the GNU General Public License as published by the Free
4595//  Software Foundation; either version 2 of the License, or any later version.
4596//
4597//  This program is distributed in the hope that it will be useful, but WITHOUT
4598//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
4599//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
4600//  more details.
4601//
4602//  You should have received a copy of the GNU General Public License along with
4603//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
4604//  Place, Suite 330, Boston, MA 02111-1307 USA
4605
4606
4607//////////////////////////////////  INCLUDES  //////////////////////////////////
4608
4609// already included elsewhere
4610//#include "../user-settings.h"
4611//#include "derived-settings-and-constants.h"
4612//#include "bind-shader-h"
4613
4614
4615////////////////////////////  MACROS AND CONSTANTS  ////////////////////////////
4616
4617//  Curvature-related constants:
4618#define MAX_POINT_CLOUD_SIZE 9
4619
4620
4621/////////////////////////////  CURVATURE FUNCTIONS /////////////////////////////
4622
4623float2 quadratic_solve(const float a, const float b_over_2, const float c)
4624{
4625    //  Requires:   1.) a, b, and c are quadratic formula coefficients
4626    //              2.) b_over_2 = b/2.0 (simplifies terms to factor 2 out)
4627    //              3.) b_over_2 must be guaranteed < 0.0 (avoids a branch)
4628    //  Returns:    Returns float2(first_solution, discriminant), so the caller
4629    //              can choose how to handle the "no intersection" case.  The
4630    //              Kahan or Citardauq formula is used for numerical robustness.
4631    const float discriminant = b_over_2*b_over_2 - a*c;
4632    const float solution0 = c/(-b_over_2 + sqrt(discriminant));
4633    return float2(solution0, discriminant);
4634}
4635
4636float2 intersect_sphere(const float3 view_vec, const float3 eye_pos_vec)
4637{
4638    //  Requires:   1.) view_vec and eye_pos_vec are 3D vectors in the sphere's
4639    //                  local coordinate frame (eye_pos_vec is a position, i.e.
4640    //                  a vector from the origin to the eye/camera)
4641    //              2.) geom_radius is a global containing the sphere's radius
4642    //  Returns:    Cast a ray of direction view_vec from eye_pos_vec at a
4643    //              sphere of radius geom_radius, and return the distance to
4644    //              the first intersection in units of length(view_vec).
4645    //              http://wiki.cgsociety.org/index.php/Ray_Sphere_Intersection
4646    //  Quadratic formula coefficients (b_over_2 is guaranteed negative):
4647    const float a = dot(view_vec, view_vec);
4648    const float b_over_2 = dot(view_vec, eye_pos_vec);  //  * 2.0 factored out
4649    const float c = dot(eye_pos_vec, eye_pos_vec) - geom_radius*geom_radius;
4650    return quadratic_solve(a, b_over_2, c);
4651}
4652
4653float2 intersect_cylinder(const float3 view_vec, const float3 eye_pos_vec)
4654{
4655    //  Requires:   1.) view_vec and eye_pos_vec are 3D vectors in the sphere's
4656    //                  local coordinate frame (eye_pos_vec is a position, i.e.
4657    //                  a vector from the origin to the eye/camera)
4658    //              2.) geom_radius is a global containing the cylinder's radius
4659    //  Returns:    Cast a ray of direction view_vec from eye_pos_vec at a
4660    //              cylinder of radius geom_radius, and return the distance to
4661    //              the first intersection in units of length(view_vec).  The
4662    //              derivation of the coefficients is in Christer Ericson's
4663    //              Real-Time Collision Detection, p. 195-196, and this version
4664    //              uses LaGrange's identity to reduce operations.
4665    //  Arbitrary "cylinder top" reference point for an infinite cylinder:
4666    const float3 cylinder_top_vec = float3(0.0, geom_radius, 0.0);
4667    const float3 cylinder_axis_vec = float3(0.0, 1.0, 0.0);//float3(0.0, 2.0*geom_radius, 0.0);
4668    const float3 top_to_eye_vec = eye_pos_vec - cylinder_top_vec;
4669    const float3 axis_x_view = cross(cylinder_axis_vec, view_vec);
4670    const float3 axis_x_top_to_eye = cross(cylinder_axis_vec, top_to_eye_vec);
4671    //  Quadratic formula coefficients (b_over_2 is guaranteed negative):
4672    const float a = dot(axis_x_view, axis_x_view);
4673    const float b_over_2 = dot(axis_x_top_to_eye, axis_x_view);
4674    const float c = dot(axis_x_top_to_eye, axis_x_top_to_eye) -
4675        geom_radius*geom_radius;//*dot(cylinder_axis_vec, cylinder_axis_vec);
4676    return quadratic_solve(a, b_over_2, c);
4677}
4678
4679float2 cylinder_xyz_to_uv(const float3 intersection_pos_local,
4680    const float2 geom_aspect)
4681{
4682    //  Requires:   An xyz intersection position on a cylinder.
4683    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
4684    //  Mapping:    Define square_uv.x to be the signed arc length in xz-space,
4685    //              and define square_uv.y = -intersection_pos_local.y (+v = -y).
4686    //  Start with a numerically robust arc length calculation.
4687    const float angle_from_image_center = atan2(intersection_pos_local.x,
4688        intersection_pos_local.z);
4689    const float signed_arc_len = angle_from_image_center * geom_radius;
4690    //  Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide
4691    //  by the aspect ratio to stretch the mapping appropriately:
4692    const float2 square_uv = float2(signed_arc_len, -intersection_pos_local.y);
4693    const float2 video_uv = square_uv / geom_aspect;
4694    return video_uv;
4695}
4696
4697float3 cylinder_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
4698{
4699    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
4700    //  Returns:    An xyz intersection position on a cylinder.  This is the
4701    //              inverse of cylinder_xyz_to_uv().
4702    //  Expand video_uv by the aspect ratio to get proportionate x/y lengths,
4703    //  then calculate an xyz position for the cylindrical mapping above.
4704    const float2 square_uv = video_uv * geom_aspect;
4705    const float arc_len = square_uv.x;
4706    const float angle_from_image_center = arc_len / geom_radius;
4707    const float x_pos = sin(angle_from_image_center) * geom_radius;
4708    const float z_pos = cos(angle_from_image_center) * geom_radius;
4709    //  Or: z = sqrt(geom_radius**2 - x**2)
4710    //  Or: z = geom_radius/sqrt(1.0 + tan(angle)**2), x = z * tan(angle)
4711    const float3 intersection_pos_local = float3(x_pos, -square_uv.y, z_pos);
4712    return intersection_pos_local;
4713}
4714
4715float2 sphere_xyz_to_uv(const float3 intersection_pos_local,
4716    const float2 geom_aspect)
4717{
4718    //  Requires:   An xyz intersection position on a sphere.
4719    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
4720    //  Mapping:    First define square_uv.x/square_uv.y ==
4721    //              intersection_pos_local.x/intersection_pos_local.y.  Then,
4722    //              length(square_uv) is the arc length from the image center
4723    //              at (0.0, 0.0, geom_radius) along the tangent great circle.
4724    //              Credit for this mapping goes to cgwg: I never managed to
4725    //              understand his code, but he told me his mapping was based on
4726    //              great circle distances when I asked him about it, which
4727    //              informed this very similar (almost identical) mapping.
4728    //  Start with a numerically robust arc length calculation between the ray-
4729    //  sphere intersection point and the image center using a method posted by
4730    //  Roger Stafford on comp.soft-sys.matlab:
4731    //  https://groups.google.com/d/msg/comp.soft-sys.matlab/zNbUui3bjcA/c0HV_bHSx9cJ
4732    const float3 image_center_pos_local = float3(0.0, 0.0, geom_radius);
4733    const float cp_len =
4734        length(cross(intersection_pos_local, image_center_pos_local));
4735    const float dp = dot(intersection_pos_local, image_center_pos_local);
4736    const float angle_from_image_center = atan2(cp_len, dp);
4737    const float arc_len = angle_from_image_center * geom_radius;
4738    //  Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide
4739    //  by the aspect ratio to stretch the mapping appropriately:
4740    const float2 square_uv_unit = normalize(float2(intersection_pos_local.x,
4741        -intersection_pos_local.y));
4742    const float2 square_uv = arc_len * square_uv_unit;
4743    const float2 video_uv = square_uv / geom_aspect;
4744    return video_uv;
4745}
4746
4747float3 sphere_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
4748{
4749    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
4750    //  Returns:    An xyz intersection position on a sphere.  This is the
4751    //              inverse of sphere_xyz_to_uv().
4752    //  Expand video_uv by the aspect ratio to get proportionate x/y lengths,
4753    //  then calculate an xyz position for the spherical mapping above.
4754    const float2 square_uv = video_uv * geom_aspect;
4755    //  Using length or sqrt here butchers the framerate on my 8800GTS if
4756    //  this function is called too many times, and so does taking the max
4757    //  component of square_uv/square_uv_unit (program length threshold?).
4758    //float arc_len = length(square_uv);
4759    const float2 square_uv_unit = normalize(square_uv);
4760    const float arc_len = square_uv.y/square_uv_unit.y;
4761    const float angle_from_image_center = arc_len / geom_radius;
4762    const float xy_dist_from_sphere_center =
4763        sin(angle_from_image_center) * geom_radius;
4764    //float2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len));
4765    const float2 xy_pos = xy_dist_from_sphere_center * square_uv_unit;
4766    const float z_pos = cos(angle_from_image_center) * geom_radius;
4767    const float3 intersection_pos_local = float3(xy_pos.x, -xy_pos.y, z_pos);
4768    return intersection_pos_local;
4769}
4770
4771float2 sphere_alt_xyz_to_uv(const float3 intersection_pos_local,
4772    const float2 geom_aspect)
4773{
4774    //  Requires:   An xyz intersection position on a cylinder.
4775    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
4776    //  Mapping:    Define square_uv.x to be the signed arc length in xz-space,
4777    //              and define square_uv.y == signed arc length in yz-space.
4778    //  See cylinder_xyz_to_uv() for implementation details (very similar).
4779    const float2 angle_from_image_center = atan2(
4780        float2(intersection_pos_local.x, -intersection_pos_local.y),
4781        intersection_pos_local.zz);
4782    const float2 signed_arc_len = angle_from_image_center * geom_radius;
4783    const float2 video_uv = signed_arc_len / geom_aspect;
4784    return video_uv;
4785}
4786
4787float3 sphere_alt_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
4788{
4789    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
4790    //  Returns:    An xyz intersection position on a sphere.  This is the
4791    //              inverse of sphere_alt_xyz_to_uv().
4792    //  See cylinder_uv_to_xyz() for implementation details (very similar).
4793    const float2 square_uv = video_uv * geom_aspect;
4794    const float2 arc_len = square_uv;
4795    const float2 angle_from_image_center = arc_len / geom_radius;
4796    const float2 xy_pos = sin(angle_from_image_center) * geom_radius;
4797    const float z_pos = sqrt(geom_radius*geom_radius - dot(xy_pos, xy_pos));
4798    return float3(xy_pos.x, -xy_pos.y, z_pos);
4799}
4800
4801inline float2 intersect(const float3 view_vec_local, const float3 eye_pos_local,
4802    const float geom_mode)
4803{
4804    return geom_mode < 2.5 ? intersect_sphere(view_vec_local, eye_pos_local) :
4805        intersect_cylinder(view_vec_local, eye_pos_local);
4806}
4807
4808inline float2 xyz_to_uv(const float3 intersection_pos_local,
4809    const float2 geom_aspect, const float geom_mode)
4810{
4811    return geom_mode < 1.5 ?
4812            sphere_xyz_to_uv(intersection_pos_local, geom_aspect) :
4813        geom_mode < 2.5 ?
4814            sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect) :
4815            cylinder_xyz_to_uv(intersection_pos_local, geom_aspect);
4816}
4817
4818inline float3 uv_to_xyz(const float2 uv, const float2 geom_aspect,
4819    const float geom_mode)
4820{
4821    return geom_mode < 1.5 ? sphere_uv_to_xyz(uv, geom_aspect) :
4822        geom_mode < 2.5 ? sphere_alt_uv_to_xyz(uv, geom_aspect) :
4823        cylinder_uv_to_xyz(uv, geom_aspect);
4824}
4825
4826float2 view_vec_to_uv(const float3 view_vec_local, const float3 eye_pos_local,
4827    const float2 geom_aspect, const float geom_mode, out float3 intersection_pos)
4828{
4829    //  Get the intersection point on the primitive, given an eye position
4830    //  and view vector already in its local coordinate frame:
4831    const float2 intersect_dist_and_discriminant = intersect(view_vec_local,
4832        eye_pos_local, geom_mode);
4833    const float3 intersection_pos_local = eye_pos_local +
4834        view_vec_local * intersect_dist_and_discriminant.x;
4835    //  Save the intersection position to an output parameter:
4836    intersection_pos = intersection_pos_local;
4837    //  Transform into uv coords, but give out-of-range coords if the
4838    //  view ray doesn't intersect the primitive in the first place:
4839    return intersect_dist_and_discriminant.y > 0.005 ?
4840        xyz_to_uv(intersection_pos_local, geom_aspect, geom_mode) : float2(1.0);
4841}
4842
4843float3 get_ideal_global_eye_pos_for_points(float3 eye_pos,
4844    const float2 geom_aspect, const float3 global_coords[MAX_POINT_CLOUD_SIZE],
4845    const int num_points)
4846{
4847    //  Requires:   Parameters:
4848    //              1.) Starting eye_pos is a global 3D position at which the
4849    //                  camera contains all points in global_coords[] in its FOV
4850    //              2.) geom_aspect = get_aspect_vector(
4851    //                      output_size.x / output_size.y);
4852    //              3.) global_coords is a point cloud containing global xyz
4853    //                  coords of extreme points on the simulated CRT screen.
4854    //              Globals:
4855    //              1.) geom_view_dist must be > 0.0.  It controls the "near
4856    //                  plane" used to interpret flat_video_uv as a view
4857    //                  vector, which controls the field of view (FOV).
4858    //              Eyespace coordinate frame: +x = right, +y = up, +z = back
4859    //  Returns:    Return an eye position at which the point cloud spans as
4860    //              much of the screen as possible (given the FOV controlled by
4861    //              geom_view_dist) without being cropped or sheared.
4862    //  Algorithm:
4863    //  1.) Move the eye laterally to a point which attempts to maximize the
4864    //      the amount we can move forward without clipping the CRT screen.
4865    //  2.) Move forward by as much as possible without clipping the CRT.
4866    //  Get the allowed movement range by solving for the eye_pos offsets
4867    //  that result in each point being projected to a screen edge/corner in
4868    //  pseudo-normalized device coords (where xy ranges from [-0.5, 0.5]
4869    //  and z = eyespace z):
4870    //      pndc_coord = float3(float2(eyespace_xyz.x, -eyespace_xyz.y)*
4871    //      geom_view_dist / (geom_aspect * -eyespace_xyz.z), eyespace_xyz.z);
4872    //  Notes:
4873    //  The field of view is controlled by geom_view_dist's magnitude relative to
4874    //  the view vector's x and y components:
4875    //      view_vec.xy ranges from [-0.5, 0.5] * geom_aspect
4876    //      view_vec.z = -geom_view_dist
4877    //  But for the purposes of perspective divide, it should be considered:
4878    //      view_vec.xy ranges from [-0.5, 0.5] * geom_aspect / geom_view_dist
4879    //      view_vec.z = -1.0
4880    static const int max_centering_iters = 1;  //  Keep for easy testing.
4881    for(int iter = 0; iter < max_centering_iters; iter++)
4882    {
4883        //  0.) Get the eyespace coordinates of our point cloud:
4884        float3 eyespace_coords[MAX_POINT_CLOUD_SIZE];
4885        for(int i = 0; i < num_points; i++)
4886        {
4887            eyespace_coords[i] = global_coords[i] - eye_pos;
4888        }
4889        //  1a.)For each point, find out how far we can move eye_pos in each
4890        //      lateral direction without the point clipping the frustum.
4891        //      Eyespace +y = up, screenspace +y = down, so flip y after
4892        //      applying the eyespace offset (on the way to "clip space").
4893        //  Solve for two offsets per point based on:
4894        //      (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) *
4895        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(-0.5)
4896        //      (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) *
4897        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(0.5)
4898        //  offset_ul and offset_dr represent the farthest we can move the
4899        //  eye_pos up-left and down-right.  Save the min of all offset_dr's
4900        //  and the max of all offset_ul's (since it's negative).
4901        float abs_radius = abs(geom_radius);  //  In case anyone gets ideas. ;)
4902        float2 offset_dr_min = float2(10.0 * abs_radius, 10.0 * abs_radius);
4903        float2 offset_ul_max = float2(-10.0 * abs_radius, -10.0 * abs_radius);
4904        for(int i = 0; i < num_points; i++)
4905        {
4906            static const float2 flipy = float2(1.0, -1.0);
4907            float3 eyespace_xyz = eyespace_coords[i];
4908            float2 offset_dr = eyespace_xyz.xy - float2(-0.5) *
4909                (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
4910            float2 offset_ul = eyespace_xyz.xy - float2(0.5) *
4911                (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
4912            offset_dr_min = min(offset_dr_min, offset_dr);
4913            offset_ul_max = max(offset_ul_max, offset_ul);
4914        }
4915        //  1b.)Update eye_pos: Adding the average of offset_ul_max and
4916        //      offset_dr_min gives it equal leeway on the top vs. bottom
4917        //      and left vs. right.  Recalculate eyespace_coords accordingly.
4918        float2 center_offset = 0.5 * (offset_ul_max + offset_dr_min);
4919        eye_pos.xy += center_offset;
4920        for(int i = 0; i < num_points; i++)
4921        {
4922            eyespace_coords[i] = global_coords[i] - eye_pos;
4923        }
4924        //  2a.)For each point, find out how far we can move eye_pos forward
4925        //      without the point clipping the frustum.  Flip the y
4926        //      direction in advance (matters for a later step, not here).
4927        //      Solve for four offsets per point based on:
4928        //      eyespace_xyz_flipy.x * geom_view_dist /
4929        //          (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) =-0.5
4930        //      eyespace_xyz_flipy.y * geom_view_dist /
4931        //          (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) =-0.5
4932        //      eyespace_xyz_flipy.x * geom_view_dist /
4933        //          (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) = 0.5
4934        //      eyespace_xyz_flipy.y * geom_view_dist /
4935        //          (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) = 0.5
4936        //      We'll vectorize the actual computation.  Take the maximum of
4937        //      these four for a single offset, and continue taking the max
4938        //      for every point (use max because offset.z is negative).
4939        float offset_z_max = -10.0 * geom_radius * geom_view_dist;
4940        for(int i = 0; i < num_points; i++)
4941        {
4942            float3 eyespace_xyz_flipy = eyespace_coords[i] *
4943                float3(1.0, -1.0, 1.0);
4944            float4 offset_zzzz = eyespace_xyz_flipy.zzzz +
4945                (eyespace_xyz_flipy.xyxy * geom_view_dist) /
4946                (float4(-0.5, -0.5, 0.5, 0.5) * float4(geom_aspect, geom_aspect));
4947            //  Ignore offsets that push positive x/y values to opposite
4948            //  boundaries, and vice versa, and don't let the camera move
4949            //  past a point in the dead center of the screen:
4950            offset_z_max = (eyespace_xyz_flipy.x < 0.0) ?
4951                max(offset_z_max, offset_zzzz.x) : offset_z_max;
4952            offset_z_max = (eyespace_xyz_flipy.y < 0.0) ?
4953                max(offset_z_max, offset_zzzz.y) : offset_z_max;
4954            offset_z_max = (eyespace_xyz_flipy.x > 0.0) ?
4955                max(offset_z_max, offset_zzzz.z) : offset_z_max;
4956            offset_z_max = (eyespace_xyz_flipy.y > 0.0) ?
4957                max(offset_z_max, offset_zzzz.w) : offset_z_max;
4958            offset_z_max = max(offset_z_max, eyespace_xyz_flipy.z);
4959        }
4960        //  2b.)Update eye_pos: Add the maximum (smallest negative) z offset.
4961        eye_pos.z += offset_z_max;
4962    }
4963    return eye_pos;
4964}
4965
4966float3 get_ideal_global_eye_pos(const float3x3 local_to_global,
4967    const float2 geom_aspect, const float geom_mode)
4968{
4969    //  Start with an initial eye_pos that includes the entire primitive
4970    //  (sphere or cylinder) in its field-of-view:
4971    const float3 high_view = float3(0.0, geom_aspect.y, -geom_view_dist);
4972    const float3 low_view = high_view * float3(1.0, -1.0, 1.0);
4973    const float len_sq = dot(high_view, high_view);
4974    const float fov = abs(acos(dot(high_view, low_view)/len_sq));
4975    //  Trigonometry/similar triangles say distance = geom_radius/sin(fov/2):
4976    const float eye_z_spherical = geom_radius/sin(fov*0.5);
4977    const float3 eye_pos = geom_mode < 2.5 ?
4978        float3(0.0, 0.0, eye_z_spherical) :
4979        float3(0.0, 0.0, max(geom_view_dist, eye_z_spherical));
4980
4981    //  Get global xyz coords of extreme sample points on the simulated CRT
4982    //  screen.  Start with the center, edge centers, and corners of the
4983    //  video image.  We can't ignore backfacing points: They're occluded
4984    //  by closer points on the primitive, but they may NOT be occluded by
4985    //  the convex hull of the remaining samples (i.e. the remaining convex
4986    //  hull might not envelope points that do occlude a back-facing point.)
4987    static const int num_points = MAX_POINT_CLOUD_SIZE;
4988    float3 global_coords[MAX_POINT_CLOUD_SIZE];
4989    global_coords[0] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.0), geom_aspect, geom_mode));
4990    global_coords[1] = mul(local_to_global, uv_to_xyz(float2(0.0, -0.5), geom_aspect, geom_mode));
4991    global_coords[2] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.5), geom_aspect, geom_mode));
4992    global_coords[3] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.0), geom_aspect, geom_mode));
4993    global_coords[4] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.0), geom_aspect, geom_mode));
4994    global_coords[5] = mul(local_to_global, uv_to_xyz(float2(-0.5, -0.5), geom_aspect, geom_mode));
4995    global_coords[6] = mul(local_to_global, uv_to_xyz(float2(0.5, -0.5), geom_aspect, geom_mode));
4996    global_coords[7] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.5), geom_aspect, geom_mode));
4997    global_coords[8] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.5), geom_aspect, geom_mode));
4998    //  Adding more inner image points could help in extreme cases, but too many
4999    //  points will kille the framerate.  For safety, default to the initial
5000    //  eye_pos if any z coords are negative:
5001    float num_negative_z_coords = 0.0;
5002    for(int i = 0; i < num_points; i++)
5003    {
5004        num_negative_z_coords += float(global_coords[0].z < 0.0);
5005    }
5006    //  Outsource the optimized eye_pos calculation:
5007    return num_negative_z_coords > 0.5 ? eye_pos :
5008        get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect,
5009            global_coords, num_points);
5010}
5011
5012float3x3 get_pixel_to_object_matrix(const float3x3 global_to_local,
5013    const float3 eye_pos_local, const float3 view_vec_global,
5014    const float3 intersection_pos_local, const float3 normal,
5015    const float2 output_size_inv)
5016{
5017    //  Requires:   See get_curved_video_uv_coords_and_tangent_matrix for
5018    //              descriptions of each parameter.
5019    //  Returns:    Return a transformation matrix from 2D pixel-space vectors
5020    //              (where (+1.0, +1.0) is a vector to one pixel down-right,
5021    //              i.e. same directionality as uv texels) to 3D object-space
5022    //              vectors in the CRT's local coordinate frame (right-handed)
5023    //              ***which are tangent to the CRT surface at the intersection
5024    //              position.***  (Basically, we want to convert pixel-space
5025    //              vectors to 3D vectors along the CRT's surface, for later
5026    //              conversion to uv vectors.)
5027    //  Shorthand inputs:
5028    const float3 pos = intersection_pos_local;
5029    const float3 eye_pos = eye_pos_local;
5030    //  Get a piecewise-linear matrix transforming from "pixelspace" offset
5031    //  vectors (1.0 = one pixel) to object space vectors in the tangent
5032    //  plane (faster than finding 3 view-object intersections).
5033    //  1.) Get the local view vecs for the pixels to the right and down:
5034    const float3 view_vec_right_global = view_vec_global +
5035        float3(output_size_inv.x, 0.0, 0.0);
5036    const float3 view_vec_down_global = view_vec_global +
5037        float3(0.0, -output_size_inv.y, 0.0);
5038    const float3 view_vec_right_local =
5039        mul(global_to_local, view_vec_right_global);
5040    const float3 view_vec_down_local =
5041        mul(global_to_local, view_vec_down_global);
5042    //  2.) Using the true intersection point, intersect the neighboring
5043    //      view vectors with the tangent plane:
5044    const float3 intersection_vec_dot_normal = float3(dot(pos - eye_pos, normal), dot(pos - eye_pos, normal), dot(pos - eye_pos, normal));
5045    const float3 right_pos = eye_pos + (intersection_vec_dot_normal /
5046        dot(view_vec_right_local, normal))*view_vec_right_local;
5047    const float3 down_pos = eye_pos + (intersection_vec_dot_normal /
5048        dot(view_vec_down_local, normal))*view_vec_down_local;
5049    //  3.) Subtract the original intersection pos from its neighbors; the
5050    //      resulting vectors are object-space vectors tangent to the plane.
5051    //      These vectors are the object-space transformations of (1.0, 0.0)
5052    //      and (0.0, 1.0) pixel offsets, so they form the first two basis
5053    //      vectors of a pixelspace to object space transformation.  This
5054    //      transformation is 2D to 3D, so use (0, 0, 0) for the third vector.
5055    const float3 object_right_vec = right_pos - pos;
5056    const float3 object_down_vec = down_pos - pos;
5057    const float3x3 pixel_to_object = float3x3(
5058        object_right_vec.x, object_down_vec.x, 0.0,
5059        object_right_vec.y, object_down_vec.y, 0.0,
5060        object_right_vec.z, object_down_vec.z, 0.0);
5061    return pixel_to_object;
5062}
5063
5064float3x3 get_object_to_tangent_matrix(const float3 intersection_pos_local,
5065    const float3 normal, const float2 geom_aspect, const float geom_mode)
5066{
5067    //  Requires:   See get_curved_video_uv_coords_and_tangent_matrix for
5068    //              descriptions of each parameter.
5069    //  Returns:    Return a transformation matrix from 3D object-space vectors
5070    //              in the CRT's local coordinate frame (right-handed, +y = up)
5071    //              to 2D video_uv vectors (+v = down).
5072    //  Description:
5073    //  The TBN matrix formed by the [tangent, bitangent, normal] basis
5074    //  vectors transforms ordinary vectors from tangent->object space.
5075    //  The cotangent matrix formed by the [cotangent, cobitangent, normal]
5076    //  basis vectors transforms normal vectors (covectors) from
5077    //  tangent->object space.  It's the inverse-transpose of the TBN matrix.
5078    //  We want the inverse of the TBN matrix (transpose of the cotangent
5079    //  matrix), which transforms ordinary vectors from object->tangent space.
5080    //  Start by calculating the relevant basis vectors in accordance with
5081    //  Christian Schüler's blog post "Followup: Normal Mapping Without
5082    //  Precomputed Tangents":  http://www.thetenthplanet.de/archives/1180
5083    //  With our particular uv mapping, the scale of the u and v directions
5084    //  is determined entirely by the aspect ratio for cylindrical and ordinary
5085    //  spherical mappings, and so tangent and bitangent lengths are also
5086    //  determined by it (the alternate mapping is more complex).  Therefore, we
5087    //  must ensure appropriate cotangent and cobitangent lengths as well.
5088    //  Base these off the uv<=>xyz mappings for each primitive.
5089    const float3 pos = intersection_pos_local;
5090    static const float3 x_vec = float3(1.0, 0.0, 0.0);
5091    static const float3 y_vec = float3(0.0, 1.0, 0.0);
5092    //  The tangent and bitangent vectors correspond with increasing u and v,
5093    //  respectively.  Mathematically we'd base the cotangent/cobitangent on
5094    //  those, but we'll compute the cotangent/cobitangent directly when we can.
5095    float3 cotangent_unscaled, cobitangent_unscaled;
5096    //  geom_mode should be constant-folded without RUNTIME_GEOMETRY_MODE.
5097    if(geom_mode < 1.5)
5098    {
5099        //  Sphere:
5100        //  tangent = normalize(cross(normal, cross(x_vec, pos))) * geom_aspect.x
5101        //  bitangent = normalize(cross(cross(y_vec, pos), normal)) * geom_aspect.y
5102        //  inv_determinant = 1.0/length(cross(bitangent, tangent))
5103        //  cotangent = cross(normal, bitangent) * inv_determinant
5104        //            == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant
5105        //  cobitangent = cross(tangent, normal) * inv_determinant
5106        //            == normalize(cross(x_vec, pos)) * geom_aspect.x * inv_determinant
5107        //  Simplified (scale by inv_determinant below):
5108        cotangent_unscaled = normalize(cross(y_vec, pos)) * geom_aspect.y;
5109        cobitangent_unscaled = normalize(cross(x_vec, pos)) * geom_aspect.x;
5110    }
5111    else if(geom_mode < 2.5)
5112    {
5113        //  Sphere, alternate mapping:
5114        //  This mapping works a bit like the cylindrical mapping in two
5115        //  directions, which makes the lengths and directions more complex.
5116        //  Unfortunately, I can't find much of a shortcut:
5117        const float3 tangent = normalize(
5118            cross(y_vec, float3(pos.x, 0.0, pos.z))) * geom_aspect.x;
5119        const float3 bitangent = normalize(
5120            cross(x_vec, float3(0.0, pos.yz))) * geom_aspect.y;
5121        cotangent_unscaled = cross(normal, bitangent);
5122        cobitangent_unscaled = cross(tangent, normal);
5123    }
5124    else
5125    {
5126        //  Cylinder:
5127        //  tangent = normalize(cross(y_vec, normal)) * geom_aspect.x;
5128        //  bitangent = float3(0.0, -geom_aspect.y, 0.0);
5129        //  inv_determinant = 1.0/length(cross(bitangent, tangent))
5130        //  cotangent = cross(normal, bitangent) * inv_determinant
5131        //            == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant
5132        //  cobitangent = cross(tangent, normal) * inv_determinant
5133        //            == float3(0.0, -geom_aspect.x, 0.0) * inv_determinant
5134        cotangent_unscaled = cross(y_vec, normal) * geom_aspect.y;
5135        cobitangent_unscaled = float3(0.0, -geom_aspect.x, 0.0);
5136    }
5137    const float3 computed_normal =
5138        cross(cobitangent_unscaled, cotangent_unscaled);
5139    const float inv_determinant = rsqrt(dot(computed_normal, computed_normal));
5140    const float3 cotangent = cotangent_unscaled * inv_determinant;
5141    const float3 cobitangent = cobitangent_unscaled * inv_determinant;
5142    //  The [cotangent, cobitangent, normal] column vecs form the cotangent
5143    //  frame, i.e. the inverse-transpose TBN matrix.  Get its transpose:
5144    const float3x3 object_to_tangent = float3x3(cotangent, cobitangent, normal);
5145    return object_to_tangent;
5146}
5147
5148float2 get_curved_video_uv_coords_and_tangent_matrix(
5149    const float2 flat_video_uv, const float3 eye_pos_local,
5150    const float2 output_size_inv, const float2 geom_aspect,
5151    const float geom_mode, const float3x3 global_to_local,
5152    out float2x2 pixel_to_tangent_video_uv)
5153{
5154    //  Requires:   Parameters:
5155    //              1.) flat_video_uv coords are in range [0.0, 1.0], where
5156    //                  (0.0, 0.0) is the top-left corner of the screen and
5157    //                  (1.0, 1.0) is the bottom-right corner.
5158    //              2.) eye_pos_local is the 3D camera position in the simulated
5159    //                  CRT's local coordinate frame.  For best results, it must
5160    //                  be computed based on the same geom_view_dist used here.
5161    //              3.) output_size_inv = float2(1.0)/output_size
5162    //              4.) geom_aspect = get_aspect_vector(
5163    //                      output_size.x / output_size.y);
5164    //              5.) geom_mode is a static or runtime mode setting:
5165    //                  0 = off, 1 = sphere, 2 = sphere alt., 3 = cylinder
5166    //              6.) global_to_local is a 3x3 matrix transforming (ordinary)
5167    //                  worldspace vectors to the CRT's local coordinate frame
5168    //              Globals:
5169    //              1.) geom_view_dist must be > 0.0.  It controls the "near
5170    //                  plane" used to interpret flat_video_uv as a view
5171    //                  vector, which controls the field of view (FOV).
5172    //  Returns:    Return final uv coords in [0.0, 1.0], and return a pixel-
5173    //              space to video_uv tangent-space matrix in the out parameter.
5174    //              (This matrix assumes pixel-space +y = down, like +v = down.)
5175    //              We'll transform flat_video_uv into a view vector, project
5176    //              the view vector from the camera/eye, intersect with a sphere
5177    //              or cylinder representing the simulated CRT, and convert the
5178    //              intersection position into final uv coords and a local
5179    //              transformation matrix.
5180    //  First get the 3D view vector (geom_aspect and geom_view_dist are globals):
5181    //  1.) Center uv around (0.0, 0.0) and make (-0.5, -0.5) and (0.5, 0.5)
5182    //      correspond to the top-left/bottom-right output screen corners.
5183    //  2.) Multiply by geom_aspect to preemptively "undo" Retroarch's screen-
5184    //      space 2D aspect correction.  We'll reapply it in uv-space.
5185    //  3.) (x, y) = (u, -v), because +v is down in 2D screenspace, but +y
5186    //      is up in 3D worldspace (enforce a right-handed system).
5187    //  4.) The view vector z controls the "near plane" distance and FOV.
5188    //      For the effect of "looking through a window" at a CRT, it should be
5189    //      set equal to the user's distance from their physical screen, in
5190    //      units of the viewport's physical diagonal size.
5191    const float2 view_uv = (flat_video_uv - float2(0.5)) * geom_aspect;
5192    const float3 view_vec_global =
5193        float3(view_uv.x, -view_uv.y, -geom_view_dist);
5194    //  Transform the view vector into the CRT's local coordinate frame, convert
5195    //  to video_uv coords, and get the local 3D intersection position:
5196    const float3 view_vec_local = mul(global_to_local, view_vec_global);
5197    float3 pos;
5198    const float2 centered_uv = view_vec_to_uv(
5199        view_vec_local, eye_pos_local, geom_aspect, geom_mode, pos);
5200    const float2 video_uv = centered_uv + float2(0.5);
5201    //  Get a pixel-to-tangent-video-uv matrix.  The caller could deal with
5202    //  all but one of these cases, but that would be more complicated.
5203    #ifdef DRIVERS_ALLOW_DERIVATIVES
5204        //  Derivatives obtain a matrix very fast, but the direction of pixel-
5205        //  space +y seems to depend on the pass.  Enforce the correct direction
5206        //  on a best-effort basis (but it shouldn't matter for antialiasing).
5207        const float2 duv_dx = ddx(video_uv);
5208        const float2 duv_dy = ddy(video_uv);
5209        #ifdef LAST_PASS
5210            pixel_to_tangent_video_uv = float2x2(
5211                duv_dx.x, duv_dy.x,
5212                -duv_dx.y, -duv_dy.y);
5213        #else
5214            pixel_to_tangent_video_uv = float2x2(
5215                duv_dx.x, duv_dy.x,
5216                duv_dx.y, duv_dy.y);
5217        #endif
5218    #else
5219        //  Manually define a transformation matrix.  We'll assume pixel-space
5220        //  +y = down, just like +v = down.
5221        if(geom_force_correct_tangent_matrix)
5222        {
5223            //  Get the surface normal based on the local intersection position:
5224            const float3 normal_base = geom_mode < 2.5 ? pos :
5225                float3(pos.x, 0.0, pos.z);
5226            const float3 normal = normalize(normal_base);
5227            //  Get pixel-to-object and object-to-tangent matrices and combine
5228            //  them into a 2x2 pixel-to-tangent matrix for video_uv offsets:
5229            const float3x3 pixel_to_object = get_pixel_to_object_matrix(
5230                global_to_local, eye_pos_local, view_vec_global, pos, normal,
5231                output_size_inv);
5232            const float3x3 object_to_tangent = get_object_to_tangent_matrix(
5233                pos, normal, geom_aspect, geom_mode);
5234            const float3x3 pixel_to_tangent3x3 =
5235                mul(object_to_tangent, pixel_to_object);
5236            pixel_to_tangent_video_uv = float2x2(
5237                pixel_to_tangent3x3[0][0], pixel_to_tangent3x3[0][1], pixel_to_tangent3x3[1][0], pixel_to_tangent3x3[1][1]);//._m00_m01_m10_m11); //TODO/FIXME: needs to correct for column-major??
5238        }
5239        else
5240        {
5241            //  Ignore curvature, and just consider flat scaling.  The
5242            //  difference is only apparent with strong curvature:
5243            pixel_to_tangent_video_uv = float2x2(
5244                output_size_inv.x, 0.0, 0.0, output_size_inv.y);
5245        }
5246    #endif
5247    return video_uv;
5248}
5249
5250float get_border_dim_factor(const float2 video_uv, const float2 geom_aspect)
5251{
5252    //  COPYRIGHT NOTE FOR THIS FUNCTION:
5253    //  Copyright (C) 2010-2012 cgwg, 2014 TroggleMonkey
5254    //  This function uses an algorithm first coded in several of cgwg's GPL-
5255    //  licensed lines in crt-geom-curved.cg and its ancestors.  The line
5256    //  between algorithm and code is nearly indistinguishable here, so it's
5257    //  unclear whether I could even release this project under a non-GPL
5258    //  license with this function included.
5259
5260    //  Calculate border_dim_factor from the proximity to uv-space image
5261    //  borders; geom_aspect/border_size/border/darkness/border_compress are globals:
5262    const float2 edge_dists = min(video_uv, float2(1.0) - video_uv) *
5263        geom_aspect;
5264    const float2 border_penetration =
5265        max(float2(border_size) - edge_dists, float2(0.0));
5266    const float penetration_ratio = length(border_penetration)/border_size;
5267    const float border_escape_ratio = max(1.0 - penetration_ratio, 0.0);
5268    const float border_dim_factor =
5269        pow(border_escape_ratio, border_darkness) * max(1.0, border_compress);
5270    return min(border_dim_factor, 1.0);
5271}
5272
5273
5274
5275#endif  //  GEOMETRY_FUNCTIONS_H
5276
5277/////////////////////////  END GEOMETRY-FUNCTIONS  /////////////////////////
5278
5279///////////////////////////////////  HELPERS  //////////////////////////////////
5280
5281float2x2 mul_scale(float2 scale, float2x2 matrix)
5282{
5283    //float2x2 scale_matrix = float2x2(scale.x, 0.0, 0.0, scale.y);
5284    //return mul(scale_matrix, matrix);
5285    float4 intermed = float4(matrix[0][0],matrix[0][1],matrix[1][0],matrix[1][1]) * scale.xxyy;
5286    return float2x2(intermed.x, intermed.y, intermed.z, intermed.w);
5287}
5288
5289#undef COMPAT_PRECISION
5290#undef COMPAT_TEXTURE
5291
5292#if defined(VERTEX)
5293
5294#if __VERSION__ >= 130
5295#define COMPAT_VARYING out
5296#define COMPAT_ATTRIBUTE in
5297#define COMPAT_TEXTURE texture
5298#else
5299#define COMPAT_VARYING varying
5300#define COMPAT_ATTRIBUTE attribute
5301#define COMPAT_TEXTURE texture2D
5302#endif
5303
5304#ifdef GL_ES
5305#define COMPAT_PRECISION mediump
5306#else
5307#define COMPAT_PRECISION
5308#endif
5309
5310COMPAT_ATTRIBUTE vec4 VertexCoord;
5311COMPAT_ATTRIBUTE vec4 COLOR;
5312COMPAT_ATTRIBUTE vec4 TexCoord;
5313COMPAT_VARYING vec4 COL0;
5314COMPAT_VARYING vec4 TEX0;
5315COMPAT_VARYING vec2 tex_uv;
5316COMPAT_VARYING vec4 video_and_texture_size_inv;
5317COMPAT_VARYING vec2 output_size_inv;
5318COMPAT_VARYING vec3 eye_pos_local;
5319COMPAT_VARYING vec4 geom_aspect_and_overscan;
5320COMPAT_VARYING vec3 global_to_local_row0;
5321COMPAT_VARYING vec3 global_to_local_row1;
5322COMPAT_VARYING vec3 global_to_local_row2;
5323
5324vec4 _oPosition1;
5325uniform mat4 MVPMatrix;
5326uniform COMPAT_PRECISION int FrameDirection;
5327uniform COMPAT_PRECISION int FrameCount;
5328uniform COMPAT_PRECISION vec2 OutputSize;
5329uniform COMPAT_PRECISION vec2 TextureSize;
5330uniform COMPAT_PRECISION vec2 InputSize;
5331
5332// compatibility #defines
5333#define vTexCoord TEX0.xy
5334#define SourceSize vec4(TextureSize, 1.0 / TextureSize) //either TextureSize or InputSize
5335#define OutSize vec4(OutputSize, 1.0 / OutputSize)
5336
5337void main()
5338{
5339    gl_Position = MVPMatrix * VertexCoord;
5340    TEX0.xy = TexCoord.xy;
5341	tex_uv = TEX0.xy;
5342    video_and_texture_size_inv =
5343        float4(1.0, 1.0, 1.0, 1.0) / float4(video_size, texture_size);
5344    output_size_inv = float2(1.0, 1.0)/output_size;
5345
5346    //  Get aspect/overscan vectors from scalar parameters (likely uniforms):
5347    const float viewport_aspect_ratio = output_size.x/output_size.y;
5348    const float2 geom_aspect = get_aspect_vector(viewport_aspect_ratio);
5349    const float2 geom_overscan = get_geom_overscan_vector();
5350    geom_aspect_and_overscan = float4(geom_aspect, geom_overscan);
5351
5352    #ifdef RUNTIME_GEOMETRY_TILT
5353        //  Create a local-to-global rotation matrix for the CRT's coordinate
5354        //  frame and its global-to-local inverse.  Rotate around the x axis
5355        //  first (pitch) and then the y axis (yaw) with yucky Euler angles.
5356        //  Positive angles go clockwise around the right-vec and up-vec.
5357        //  Runtime shader parameters prevent us from computing these globally,
5358        //  but we can still combine the pitch/yaw matrices by hand to cut a
5359        //  few instructions.  Note that cg matrices fill row1 first, then row2,
5360        //  etc. (row-major order).
5361        const float2 geom_tilt_angle = get_geom_tilt_angle_vector();
5362        const float2 sin_tilt = sin(geom_tilt_angle);
5363        const float2 cos_tilt = cos(geom_tilt_angle);
5364        //  Conceptual breakdown:
5365              static const float3x3 rot_x_matrix = float3x3(
5366                  1.0, 0.0, 0.0,
5367                  0.0, cos_tilt.y, -sin_tilt.y,
5368                  0.0, sin_tilt.y, cos_tilt.y);
5369              static const float3x3 rot_y_matrix = float3x3(
5370                  cos_tilt.x, 0.0, sin_tilt.x,
5371                  0.0, 1.0, 0.0,
5372                  -sin_tilt.x, 0.0, cos_tilt.x);
5373              static const float3x3 local_to_global =
5374                  mul(rot_y_matrix, rot_x_matrix);
5375/*              static const float3x3 global_to_local =
5376                  transpose(local_to_global);
5377        const float3x3 local_to_global = float3x3(
5378            cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x,
5379            0.0, cos_tilt.y, sin_tilt.y,
5380            sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x);
5381*/        //  This is a pure rotation, so transpose = inverse:
5382        const float3x3 global_to_local = transpose(local_to_global);
5383        //  Decompose the matrix into 3 float3's for output:
5384        global_to_local_row0 = float3(global_to_local[0][0], global_to_local[0][1], global_to_local[0][2]);//._m00_m01_m02);
5385        global_to_local_row1 = float3(global_to_local[1][0], global_to_local[1][1], global_to_local[1][2]);//._m10_m11_m12);
5386        global_to_local_row2 = float3(global_to_local[2][0], global_to_local[2][1], global_to_local[2][2]);//._m20_m21_m22);
5387    #else
5388        static const float3x3 global_to_local = geom_global_to_local_static;
5389        static const float3x3 local_to_global = geom_local_to_global_static;
5390    #endif
5391
5392    //  Get an optimal eye position based on geom_view_dist, viewport_aspect,
5393    //  and CRT radius/rotation:
5394    #ifdef RUNTIME_GEOMETRY_MODE
5395        const float geom_mode = geom_mode_runtime;
5396    #else
5397        static const float geom_mode = geom_mode_static;
5398    #endif
5399    const float3 eye_pos_global =
5400        get_ideal_global_eye_pos(local_to_global, geom_aspect, geom_mode);
5401    eye_pos_local = mul(global_to_local, eye_pos_global);
5402}
5403
5404#elif defined(FRAGMENT)
5405
5406#ifdef GL_ES
5407#ifdef GL_FRAGMENT_PRECISION_HIGH
5408precision highp float;
5409#else
5410precision mediump float;
5411#endif
5412#define COMPAT_PRECISION mediump
5413#else
5414#define COMPAT_PRECISION
5415#endif
5416
5417#if __VERSION__ >= 130
5418#define COMPAT_VARYING in
5419#define COMPAT_TEXTURE texture
5420out COMPAT_PRECISION vec4 FragColor;
5421#else
5422#define COMPAT_VARYING varying
5423#define FragColor gl_FragColor
5424#define COMPAT_TEXTURE texture2D
5425#endif
5426
5427uniform COMPAT_PRECISION int FrameDirection;
5428uniform COMPAT_PRECISION int FrameCount;
5429uniform COMPAT_PRECISION vec2 OutputSize;
5430uniform COMPAT_PRECISION vec2 TextureSize;
5431uniform COMPAT_PRECISION vec2 InputSize;
5432uniform sampler2D Texture;
5433#define input_texture Texture
5434COMPAT_VARYING vec4 TEX0;
5435COMPAT_VARYING vec2 tex_uv;
5436COMPAT_VARYING vec4 video_and_texture_size_inv;
5437COMPAT_VARYING vec2 output_size_inv;
5438COMPAT_VARYING vec3 eye_pos_local;
5439COMPAT_VARYING vec4 geom_aspect_and_overscan;
5440COMPAT_VARYING vec3 global_to_local_row0;
5441COMPAT_VARYING vec3 global_to_local_row1;
5442COMPAT_VARYING vec3 global_to_local_row2;
5443
5444// compatibility #defines
5445#define Source Texture
5446#define vTexCoord TEX0.xy
5447
5448#define SourceSize vec4(TextureSize, 1.0 / TextureSize) //either TextureSize or InputSize
5449#define OutSize vec4(OutputSize, 1.0 / OutputSize)
5450
5451void main()
5452{
5453    //  Localize some parameters:
5454    const float2 geom_aspect = geom_aspect_and_overscan.xy;
5455    const float2 geom_overscan = geom_aspect_and_overscan.zw;
5456    const float2 video_size_inv = video_and_texture_size_inv.xy;
5457    const float2 texture_size_inv = video_and_texture_size_inv.zw;
5458    //const float2 output_size_inv = output_size_inv;
5459    #ifdef RUNTIME_GEOMETRY_TILT
5460        const float3x3 global_to_local = float3x3(global_to_local_row0,
5461            global_to_local_row1, global_to_local_row2);
5462    #else
5463        static const float3x3 global_to_local = geom_global_to_local_static;
5464    #endif
5465    #ifdef RUNTIME_GEOMETRY_MODE
5466        const float geom_mode = geom_mode_runtime;
5467    #else
5468        static const float geom_mode = geom_mode_static;
5469    #endif
5470
5471    //  Get flat and curved texture coords for the current fragment point sample
5472    //  and a pixel_to_tangent_video_uv matrix for transforming pixel offsets:
5473    //  video_uv = relative position in video frame, mapped to [0.0, 1.0] range
5474    //  tex_uv = relative position in padded texture, mapped to [0.0, 1.0] range
5475    const float2 flat_video_uv = tex_uv * (texture_size * video_size_inv);
5476    float2x2 pixel_to_video_uv;
5477    float2 video_uv_no_geom_overscan;
5478    if(geom_mode > 0.5)
5479    {
5480        video_uv_no_geom_overscan =
5481            get_curved_video_uv_coords_and_tangent_matrix(flat_video_uv,
5482                eye_pos_local, output_size_inv, geom_aspect,
5483                geom_mode, global_to_local, pixel_to_video_uv);
5484    }
5485    else
5486    {
5487        video_uv_no_geom_overscan = flat_video_uv;
5488        pixel_to_video_uv = float2x2(
5489            output_size_inv.x, 0.0, 0.0, output_size_inv.y);
5490    }
5491    //  Correct for overscan here (not in curvature code):
5492    const float2 video_uv =
5493        (video_uv_no_geom_overscan - float2(0.5, 0.5))/geom_overscan + float2(0.5, 0.5);
5494    const float2 tex_uv = video_uv * (video_size * texture_size_inv);
5495
5496    //  Get a matrix transforming pixel vectors to tex_uv vectors:
5497    const float2x2 pixel_to_tex_uv =
5498        mul_scale(video_size * texture_size_inv /
5499            geom_aspect_and_overscan.zw, pixel_to_video_uv);
5500
5501    //  Sample!  Skip antialiasing if aa_level < 0.5 or both of these hold:
5502    //  1.) Geometry/curvature isn't used
5503    //  2.) Overscan == float2(1.0, 1.0)
5504    //  Skipping AA is sharper, but it's only faster with dynamic branches.
5505    const float2 abs_aa_r_offset = abs(get_aa_subpixel_r_offset());
5506    // this next check seems to always return true, even when it shouldn't so disabling it for now
5507    const bool need_subpixel_aa = false;//abs_aa_r_offset.x + abs_aa_r_offset.y > 0.0;
5508    float3 color;
5509    if(aa_level > 0.5 && (geom_mode > 0.5 || any(bool2((geom_overscan.x != 1.0), (geom_overscan.y != 1.0)))))
5510    {
5511        //  Sample the input with antialiasing (due to sharp phosphors, etc.):
5512        color = tex2Daa(input_texture, tex_uv, pixel_to_tex_uv, float(frame_count));
5513    }
5514
5515    else if(aa_level > 0.5 && need_subpixel_aa)
5516    {
5517        //  Sample at each subpixel location:
5518        color = tex2Daa_subpixel_weights_only(
5519            input_texture, tex_uv, pixel_to_tex_uv);
5520    }
5521    else
5522    {
5523        color = tex2D_linearize(input_texture, tex_uv).rgb;
5524    }
5525
5526    //  Dim borders and output the final result:
5527    const float border_dim_factor = get_border_dim_factor(video_uv, geom_aspect);
5528    const float3 final_color = color * border_dim_factor;
5529
5530    FragColor = encode_output(float4(final_color, 1.0));
5531}
5532#endif
5533