1/****************************************************************************
2**
3** Copyright (C) 2014 NVIDIA Corporation.
4** Copyright (C) 2019 The Qt Company Ltd.
5** Contact: https://www.qt.io/licensing/
6**
7** This file is part of Qt 3D Studio.
8**
9** $QT_BEGIN_LICENSE:GPL$
10** Commercial License Usage
11** Licensees holding valid commercial Qt licenses may use this file in
12** accordance with the commercial license agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and The Qt Company. For licensing terms
15** and conditions see https://www.qt.io/terms-conditions. For further
16** information use the contact form at https://www.qt.io/contact-us.
17**
18** GNU General Public License Usage
19** Alternatively, this file may be used under the terms of the GNU
20** General Public License version 3 or (at your option) any later version
21** approved by the KDE Free Qt Foundation. The licenses are as published by
22** the Free Software Foundation and appearing in the file LICENSE.GPL3
23** included in the packaging of this file. Please review the following
24** information to ensure the GNU General Public License requirements will
25** be met: https://www.gnu.org/licenses/gpl-3.0.html.
26**
27** $QT_END_LICENSE$
28**
29****************************************************************************/
30
31/*============================================================================
32
33
34                    NVIDIA FXAA 3.11 by TIMOTHY LOTTES
35
36------------------------------------------------------------------------------
37                           INTEGRATION CHECKLIST
38------------------------------------------------------------------------------
39(1.)
40In the shader source, setup defines for the desired configuration.
41When providing multiple shaders (for different presets),
42simply setup the defines differently in multiple files.
43Example,
44
45  #define FXAA_PC 1
46  #define FXAA_HLSL_5 1
47  #define FXAA_QUALITY__PRESET 12
48
49Or,
50
51  #define FXAA_360 1
52
53Or,
54
55  #define FXAA_PS3 1
56
57Etc.
58
59(2.)
60Then include this file,
61
62(3.)
63Then call the FXAA pixel shader from within your desired shader.
64Look at the FXAA Quality FxaaPixelShader() for docs on inputs.
65As of FXAA 3.11 all inputs for all shaders are the same
66to enable easy porting between platforms.
67
68  return FxaaPixelShader(...);
69
70(4.)
71Ensure the pass prior to FXAA outputs RGBL (see next section).
72Or use,
73
74  #define FXAA_GREEN_AS_LUMA 1
75
76(5.)
77Setup engine to provide the following constants
78which are used in the FxaaPixelShader() inputs,
79
80  FxaaFloat2 fxaaQualityRcpFrame,
81  FxaaFloat4 fxaaConsoleRcpFrameOpt,
82  FxaaFloat4 fxaaConsoleRcpFrameOpt2,
83  FxaaFloat4 fxaaConsole360RcpFrameOpt2,
84  FxaaFloat fxaaQualitySubpix,
85  FxaaFloat fxaaQualityEdgeThreshold,
86  FxaaFloat fxaaQualityEdgeThresholdMin,
87  FxaaFloat fxaaConsoleEdgeSharpness,
88  FxaaFloat fxaaConsoleEdgeThreshold,
89  FxaaFloat fxaaConsoleEdgeThresholdMin,
90  FxaaFloat4 fxaaConsole360ConstDir
91
92Look at the FXAA Quality FxaaPixelShader() for docs on inputs.
93
94(6.)
95Have FXAA vertex shader run as a full screen triangle,
96and output "pos" and "fxaaConsolePosPos"
97such that inputs in the pixel shader provide,
98
99  // {xy} = center of pixel
100  FxaaFloat2 pos,
101
102  // {xy__} = upper left of pixel
103  // {__zw} = lower right of pixel
104  FxaaFloat4 fxaaConsolePosPos,
105
106(7.)
107Ensure the texture sampler(s) used by FXAA are set to bilinear filtering.
108
109
110------------------------------------------------------------------------------
111                    INTEGRATION - RGBL AND COLORSPACE
112------------------------------------------------------------------------------
113FXAA3 requires RGBL as input unless the following is set,
114
115  #define FXAA_GREEN_AS_LUMA 1
116
117In which case the engine uses green in place of luma,
118and requires RGB input is in a non-linear colorspace.
119
120RGB should be LDR (low dynamic range).
121Specifically do FXAA after tonemapping.
122
123RGB data as returned by a texture fetch can be non-linear,
124or linear when FXAA_GREEN_AS_LUMA is not set.
125Note an "sRGB format" texture counts as linear,
126because the result of a texture fetch is linear data.
127Regular "RGBA8" textures in the sRGB colorspace are non-linear.
128
129If FXAA_GREEN_AS_LUMA is not set,
130luma must be stored in the alpha channel prior to running FXAA.
131This luma should be in a perceptual space (could be gamma 2.0).
132Example pass before FXAA where output is gamma 2.0 encoded,
133
134  color.rgb = ToneMap(color.rgb); // linear color output
135  color.rgb = sqrt(color.rgb);    // gamma 2.0 color output
136  return color;
137
138To use FXAA,
139
140  color.rgb = ToneMap(color.rgb);  // linear color output
141  color.rgb = sqrt(color.rgb);     // gamma 2.0 color output
142  color.a = dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114)); // compute luma
143  return color;
144
145Another example where output is linear encoded,
146say for instance writing to an sRGB formatted render target,
147where the render target does the conversion back to sRGB after blending,
148
149  color.rgb = ToneMap(color.rgb); // linear color output
150  return color;
151
152To use FXAA,
153
154  color.rgb = ToneMap(color.rgb); // linear color output
155  color.a = sqrt(dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114))); // compute luma
156  return color;
157
158Getting luma correct is required for the algorithm to work correctly.
159
160
161------------------------------------------------------------------------------
162                          BEING LINEARLY CORRECT?
163------------------------------------------------------------------------------
164Applying FXAA to a framebuffer with linear RGB color will look worse.
165This is very counterintuitive, but happens to be true in this case.
166The reason is that dithering artifacts will be more visible
167in a linear colorspace.
168
169
170------------------------------------------------------------------------------
171                             COMPLEX INTEGRATION
172------------------------------------------------------------------------------
173Q. What if the engine is blending into RGB before wanting to run FXAA?
174
175A. In the last opaque pass prior to FXAA,
176   have the pass write out luma into alpha.
177   Then blend into RGB only.
178   FXAA should be able to run ok
179   assuming the blending pass did not add any aliasing.
180   This should be the common case for particles and common blending passes.
181
182A. Or use FXAA_GREEN_AS_LUMA.
183
184============================================================================*/
185
186/*============================================================================
187
188                             INTEGRATION KNOBS
189
190============================================================================*/
191// Platform selection switches for the console variants of FXAA.
192// FXAA_PS3 and FXAA_360 choose the console algorithm (FXAA3 CONSOLE).
193// FXAA_360_OPT is a prototype for the new optimized 360 version.
194//
195// 1 = Use API.
196// 0 = Don't use API.
197// Each switch below defaults to 0 unless the including shader already defined it.
198/*--------------------------------------------------------------------------*/
199#ifndef FXAA_PS3
200    #define FXAA_PS3 0
201#endif
202/*--------------------------------------------------------------------------*/
203#ifndef FXAA_360
204    #define FXAA_360 0
205#endif
206/*--------------------------------------------------------------------------*/
207#ifndef FXAA_360_OPT
208    #define FXAA_360_OPT 0
209#endif
210/*==========================================================================*/
211#ifndef FXAA_PC
212    //
213    // FXAA Quality
214    // The high quality PC algorithm.
215    // Off (0) by default; the FXAA3 QUALITY - PC shader is compiled when this is 1.
216    #define FXAA_PC 0
217#endif
218/*--------------------------------------------------------------------------*/
219#ifndef FXAA_PC_CONSOLE
220    //
221    // The console algorithm for PC is included
222    // for developers targeting really low spec machines.
223    // Likely better to just run FXAA_PC, and use a really low preset.
224    // Off (0) by default unless defined by the including shader.
225    #define FXAA_PC_CONSOLE 0
226#endif
227/*--------------------------------------------------------------------------*/
228#ifndef FXAA_GLSL_120
229    #define FXAA_GLSL_120 0 // GLSL "#version 120" texture mapping, off by default
230#endif
231/*--------------------------------------------------------------------------*/
232#ifndef FXAA_GLSL_130
233    #define FXAA_GLSL_130 0 // GLSL "#version 130" or better texture mapping, off by default
234#endif
235/*--------------------------------------------------------------------------*/
236#ifndef FXAA_HLSL_3
237    #define FXAA_HLSL_3 0 // HLSL tex2Dlod based mapping, off by default
238#endif
239/*--------------------------------------------------------------------------*/
240#ifndef FXAA_HLSL_4
241    #define FXAA_HLSL_4 0 // HLSL SampleLevel based mapping, off by default
242#endif
243/*--------------------------------------------------------------------------*/
244#ifndef FXAA_HLSL_5
245    #define FXAA_HLSL_5 0 // HLSL SampleLevel + Gather based mapping, off by default
246#endif
247/*==========================================================================*/
248#ifndef FXAA_GREEN_AS_LUMA
249    //
250    // For those using non-linear color,
251    // and either not able to get luma in alpha, or not wanting to,
252    // this enables FXAA to run using green as a proxy for luma.
253    // So with this enabled, no need to pack luma in alpha.
254    //
255    // This will turn off AA on anything which lacks some amount of green.
256    // Pure red and blue or combination of only R and B, will get no AA.
257    //
258    // Might want to lower the settings for both,
259    //    fxaaConsoleEdgeThresholdMin
260    //    fxaaQualityEdgeThresholdMin
261    // In order to ensure AA does not get turned off on colors
262    // which contain a minor amount of green.
263    //
264    // 1 = On.
265    // 0 = Off.
266    // Defaults to off, in which case FxaaLuma() reads luma from the alpha channel.
267    #define FXAA_GREEN_AS_LUMA 0
268#endif
269/*--------------------------------------------------------------------------*/
270#ifndef FXAA_EARLY_EXIT
271    //
272    // Controls the algorithm's early exit path.
273    // On PS3 turning this ON adds 2 cycles to the shader.
274    // On 360 turning this OFF adds 10ths of a millisecond to the shader.
275    // Turning this off on console will result in a more blurry image.
276    // So this defaults to on.
277    //
278    // 1 = On.
279    // 0 = Off.
280    // The default below applies only if the including shader did not define it.
281    #define FXAA_EARLY_EXIT 1
282#endif
283/*--------------------------------------------------------------------------*/
284#ifndef FXAA_DISCARD
285    //
286    // Only valid for PC OpenGL currently.
287    // Probably will not work when FXAA_GREEN_AS_LUMA = 1.
288    //
289    // 1 = Use discard on pixels which don't need AA.
290    //     For APIs which enable concurrent TEX+ROP from same surface.
291    // 0 = Return unchanged color on pixels which don't need AA.
292    // Defaults to 0 unless defined by the including shader.
293    #define FXAA_DISCARD 0
294#endif
295/*--------------------------------------------------------------------------*/
296#ifndef FXAA_FAST_PIXEL_OFFSET
297    //
298    // Used for GLSL 120 only.
299    // Auto-detected: 1 when any extension macro tested below is defined, else 0.
300    // 1 = GL API supports fast pixel offsets
301    // 0 = do not use fast pixel offsets
302    // Several of the extension macros may be defined at once; the repeated define is identical.
303    #ifdef GL_EXT_gpu_shader4
304        #define FXAA_FAST_PIXEL_OFFSET 1
305    #endif
306    #ifdef GL_NV_gpu_shader5
307        #define FXAA_FAST_PIXEL_OFFSET 1
308    #endif
309    #ifdef GL_ARB_gpu_shader5
310        #define FXAA_FAST_PIXEL_OFFSET 1
311    #endif
312    #ifndef FXAA_FAST_PIXEL_OFFSET // none of the extension macros above was defined
313        #define FXAA_FAST_PIXEL_OFFSET 0
314    #endif
315#endif
316/*--------------------------------------------------------------------------*/
317#ifndef FXAA_GATHER4_ALPHA
318
319    // Auto-detected below unless the including shader already defined it.
320    // 1 = API supports gather4 on alpha channel.
321    // 0 = API does not support gather4 on alpha channel.
322    //
323
324#ifdef GL_ES // GL ES: enabled only with __VERSION__ >= 310 and GL_EXT_gpu_shader5
325  #if __VERSION__ >= 310
326     #ifdef GL_EXT_gpu_shader5
327        #define FXAA_GATHER4_ALPHA 1
328    #endif
329  #endif
330    #ifndef FXAA_GATHER4_ALPHA // GL ES fallback: no gather4
331        #define FXAA_GATHER4_ALPHA 0
332    #endif
333#else // not GL ES: enabled by FXAA_HLSL_5, GL_ARB_gpu_shader5 or GL_NV_gpu_shader5
334    #if (FXAA_HLSL_5 == 1)
335        #define FXAA_GATHER4_ALPHA 1
336    #endif
337    #ifdef GL_ARB_gpu_shader5
338        #define FXAA_GATHER4_ALPHA 1
339    #endif
340    #ifdef GL_NV_gpu_shader5
341        #define FXAA_GATHER4_ALPHA 1
342    #endif
343    #ifndef FXAA_GATHER4_ALPHA // fallback: no gather4
344        #define FXAA_GATHER4_ALPHA 0
345    #endif
346  #endif // closes the GL_ES / non GL_ES split
347#endif // closes #ifndef FXAA_GATHER4_ALPHA
348
349/*============================================================================
350                      FXAA CONSOLE PS3 - TUNING KNOBS
351============================================================================*/
352#ifndef FXAA_CONSOLE__PS3_EDGE_SHARPNESS
353    //
354    // Controls the sharpness of edges on PS3 only.
355    // Non-PS3 tuning is done with shader input.
356    //
357    // Due to the PS3 being ALU bound,
358    // there are only three safe values here: 2 and 4 and 8.
359    // These options use the shader's ability to do a free *|/ by 2|4|8.
360    //
361    // 8.0 is sharper
362    // 4.0 is softer
363    // 2.0 is really soft (good for vector graphics inputs)
364    // Exactly one of the "#if" switches below should be 1; 8.0 is the one selected.
365    #if 1
366        #define FXAA_CONSOLE__PS3_EDGE_SHARPNESS 8.0
367    #endif
368    #if 0
369        #define FXAA_CONSOLE__PS3_EDGE_SHARPNESS 4.0
370    #endif
371    #if 0
372        #define FXAA_CONSOLE__PS3_EDGE_SHARPNESS 2.0
373    #endif
374#endif
375/*--------------------------------------------------------------------------*/
376#ifndef FXAA_CONSOLE__PS3_EDGE_THRESHOLD
377    //
378    // Only affects PS3.
379    // Non-PS3 tuning is done with shader input.
380    //
381    // The minimum amount of local contrast required to apply algorithm.
382    // The console setting has a different mapping than the quality setting.
383    //
384    // This only applies when FXAA_EARLY_EXIT is 1.
385    //
386    // Due to the PS3 being ALU bound,
387    // there are only two safe values here: 0.25 and 0.125.
388    // These options use the shader's ability to do a free *|/ by 2|4|8.
389    //
390    // 0.125 leaves less aliasing, but is softer
391    // 0.25 leaves more aliasing, and is sharper
392    // Flip the "#if 1" below to 0 to select 0.25 instead of 0.125.
393    #if 1
394        #define FXAA_CONSOLE__PS3_EDGE_THRESHOLD 0.125
395    #else
396        #define FXAA_CONSOLE__PS3_EDGE_THRESHOLD 0.25
397    #endif
398#endif
399
400/*============================================================================
401                        FXAA QUALITY - TUNING KNOBS
402------------------------------------------------------------------------------
403NOTE the other tuning knobs are now in the shader function inputs!
404============================================================================*/
405#ifndef FXAA_QUALITY__PRESET
406    //
407    // Choose the quality preset.
408    // This needs to be compiled into the shader as it affects the code.
409    // Best option to include multiple presets is to
410    // in each shader define the preset, then include this file.
411    //
412    // OPTIONS
413    // -----------------------------------------------------------------------
414    // 10 to 15 - default medium dither (10=fastest, 15=highest quality)
415    // 20 to 29 - less dither, more expensive (20=fastest, 29=highest quality)
416    // 39       - no dither, very expensive
417    //
418    // NOTES
419    // -----------------------------------------------------------------------
420    // 12 = slightly faster than FXAA 3.9 and higher edge quality (default)
421    // 13 = about same speed as FXAA 3.9 and better than 12
422    // 23 = closest to FXAA 3.9 visually and performance wise
423    //  _ = the lowest digit is directly related to performance
424    // _  = the highest digit is directly related to style
425    // Any other value matches no preset block below, leaving FXAA_QUALITY__PS/P<n> undefined.
426    #define FXAA_QUALITY__PRESET 12
427#endif
428
429
430/*============================================================================
431
432                           FXAA QUALITY - PRESETS
433
434============================================================================*/
435
436/*============================================================================
437                     FXAA QUALITY - MEDIUM DITHER PRESETS

  Presets 10 to 15. In every preset FXAA_QUALITY__PS equals the number of
  FXAA_QUALITY__P<n> entries defined with it (P0 up to P<PS-1>), and the
  last entry is 12.0.
  NOTE(review): the P<n> values are consumed outside this section,
  presumably as per-step edge search distances - confirm in FxaaPixelShader.
438============================================================================*/
439#if (FXAA_QUALITY__PRESET == 10)
440    #define FXAA_QUALITY__PS 3
441    #define FXAA_QUALITY__P0 1.5
442    #define FXAA_QUALITY__P1 3.0
443    #define FXAA_QUALITY__P2 12.0
444#endif
445/*--------------------------------------------------------------------------*/
446#if (FXAA_QUALITY__PRESET == 11)
447    #define FXAA_QUALITY__PS 4
448    #define FXAA_QUALITY__P0 1.0
449    #define FXAA_QUALITY__P1 1.5
450    #define FXAA_QUALITY__P2 3.0
451    #define FXAA_QUALITY__P3 12.0
452#endif
453/*--------------------------------------------------------------------------*/
454#if (FXAA_QUALITY__PRESET == 12)
455    #define FXAA_QUALITY__PS 5
456    #define FXAA_QUALITY__P0 1.0
457    #define FXAA_QUALITY__P1 1.5
458    #define FXAA_QUALITY__P2 2.0
459    #define FXAA_QUALITY__P3 4.0
460    #define FXAA_QUALITY__P4 12.0
461#endif
462/*--------------------------------------------------------------------------*/
463#if (FXAA_QUALITY__PRESET == 13)
464    #define FXAA_QUALITY__PS 6
465    #define FXAA_QUALITY__P0 1.0
466    #define FXAA_QUALITY__P1 1.5
467    #define FXAA_QUALITY__P2 2.0
468    #define FXAA_QUALITY__P3 2.0
469    #define FXAA_QUALITY__P4 4.0
470    #define FXAA_QUALITY__P5 12.0
471#endif
472/*--------------------------------------------------------------------------*/
473#if (FXAA_QUALITY__PRESET == 14)
474    #define FXAA_QUALITY__PS 7
475    #define FXAA_QUALITY__P0 1.0
476    #define FXAA_QUALITY__P1 1.5
477    #define FXAA_QUALITY__P2 2.0
478    #define FXAA_QUALITY__P3 2.0
479    #define FXAA_QUALITY__P4 2.0
480    #define FXAA_QUALITY__P5 4.0
481    #define FXAA_QUALITY__P6 12.0
482#endif
483/*--------------------------------------------------------------------------*/
484#if (FXAA_QUALITY__PRESET == 15)
485    #define FXAA_QUALITY__PS 8
486    #define FXAA_QUALITY__P0 1.0
487    #define FXAA_QUALITY__P1 1.5
488    #define FXAA_QUALITY__P2 2.0
489    #define FXAA_QUALITY__P3 2.0
490    #define FXAA_QUALITY__P4 2.0
491    #define FXAA_QUALITY__P5 2.0
492    #define FXAA_QUALITY__P6 4.0
493    #define FXAA_QUALITY__P7 12.0
494#endif
495
496/*============================================================================
497                     FXAA QUALITY - LOW DITHER PRESETS

  Presets 20 to 29. In every preset FXAA_QUALITY__PS equals the number of
  FXAA_QUALITY__P<n> entries defined with it (P0 up to P<PS-1>), and the
  last entry is 8.0.
  NOTE(review): the P<n> values are consumed outside this section,
  presumably as per-step edge search distances - confirm in FxaaPixelShader.
498============================================================================*/
499#if (FXAA_QUALITY__PRESET == 20)
500    #define FXAA_QUALITY__PS 3
501    #define FXAA_QUALITY__P0 1.5
502    #define FXAA_QUALITY__P1 2.0
503    #define FXAA_QUALITY__P2 8.0
504#endif
505/*--------------------------------------------------------------------------*/
506#if (FXAA_QUALITY__PRESET == 21)
507    #define FXAA_QUALITY__PS 4
508    #define FXAA_QUALITY__P0 1.0
509    #define FXAA_QUALITY__P1 1.5
510    #define FXAA_QUALITY__P2 2.0
511    #define FXAA_QUALITY__P3 8.0
512#endif
513/*--------------------------------------------------------------------------*/
514#if (FXAA_QUALITY__PRESET == 22)
515    #define FXAA_QUALITY__PS 5
516    #define FXAA_QUALITY__P0 1.0
517    #define FXAA_QUALITY__P1 1.5
518    #define FXAA_QUALITY__P2 2.0
519    #define FXAA_QUALITY__P3 2.0
520    #define FXAA_QUALITY__P4 8.0
521#endif
522/*--------------------------------------------------------------------------*/
523#if (FXAA_QUALITY__PRESET == 23)
524    #define FXAA_QUALITY__PS 6
525    #define FXAA_QUALITY__P0 1.0
526    #define FXAA_QUALITY__P1 1.5
527    #define FXAA_QUALITY__P2 2.0
528    #define FXAA_QUALITY__P3 2.0
529    #define FXAA_QUALITY__P4 2.0
530    #define FXAA_QUALITY__P5 8.0
531#endif
532/*--------------------------------------------------------------------------*/
533#if (FXAA_QUALITY__PRESET == 24)
534    #define FXAA_QUALITY__PS 7
535    #define FXAA_QUALITY__P0 1.0
536    #define FXAA_QUALITY__P1 1.5
537    #define FXAA_QUALITY__P2 2.0
538    #define FXAA_QUALITY__P3 2.0
539    #define FXAA_QUALITY__P4 2.0
540    #define FXAA_QUALITY__P5 3.0
541    #define FXAA_QUALITY__P6 8.0
542#endif
543/*--------------------------------------------------------------------------*/
544#if (FXAA_QUALITY__PRESET == 25)
545    #define FXAA_QUALITY__PS 8
546    #define FXAA_QUALITY__P0 1.0
547    #define FXAA_QUALITY__P1 1.5
548    #define FXAA_QUALITY__P2 2.0
549    #define FXAA_QUALITY__P3 2.0
550    #define FXAA_QUALITY__P4 2.0
551    #define FXAA_QUALITY__P5 2.0
552    #define FXAA_QUALITY__P6 4.0
553    #define FXAA_QUALITY__P7 8.0
554#endif
555/*--------------------------------------------------------------------------*/
556#if (FXAA_QUALITY__PRESET == 26)
557    #define FXAA_QUALITY__PS 9
558    #define FXAA_QUALITY__P0 1.0
559    #define FXAA_QUALITY__P1 1.5
560    #define FXAA_QUALITY__P2 2.0
561    #define FXAA_QUALITY__P3 2.0
562    #define FXAA_QUALITY__P4 2.0
563    #define FXAA_QUALITY__P5 2.0
564    #define FXAA_QUALITY__P6 2.0
565    #define FXAA_QUALITY__P7 4.0
566    #define FXAA_QUALITY__P8 8.0
567#endif
568/*--------------------------------------------------------------------------*/
569#if (FXAA_QUALITY__PRESET == 27)
570    #define FXAA_QUALITY__PS 10
571    #define FXAA_QUALITY__P0 1.0
572    #define FXAA_QUALITY__P1 1.5
573    #define FXAA_QUALITY__P2 2.0
574    #define FXAA_QUALITY__P3 2.0
575    #define FXAA_QUALITY__P4 2.0
576    #define FXAA_QUALITY__P5 2.0
577    #define FXAA_QUALITY__P6 2.0
578    #define FXAA_QUALITY__P7 2.0
579    #define FXAA_QUALITY__P8 4.0
580    #define FXAA_QUALITY__P9 8.0
581#endif
582/*--------------------------------------------------------------------------*/
583#if (FXAA_QUALITY__PRESET == 28)
584    #define FXAA_QUALITY__PS 11
585    #define FXAA_QUALITY__P0 1.0
586    #define FXAA_QUALITY__P1 1.5
587    #define FXAA_QUALITY__P2 2.0
588    #define FXAA_QUALITY__P3 2.0
589    #define FXAA_QUALITY__P4 2.0
590    #define FXAA_QUALITY__P5 2.0
591    #define FXAA_QUALITY__P6 2.0
592    #define FXAA_QUALITY__P7 2.0
593    #define FXAA_QUALITY__P8 2.0
594    #define FXAA_QUALITY__P9 4.0
595    #define FXAA_QUALITY__P10 8.0
596#endif
597/*--------------------------------------------------------------------------*/
598#if (FXAA_QUALITY__PRESET == 29)
599    #define FXAA_QUALITY__PS 12
600    #define FXAA_QUALITY__P0 1.0
601    #define FXAA_QUALITY__P1 1.5
602    #define FXAA_QUALITY__P2 2.0
603    #define FXAA_QUALITY__P3 2.0
604    #define FXAA_QUALITY__P4 2.0
605    #define FXAA_QUALITY__P5 2.0
606    #define FXAA_QUALITY__P6 2.0
607    #define FXAA_QUALITY__P7 2.0
608    #define FXAA_QUALITY__P8 2.0
609    #define FXAA_QUALITY__P9 2.0
610    #define FXAA_QUALITY__P10 4.0
611    #define FXAA_QUALITY__P11 8.0
612#endif
613
614/*============================================================================
615                     FXAA QUALITY - EXTREME QUALITY

  Preset 39 only. FXAA_QUALITY__PS is 12, matching the twelve entries
  P0 up to P11; the first five entries are 1.0 and the last is 8.0.
  NOTE(review): the P<n> values are consumed outside this section,
  presumably as per-step edge search distances - confirm in FxaaPixelShader.
616============================================================================*/
617#if (FXAA_QUALITY__PRESET == 39)
618    #define FXAA_QUALITY__PS 12
619    #define FXAA_QUALITY__P0 1.0
620    #define FXAA_QUALITY__P1 1.0
621    #define FXAA_QUALITY__P2 1.0
622    #define FXAA_QUALITY__P3 1.0
623    #define FXAA_QUALITY__P4 1.0
624    #define FXAA_QUALITY__P5 1.5
625    #define FXAA_QUALITY__P6 2.0
626    #define FXAA_QUALITY__P7 2.0
627    #define FXAA_QUALITY__P8 2.0
628    #define FXAA_QUALITY__P9 2.0
629    #define FXAA_QUALITY__P10 4.0
630    #define FXAA_QUALITY__P11 8.0
631#endif
632
633
634
635/*============================================================================
636
637                                API PORTING
638
639============================================================================*/
640#if (FXAA_GLSL_120 == 1) || (FXAA_GLSL_130 == 1) // GLSL: map the Fxaa* names onto GLSL types
641    #define FxaaBool bool
642    #define FxaaDiscard discard
643    #define FxaaFloat float
644    #define FxaaFloat2 vec2
645    #define FxaaFloat3 vec3
646    #define FxaaFloat4 vec4
647    #define FxaaHalf float // the Half names alias the full precision GLSL types
648    #define FxaaHalf2 vec2
649    #define FxaaHalf3 vec3
650    #define FxaaHalf4 vec4
651    #define FxaaInt2 ivec2
652    #define FxaaSat(x) clamp(x, 0.0, 1.0)
653    #define FxaaTex sampler2D
654#else // any other configuration: HLSL style names; FxaaInt2 and FxaaTex come from the per-API blocks below
655    #define FxaaBool bool
656    #define FxaaDiscard clip(-1)
657    #define FxaaFloat float
658    #define FxaaFloat2 float2
659    #define FxaaFloat3 float3
660    #define FxaaFloat4 float4
661    #define FxaaHalf half
662    #define FxaaHalf2 half2
663    #define FxaaHalf3 half3
664    #define FxaaHalf4 half4
665    #define FxaaSat(x) saturate(x)
666#endif
667/*--------------------------------------------------------------------------*/
668#if (FXAA_GLSL_120 == 1)
669    // Requires,
670    //  #version 120
671    // And at least,
672    //  #extension GL_EXT_gpu_shader4 : enable
673    //  (or set FXAA_FAST_PIXEL_OFFSET 0 to offset coordinates manually, like the HLSL 3 path)
674    #define FxaaTexTop(t, p) texture2DLod(t, p, 0.0)
675    #if (FXAA_FAST_PIXEL_OFFSET == 1)
676        #define FxaaTexOff(t, p, o, r) texture2DLodOffset(t, p, 0.0, o) // offset o passed to the fetch; r unused
677    #else
678        #define FxaaTexOff(t, p, o, r) texture2DLod(t, p + (o * r), 0.0) // manual offset: o scaled by r, added to p
679    #endif
680    #if (FXAA_GATHER4_ALPHA == 1)
681        // use #extension GL_ARB_gpu_shader5 : enable
682        #define FxaaTexAlpha4(t, p) textureGather(t, p, 3) // component 3 = alpha
683        #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3)
684        #define FxaaTexGreen4(t, p) textureGather(t, p, 1) // component 1 = green
685        #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1)
686    #endif
687#endif
688/*--------------------------------------------------------------------------*/
689#if (FXAA_GLSL_130 == 1)
690    // Requires "#version 130" or better
691    #define FxaaTexTop(t, p) textureLod(t, p, 0.0)
692    #define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o) // r is unused on this path
693    #if (FXAA_GATHER4_ALPHA == 1)
694        // use #extension GL_ARB_gpu_shader5 : enable
695        #define FxaaTexAlpha4(t, p) textureGather(t, p, 3) // component 3 = alpha
696        #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3)
697        #define FxaaTexGreen4(t, p) textureGather(t, p, 1) // component 1 = green
698        #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1)
699    #endif
700#endif
701/*--------------------------------------------------------------------------*/
702#if (FXAA_HLSL_3 == 1) || (FXAA_360 == 1) || (FXAA_PS3 == 1) // tex2Dlod based sampling, shared by HLSL 3, 360 and PS3
703    #define FxaaInt2 float2 // offsets are float2 on this path; FxaaTexOff multiplies them by r
704    #define FxaaTex sampler2D
705    #define FxaaTexTop(t, p) tex2Dlod(t, float4(p, 0.0, 0.0))
706    #define FxaaTexOff(t, p, o, r) tex2Dlod(t, float4(p + (o * r), 0, 0)) // manual offset: o scaled by r, added to p
707#endif
708/*--------------------------------------------------------------------------*/
709#if (FXAA_HLSL_4 == 1) // do not combine with FXAA_HLSL_5: both blocks declare struct FxaaTex
710    #define FxaaInt2 int2
711    struct FxaaTex { SamplerState smpl; Texture2D tex; }; // sampler state and texture bundled into one handle
712    #define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0)
713    #define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o) // offset o passed to SampleLevel; r unused
714#endif
715/*--------------------------------------------------------------------------*/
716#if (FXAA_HLSL_5 == 1) // do not combine with FXAA_HLSL_4: both blocks declare struct FxaaTex
717    #define FxaaInt2 int2
718    struct FxaaTex { SamplerState smpl; Texture2D tex; }; // sampler state and texture bundled into one handle
719    #define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0)
720    #define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o) // offset o passed to SampleLevel; r unused
721    #define FxaaTexAlpha4(t, p) t.tex.GatherAlpha(t.smpl, p)
722    #define FxaaTexOffAlpha4(t, p, o) t.tex.GatherAlpha(t.smpl, p, o)
723    #define FxaaTexGreen4(t, p) t.tex.GatherGreen(t.smpl, p)
724    #define FxaaTexOffGreen4(t, p, o) t.tex.GatherGreen(t.smpl, p, o)
725#endif
726
727
728/*============================================================================
729                   GREEN AS LUMA OPTION SUPPORT FUNCTION
730============================================================================*/
731#if (FXAA_GREEN_AS_LUMA == 0) // default: luma is expected in the alpha channel of the sample
732    FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.w; } // returns the alpha component
733#else // green-as-luma: the green channel stands in for luma
734    FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.y; } // returns the green component
735#endif
736
737
738
739
740/*============================================================================
741
742                             FXAA3 QUALITY - PC
743
744============================================================================*/
745#if (FXAA_PC == 1)
746/*--------------------------------------------------------------------------*/
747FxaaFloat4 FxaaPixelShader(
748    //
749    // Use noperspective interpolation here (turn off perspective interpolation).
750    // {xy} = center of pixel
751    FxaaFloat2 pos,
752    //
753    // Used only for FXAA Console, and not used on the 360 version.
754    // Use noperspective interpolation here (turn off perspective interpolation).
755    // {xy__} = upper left of pixel
756    // {__zw} = lower right of pixel
757    FxaaFloat4 fxaaConsolePosPos,
758    //
759    // Input color texture.
760    // {rgb_} = color in linear or perceptual color space
761    // if (FXAA_GREEN_AS_LUMA == 0)
762    //     {___a} = luma in perceptual color space (not linear)
763    FxaaTex tex,
764    //
765    // Only used on the optimized 360 version of FXAA Console.
766    // For everything but 360, just use the same input here as for "tex".
767    // For 360, same texture, just alias with a 2nd sampler.
768    // This sampler needs to have an exponent bias of -1.
769    FxaaTex fxaaConsole360TexExpBiasNegOne,
770    //
771    // Only used on the optimized 360 version of FXAA Console.
772    // For everything but 360, just use the same input here as for "tex".
773    // For 360, same texture, just alias with a 3nd sampler.
774    // This sampler needs to have an exponent bias of -2.
775    FxaaTex fxaaConsole360TexExpBiasNegTwo,
776    //
777    // Only used on FXAA Quality.
778    // This must be from a constant/uniform.
779    // {x_} = 1.0/screenWidthInPixels
780    // {_y} = 1.0/screenHeightInPixels
781    FxaaFloat2 fxaaQualityRcpFrame,
782    //
783    // Only used on FXAA Console.
784    // This must be from a constant/uniform.
785    // This effects sub-pixel AA quality and inversely sharpness.
786    //   Where N ranges between,
787    //     N = 0.50 (default)
788    //     N = 0.33 (sharper)
789    // {x___} = -N/screenWidthInPixels
790    // {_y__} = -N/screenHeightInPixels
791    // {__z_} =  N/screenWidthInPixels
792    // {___w} =  N/screenHeightInPixels
793    FxaaFloat4 fxaaConsoleRcpFrameOpt,
794    //
795    // Only used on FXAA Console.
796    // Not used on 360, but used on PS3 and PC.
797    // This must be from a constant/uniform.
798    // {x___} = -2.0/screenWidthInPixels
799    // {_y__} = -2.0/screenHeightInPixels
800    // {__z_} =  2.0/screenWidthInPixels
801    // {___w} =  2.0/screenHeightInPixels
802    FxaaFloat4 fxaaConsoleRcpFrameOpt2,
803    //
804    // Only used on FXAA Console.
805    // Only used on 360 in place of fxaaConsoleRcpFrameOpt2.
806    // This must be from a constant/uniform.
807    // {x___} =  8.0/screenWidthInPixels
808    // {_y__} =  8.0/screenHeightInPixels
809    // {__z_} = -4.0/screenWidthInPixels
810    // {___w} = -4.0/screenHeightInPixels
811    FxaaFloat4 fxaaConsole360RcpFrameOpt2,
812    //
813    // Only used on FXAA Quality.
814    // This used to be the FXAA_QUALITY__SUBPIX define.
815    // It is here now to allow easier tuning.
816    // Choose the amount of sub-pixel aliasing removal.
817    // This can effect sharpness.
818    //   1.00 - upper limit (softer)
819    //   0.75 - default amount of filtering
820    //   0.50 - lower limit (sharper, less sub-pixel aliasing removal)
821    //   0.25 - almost off
822    //   0.00 - completely off
823    FxaaFloat fxaaQualitySubpix,
824    //
825    // Only used on FXAA Quality.
826    // This used to be the FXAA_QUALITY__EDGE_THRESHOLD define.
827    // It is here now to allow easier tuning.
828    // The minimum amount of local contrast required to apply algorithm.
829    //   0.333 - too little (faster)
830    //   0.250 - low quality
831    //   0.166 - default
832    //   0.125 - high quality
833    //   0.063 - overkill (slower)
834    FxaaFloat fxaaQualityEdgeThreshold,
835    //
836    // Only used on FXAA Quality.
837    // This used to be the FXAA_QUALITY__EDGE_THRESHOLD_MIN define.
838    // It is here now to allow easier tuning.
839    // Trims the algorithm from processing darks.
840    //   0.0833 - upper limit (default, the start of visible unfiltered edges)
841    //   0.0625 - high quality (faster)
842    //   0.0312 - visible limit (slower)
843    // Special notes when using FXAA_GREEN_AS_LUMA,
844    //   Likely want to set this to zero.
845    //   As colors that are mostly not-green
846    //   will appear very dark in the green channel!
847    //   Tune by looking at mostly non-green content,
848    //   then start at zero and increase until aliasing is a problem.
849    FxaaFloat fxaaQualityEdgeThresholdMin,
850    //
851    // Only used on FXAA Console.
852    // This used to be the FXAA_CONSOLE__EDGE_SHARPNESS define.
853    // It is here now to allow easier tuning.
    // This does not affect PS3, as this needs to be compiled in.
    //   Use FXAA_CONSOLE__PS3_EDGE_SHARPNESS for PS3.
    //   Due to the PS3 being ALU bound,
    //   there are only three safe values here: 2 and 4 and 8.
    //   These options use the shader's ability to do a free *|/ by 2|4|8.
859    // For all other platforms can be a non-power of two.
860    //   8.0 is sharper (default!!!)
861    //   4.0 is softer
862    //   2.0 is really soft (good only for vector graphics inputs)
863    FxaaFloat fxaaConsoleEdgeSharpness,
864    //
865    // Only used on FXAA Console.
866    // This used to be the FXAA_CONSOLE__EDGE_THRESHOLD define.
867    // It is here now to allow easier tuning.
    // This does not affect PS3, as this needs to be compiled in.
    //   Use FXAA_CONSOLE__PS3_EDGE_THRESHOLD for PS3.
    //   Due to the PS3 being ALU bound,
    //   there are only two safe values here: 1/4 and 1/8.
    //   These options use the shader's ability to do a free *|/ by 2|4|8.
873    // The console setting has a different mapping than the quality setting.
874    // Other platforms can use other values.
875    //   0.125 leaves less aliasing, but is softer (default!!!)
876    //   0.25 leaves more aliasing, and is sharper
877    FxaaFloat fxaaConsoleEdgeThreshold,
878    //
879    // Only used on FXAA Console.
880    // This used to be the FXAA_CONSOLE__EDGE_THRESHOLD_MIN define.
881    // It is here now to allow easier tuning.
882    // Trims the algorithm from processing darks.
883    // The console setting has a different mapping than the quality setting.
884    // This only applies when FXAA_EARLY_EXIT is 1.
885    // This does not apply to PS3,
886    // PS3 was simplified to avoid more shader instructions.
887    //   0.06 - faster but more aliasing in darks
888    //   0.05 - default
889    //   0.04 - slower and less aliasing in darks
890    // Special notes when using FXAA_GREEN_AS_LUMA,
891    //   Likely want to set this to zero.
892    //   As colors that are mostly not-green
893    //   will appear very dark in the green channel!
894    //   Tune by looking at mostly non-green content,
895    //   then start at zero and increase until aliasing is a problem.
896    FxaaFloat fxaaConsoleEdgeThresholdMin,
897    //
898    // Extra constants for 360 FXAA Console only.
899    // Use zeros or anything else for other platforms.
    // These must be in physical constant registers and NOT immediates.
    // Immediates will result in compiler un-optimizing.
902    // {xyzw} = float4(1.0, -1.0, 0.25, -0.25)
903    FxaaFloat4 fxaaConsole360ConstDir
904) {
905/*--------------------------------------------------------------------------*/
906    FxaaFloat2 posM;
907    posM.x = pos.x;
908    posM.y = pos.y;
909    #if (FXAA_GATHER4_ALPHA == 1)
910        #if (FXAA_DISCARD == 0)
911            FxaaFloat4 rgbyM = FxaaTexTop(tex, posM);
912            #if (FXAA_GREEN_AS_LUMA == 0)
913                #define lumaM rgbyM.w
914            #else
915                #define lumaM rgbyM.y
916            #endif
917        #endif
918        #if (FXAA_GREEN_AS_LUMA == 0)
919            FxaaFloat4 luma4A = FxaaTexAlpha4(tex, posM);
920            FxaaFloat4 luma4B = FxaaTexOffAlpha4(tex, posM, FxaaInt2(-1, -1));
921        #else
922            FxaaFloat4 luma4A = FxaaTexGreen4(tex, posM);
923            FxaaFloat4 luma4B = FxaaTexOffGreen4(tex, posM, FxaaInt2(-1, -1));
924        #endif
925        #if (FXAA_DISCARD == 1)
926            #define lumaM luma4A.w
927        #endif
928        #define lumaE luma4A.z
929        #define lumaS luma4A.x
930        #define lumaSE luma4A.y
931        #define lumaNW luma4B.w
932        #define lumaN luma4B.z
933        #define lumaW luma4B.x
934    #else
935        FxaaFloat4 rgbyM = FxaaTexTop(tex, posM);
936        #if (FXAA_GREEN_AS_LUMA == 0)
937            #define lumaM rgbyM.w
938        #else
939            #define lumaM rgbyM.y
940        #endif
941        FxaaFloat lumaS = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0, 1), fxaaQualityRcpFrame.xy));
942        FxaaFloat lumaE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 0), fxaaQualityRcpFrame.xy));
943        FxaaFloat lumaN = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0,-1), fxaaQualityRcpFrame.xy));
944        FxaaFloat lumaW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 0), fxaaQualityRcpFrame.xy));
945    #endif
946/*--------------------------------------------------------------------------*/
947    FxaaFloat maxSM = max(lumaS, lumaM);
948    FxaaFloat minSM = min(lumaS, lumaM);
949    FxaaFloat maxESM = max(lumaE, maxSM);
950    FxaaFloat minESM = min(lumaE, minSM);
951    FxaaFloat maxWN = max(lumaN, lumaW);
952    FxaaFloat minWN = min(lumaN, lumaW);
953    FxaaFloat rangeMax = max(maxWN, maxESM);
954    FxaaFloat rangeMin = min(minWN, minESM);
955    FxaaFloat rangeMaxScaled = rangeMax * fxaaQualityEdgeThreshold;
956    FxaaFloat range = rangeMax - rangeMin;
957    FxaaFloat rangeMaxClamped = max(fxaaQualityEdgeThresholdMin, rangeMaxScaled);
958    FxaaBool earlyExit = range < rangeMaxClamped;
959/*--------------------------------------------------------------------------*/
960    if(earlyExit)
961        #if (FXAA_DISCARD == 1)
962            FxaaDiscard;
963        #else
964            return rgbyM;
965        #endif
966/*--------------------------------------------------------------------------*/
967    #if (FXAA_GATHER4_ALPHA == 0)
968        FxaaFloat lumaNW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1,-1), fxaaQualityRcpFrame.xy));
969        FxaaFloat lumaSE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 1), fxaaQualityRcpFrame.xy));
970        FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1,-1), fxaaQualityRcpFrame.xy));
971        FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy));
972    #else
973        FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(1, -1), fxaaQualityRcpFrame.xy));
974        FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy));
975    #endif
976/*--------------------------------------------------------------------------*/
977    FxaaFloat lumaNS = lumaN + lumaS;
978    FxaaFloat lumaWE = lumaW + lumaE;
979    FxaaFloat subpixRcpRange = 1.0/range;
980    FxaaFloat subpixNSWE = lumaNS + lumaWE;
981    FxaaFloat edgeHorz1 = (-2.0 * lumaM) + lumaNS;
982    FxaaFloat edgeVert1 = (-2.0 * lumaM) + lumaWE;
983/*--------------------------------------------------------------------------*/
984    FxaaFloat lumaNESE = lumaNE + lumaSE;
985    FxaaFloat lumaNWNE = lumaNW + lumaNE;
986    FxaaFloat edgeHorz2 = (-2.0 * lumaE) + lumaNESE;
987    FxaaFloat edgeVert2 = (-2.0 * lumaN) + lumaNWNE;
988/*--------------------------------------------------------------------------*/
989    FxaaFloat lumaNWSW = lumaNW + lumaSW;
990    FxaaFloat lumaSWSE = lumaSW + lumaSE;
991    FxaaFloat edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2);
992    FxaaFloat edgeVert4 = (abs(edgeVert1) * 2.0) + abs(edgeVert2);
993    FxaaFloat edgeHorz3 = (-2.0 * lumaW) + lumaNWSW;
994    FxaaFloat edgeVert3 = (-2.0 * lumaS) + lumaSWSE;
995    FxaaFloat edgeHorz = abs(edgeHorz3) + edgeHorz4;
996    FxaaFloat edgeVert = abs(edgeVert3) + edgeVert4;
997/*--------------------------------------------------------------------------*/
998    FxaaFloat subpixNWSWNESE = lumaNWSW + lumaNESE;
999    FxaaFloat lengthSign = fxaaQualityRcpFrame.x;
1000    FxaaBool horzSpan = edgeHorz >= edgeVert;
1001    FxaaFloat subpixA = subpixNSWE * 2.0 + subpixNWSWNESE;
1002/*--------------------------------------------------------------------------*/
1003    if(!horzSpan) lumaN = lumaW;
1004    if(!horzSpan) lumaS = lumaE;
1005    if(horzSpan) lengthSign = fxaaQualityRcpFrame.y;
1006    FxaaFloat subpixB = (subpixA * (1.0/12.0)) - lumaM;
1007/*--------------------------------------------------------------------------*/
1008    FxaaFloat gradientN = lumaN - lumaM;
1009    FxaaFloat gradientS = lumaS - lumaM;
1010    FxaaFloat lumaNN = lumaN + lumaM;
1011    FxaaFloat lumaSS = lumaS + lumaM;
1012    FxaaBool pairN = abs(gradientN) >= abs(gradientS);
1013    FxaaFloat gradient = max(abs(gradientN), abs(gradientS));
1014    if(pairN) lengthSign = -lengthSign;
1015    FxaaFloat subpixC = FxaaSat(abs(subpixB) * subpixRcpRange);
1016/*--------------------------------------------------------------------------*/
1017    FxaaFloat2 posB;
1018    posB.x = posM.x;
1019    posB.y = posM.y;
1020    FxaaFloat2 offNP;
1021    offNP.x = (!horzSpan) ? 0.0 : fxaaQualityRcpFrame.x;
1022    offNP.y = ( horzSpan) ? 0.0 : fxaaQualityRcpFrame.y;
1023    if(!horzSpan) posB.x += lengthSign * 0.5;
1024    if( horzSpan) posB.y += lengthSign * 0.5;
1025/*--------------------------------------------------------------------------*/
1026    FxaaFloat2 posN;
1027    posN.x = posB.x - offNP.x * FXAA_QUALITY__P0;
1028    posN.y = posB.y - offNP.y * FXAA_QUALITY__P0;
1029    FxaaFloat2 posP;
1030    posP.x = posB.x + offNP.x * FXAA_QUALITY__P0;
1031    posP.y = posB.y + offNP.y * FXAA_QUALITY__P0;
1032    FxaaFloat subpixD = ((-2.0)*subpixC) + 3.0;
1033    FxaaFloat lumaEndN = FxaaLuma(FxaaTexTop(tex, posN));
1034    FxaaFloat subpixE = subpixC * subpixC;
1035    FxaaFloat lumaEndP = FxaaLuma(FxaaTexTop(tex, posP));
1036/*--------------------------------------------------------------------------*/
1037    if(!pairN) lumaNN = lumaSS;
1038    FxaaFloat gradientScaled = gradient * 1.0/4.0;
1039    FxaaFloat lumaMM = lumaM - lumaNN * 0.5;
1040    FxaaFloat subpixF = subpixD * subpixE;
1041    FxaaBool lumaMLTZero = lumaMM < 0.0;
1042/*--------------------------------------------------------------------------*/
1043    lumaEndN -= lumaNN * 0.5;
1044    lumaEndP -= lumaNN * 0.5;
1045    FxaaBool doneN = abs(lumaEndN) >= gradientScaled;
1046    FxaaBool doneP = abs(lumaEndP) >= gradientScaled;
1047    if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P1;
1048    if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P1;
1049    FxaaBool doneNP = (!doneN) || (!doneP);
1050    if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P1;
1051    if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P1;
1052/*--------------------------------------------------------------------------*/
1053    if(doneNP) {
1054        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
1055        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
1056        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
1057        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
1058        doneN = abs(lumaEndN) >= gradientScaled;
1059        doneP = abs(lumaEndP) >= gradientScaled;
1060        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P2;
1061        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P2;
1062        doneNP = (!doneN) || (!doneP);
1063        if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P2;
1064        if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P2;
1065/*--------------------------------------------------------------------------*/
1066        #if (FXAA_QUALITY__PS > 3)
1067        if(doneNP) {
1068            if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
1069            if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
1070            if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
1071            if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
1072            doneN = abs(lumaEndN) >= gradientScaled;
1073            doneP = abs(lumaEndP) >= gradientScaled;
1074            if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P3;
1075            if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P3;
1076            doneNP = (!doneN) || (!doneP);
1077            if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P3;
1078            if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P3;
1079/*--------------------------------------------------------------------------*/
1080            #if (FXAA_QUALITY__PS > 4)
1081            if(doneNP) {
1082                if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
1083                if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
1084                if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
1085                if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
1086                doneN = abs(lumaEndN) >= gradientScaled;
1087                doneP = abs(lumaEndP) >= gradientScaled;
1088                if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P4;
1089                if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P4;
1090                doneNP = (!doneN) || (!doneP);
1091                if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P4;
1092                if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P4;
1093/*--------------------------------------------------------------------------*/
1094                #if (FXAA_QUALITY__PS > 5)
1095                if(doneNP) {
1096                    if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
1097                    if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
1098                    if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
1099                    if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
1100                    doneN = abs(lumaEndN) >= gradientScaled;
1101                    doneP = abs(lumaEndP) >= gradientScaled;
1102                    if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P5;
1103                    if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P5;
1104                    doneNP = (!doneN) || (!doneP);
1105                    if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P5;
1106                    if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P5;
1107/*--------------------------------------------------------------------------*/
1108                    #if (FXAA_QUALITY__PS > 6)
1109                    if(doneNP) {
1110                        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
1111                        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
1112                        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
1113                        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
1114                        doneN = abs(lumaEndN) >= gradientScaled;
1115                        doneP = abs(lumaEndP) >= gradientScaled;
1116                        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P6;
1117                        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P6;
1118                        doneNP = (!doneN) || (!doneP);
1119                        if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P6;
1120                        if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P6;
1121/*--------------------------------------------------------------------------*/
1122                        #if (FXAA_QUALITY__PS > 7)
1123                        if(doneNP) {
1124                            if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
1125                            if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
1126                            if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
1127                            if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
1128                            doneN = abs(lumaEndN) >= gradientScaled;
1129                            doneP = abs(lumaEndP) >= gradientScaled;
1130                            if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P7;
1131                            if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P7;
1132                            doneNP = (!doneN) || (!doneP);
1133                            if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P7;
1134                            if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P7;
1135/*--------------------------------------------------------------------------*/
1136    #if (FXAA_QUALITY__PS > 8)
1137    if(doneNP) {
1138        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
1139        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
1140        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
1141        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
1142        doneN = abs(lumaEndN) >= gradientScaled;
1143        doneP = abs(lumaEndP) >= gradientScaled;
1144        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P8;
1145        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P8;
1146        doneNP = (!doneN) || (!doneP);
1147        if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P8;
1148        if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P8;
1149/*--------------------------------------------------------------------------*/
1150        #if (FXAA_QUALITY__PS > 9)
1151        if(doneNP) {
1152            if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
1153            if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
1154            if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
1155            if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
1156            doneN = abs(lumaEndN) >= gradientScaled;
1157            doneP = abs(lumaEndP) >= gradientScaled;
1158            if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P9;
1159            if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P9;
1160            doneNP = (!doneN) || (!doneP);
1161            if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P9;
1162            if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P9;
1163/*--------------------------------------------------------------------------*/
1164            #if (FXAA_QUALITY__PS > 10)
1165            if(doneNP) {
1166                if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
1167                if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
1168                if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
1169                if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
1170                doneN = abs(lumaEndN) >= gradientScaled;
1171                doneP = abs(lumaEndP) >= gradientScaled;
1172                if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P10;
1173                if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P10;
1174                doneNP = (!doneN) || (!doneP);
1175                if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P10;
1176                if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P10;
1177/*--------------------------------------------------------------------------*/
1178                #if (FXAA_QUALITY__PS > 11)
1179                if(doneNP) {
1180                    if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
1181                    if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
1182                    if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
1183                    if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
1184                    doneN = abs(lumaEndN) >= gradientScaled;
1185                    doneP = abs(lumaEndP) >= gradientScaled;
1186                    if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P11;
1187                    if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P11;
1188                    doneNP = (!doneN) || (!doneP);
1189                    if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P11;
1190                    if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P11;
1191/*--------------------------------------------------------------------------*/
1192                    #if (FXAA_QUALITY__PS > 12)
1193                    if(doneNP) {
1194                        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
1195                        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
1196                        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
1197                        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
1198                        doneN = abs(lumaEndN) >= gradientScaled;
1199                        doneP = abs(lumaEndP) >= gradientScaled;
1200                        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P12;
1201                        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P12;
1202                        doneNP = (!doneN) || (!doneP);
1203                        if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P12;
1204                        if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P12;
1205/*--------------------------------------------------------------------------*/
1206                    }
1207                    #endif
1208/*--------------------------------------------------------------------------*/
1209                }
1210                #endif
1211/*--------------------------------------------------------------------------*/
1212            }
1213            #endif
1214/*--------------------------------------------------------------------------*/
1215        }
1216        #endif
1217/*--------------------------------------------------------------------------*/
1218    }
1219    #endif
1220/*--------------------------------------------------------------------------*/
1221                        }
1222                        #endif
1223/*--------------------------------------------------------------------------*/
1224                    }
1225                    #endif
1226/*--------------------------------------------------------------------------*/
1227                }
1228                #endif
1229/*--------------------------------------------------------------------------*/
1230            }
1231            #endif
1232/*--------------------------------------------------------------------------*/
1233        }
1234        #endif
1235/*--------------------------------------------------------------------------*/
1236    }
1237/*--------------------------------------------------------------------------*/
1238    FxaaFloat dstN = posM.x - posN.x;
1239    FxaaFloat dstP = posP.x - posM.x;
1240    if(!horzSpan) dstN = posM.y - posN.y;
1241    if(!horzSpan) dstP = posP.y - posM.y;
1242/*--------------------------------------------------------------------------*/
1243    FxaaBool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero;
1244    FxaaFloat spanLength = (dstP + dstN);
1245    FxaaBool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero;
1246    FxaaFloat spanLengthRcp = 1.0/spanLength;
1247/*--------------------------------------------------------------------------*/
1248    FxaaBool directionN = dstN < dstP;
1249    FxaaFloat dst = min(dstN, dstP);
1250    FxaaBool goodSpan = directionN ? goodSpanN : goodSpanP;
1251    FxaaFloat subpixG = subpixF * subpixF;
1252    FxaaFloat pixelOffset = (dst * (-spanLengthRcp)) + 0.5;
1253    FxaaFloat subpixH = subpixG * fxaaQualitySubpix;
1254/*--------------------------------------------------------------------------*/
1255    FxaaFloat pixelOffsetGood = goodSpan ? pixelOffset : 0.0;
1256    FxaaFloat pixelOffsetSubpix = max(pixelOffsetGood, subpixH);
1257    if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign;
1258    if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign;
1259    #if (FXAA_DISCARD == 1)
1260        return FxaaTexTop(tex, posM);
1261    #else
1262        return FxaaFloat4(FxaaTexTop(tex, posM).xyz, lumaM);
1263    #endif
1264}
1265/*==========================================================================*/
1266#endif
1267
1268
1269
1270
1271/*============================================================================
1272
1273                         FXAA3 CONSOLE - PC VERSION
1274
1275------------------------------------------------------------------------------
1276Instead of using this on PC, I'd suggest just using FXAA Quality with
1277    #define FXAA_QUALITY__PRESET 10
1278Or
1279    #define FXAA_QUALITY__PRESET 20
Either is higher quality and almost as fast as this on modern PC GPUs.
1281============================================================================*/
1282#if (FXAA_PC_CONSOLE == 1)
1283/*--------------------------------------------------------------------------*/
FxaaFloat4 FxaaPixelShader(
    // The parameter list is shared by every FxaaPixelShader() variant;
    // see the FXAA Quality FxaaPixelShader() source for docs on Inputs.
    // This console/PC variant reads only pos, fxaaConsolePosPos, tex,
    // fxaaConsoleRcpFrameOpt, fxaaConsoleRcpFrameOpt2 and the three
    // fxaaConsoleEdge* tuning values; the remaining inputs are unused here.
    FxaaFloat2 pos,
    FxaaFloat4 fxaaConsolePosPos,
    FxaaTex tex,
    FxaaTex fxaaConsole360TexExpBiasNegOne,
    FxaaTex fxaaConsole360TexExpBiasNegTwo,
    FxaaFloat2 fxaaQualityRcpFrame,
    FxaaFloat4 fxaaConsoleRcpFrameOpt,
    FxaaFloat4 fxaaConsoleRcpFrameOpt2,
    FxaaFloat4 fxaaConsole360RcpFrameOpt2,
    FxaaFloat fxaaQualitySubpix,
    FxaaFloat fxaaQualityEdgeThreshold,
    FxaaFloat fxaaQualityEdgeThresholdMin,
    FxaaFloat fxaaConsoleEdgeSharpness,
    FxaaFloat fxaaConsoleEdgeThreshold,
    FxaaFloat fxaaConsoleEdgeThresholdMin,
    FxaaFloat4 fxaaConsole360ConstDir
) {
/*--------------------------------------------------------------------------*/
    // Luma of the four diagonal taps addressed by fxaaConsolePosPos
    // (.xy = NW, .xw = SW, .zy = NE, .zw = SE).
    FxaaFloat cornerNw = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.xy));
    FxaaFloat cornerSw = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.xw));
    FxaaFloat cornerNe = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.zy));
    FxaaFloat cornerSe = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.zw));
    // NOTE(review): small constant bias on the NE tap; presumably keeps the
    // direction vector below from being exactly zero on flat corners - confirm.
    cornerNe += 1.0/384.0;
/*--------------------------------------------------------------------------*/
    // Center texel. Its luma comes from the alpha channel, or from green
    // when FXAA_GREEN_AS_LUMA is enabled.
    FxaaFloat4 centerTap = FxaaTexTop(tex, pos.xy);
    #if (FXAA_GREEN_AS_LUMA == 0)
        FxaaFloat centerLuma = centerTap.w;
    #else
        FxaaFloat centerLuma = centerTap.y;
    #endif
/*--------------------------------------------------------------------------*/
    // Luma extremes over the four (biased) corner taps only.
    FxaaFloat cornerMax = max(max(cornerNe, cornerSe), max(cornerNw, cornerSw));
    FxaaFloat cornerMin = min(min(cornerNe, cornerSe), min(cornerNw, cornerSw));
/*--------------------------------------------------------------------------*/
    // Early out: leave the pixel untouched when the local contrast
    // (corners plus center) is below the scaled/clamped threshold.
    FxaaFloat contrastFloor = max(fxaaConsoleEdgeThresholdMin, cornerMax * fxaaConsoleEdgeThreshold);
    FxaaFloat localContrast = max(cornerMax, centerLuma) - min(cornerMin, centerLuma);
    if(localContrast < contrastFloor) return centerTap;
/*--------------------------------------------------------------------------*/
    // Blur direction built from the two diagonal luma differences.
    FxaaFloat diagSwNe = cornerSw - cornerNe;
    FxaaFloat diagSeNw = cornerSe - cornerNw;
    FxaaFloat2 edgeDir;
    edgeDir.x = diagSwNe + diagSeNw;
    edgeDir.y = diagSwNe - diagSeNw;
/*--------------------------------------------------------------------------*/
    // First tap pair: unit direction scaled by fxaaConsoleRcpFrameOpt.zw.
    FxaaFloat2 unitDir = normalize(edgeDir.xy);
    FxaaFloat2 nearStep = unitDir * fxaaConsoleRcpFrameOpt.zw;
    FxaaFloat4 nearNeg = FxaaTexTop(tex, pos.xy - nearStep);
    FxaaFloat4 nearPos = FxaaTexTop(tex, pos.xy + nearStep);
/*--------------------------------------------------------------------------*/
    // Second tap pair: direction stretched by the sharpness term, clamped
    // to [-2, 2] per component, then scaled by fxaaConsoleRcpFrameOpt2.zw.
    FxaaFloat sharpDenom = min(abs(unitDir.x), abs(unitDir.y)) * fxaaConsoleEdgeSharpness;
    FxaaFloat2 wideDir = clamp(unitDir.xy / sharpDenom, -2.0, 2.0);
    FxaaFloat2 wideStep = wideDir * fxaaConsoleRcpFrameOpt2.zw;
    FxaaFloat4 wideNeg = FxaaTexTop(tex, pos.xy - wideStep);
    FxaaFloat4 widePos = FxaaTexTop(tex, pos.xy + wideStep);
/*--------------------------------------------------------------------------*/
    // Four-tap average; the two-tap sum is kept around for the fallback.
    FxaaFloat4 twoTapSum = nearNeg + nearPos;
    FxaaFloat4 blended = ((wideNeg + widePos) * 0.25) + (twoTapSum * 0.25);
/*--------------------------------------------------------------------------*/
    // If the four-tap luma leaves the corner luma range, replace the color
    // (xyz only) with the two-tap average; the fourth component is kept.
    #if (FXAA_GREEN_AS_LUMA == 0)
        FxaaFloat blendedLuma = blended.w;
    #else
        FxaaFloat blendedLuma = blended.y;
    #endif
    FxaaBool outsideRange = (blendedLuma < cornerMin) || (blendedLuma > cornerMax);
    if(outsideRange) blended.xyz = twoTapSum.xyz * 0.5;
    return blended;
}
1360/*==========================================================================*/
1361#endif
1362
1363
1364
1365/*============================================================================
1366
1367                      FXAA3 CONSOLE - 360 PIXEL SHADER
1368
1369------------------------------------------------------------------------------
1370This optimized version thanks to suggestions from Andy Luedke.
1371Should be fully tex bound in all cases.
1372As of the FXAA 3.11 release, I have still not tested this code,
1373however I fixed a bug which was in both FXAA 3.9 and FXAA 3.10.
1374And note this is replacing the old unoptimized version.
1375If it does not work, please let me know so I can fix it.
1376============================================================================*/
1377#if (FXAA_360 == 1)
1378/*--------------------------------------------------------------------------*/
1379[reduceTempRegUsage(4)]
1380float4 FxaaPixelShader(
1381    // See FXAA Quality FxaaPixelShader() source for docs on Inputs!
1382    FxaaFloat2 pos,
1383    FxaaFloat4 fxaaConsolePosPos,
1384    FxaaTex tex,
1385    FxaaTex fxaaConsole360TexExpBiasNegOne,
1386    FxaaTex fxaaConsole360TexExpBiasNegTwo,
1387    FxaaFloat2 fxaaQualityRcpFrame,
1388    FxaaFloat4 fxaaConsoleRcpFrameOpt,
1389    FxaaFloat4 fxaaConsoleRcpFrameOpt2,
1390    FxaaFloat4 fxaaConsole360RcpFrameOpt2,
1391    FxaaFloat fxaaQualitySubpix,
1392    FxaaFloat fxaaQualityEdgeThreshold,
1393    FxaaFloat fxaaQualityEdgeThresholdMin,
1394    FxaaFloat fxaaConsoleEdgeSharpness,
1395    FxaaFloat fxaaConsoleEdgeThreshold,
1396    FxaaFloat fxaaConsoleEdgeThresholdMin,
1397    FxaaFloat4 fxaaConsole360ConstDir
1398) {
1399/*--------------------------------------------------------------------------*/
1400    float4 lumaNwNeSwSe;
1401    #if (FXAA_GREEN_AS_LUMA == 0)
1402        asm {
1403            tfetch2D lumaNwNeSwSe.w___, tex, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false
1404            tfetch2D lumaNwNeSwSe._w__, tex, pos.xy, OffsetX =  0.5, OffsetY = -0.5, UseComputedLOD=false
1405            tfetch2D lumaNwNeSwSe.__w_, tex, pos.xy, OffsetX = -0.5, OffsetY =  0.5, UseComputedLOD=false
1406            tfetch2D lumaNwNeSwSe.___w, tex, pos.xy, OffsetX =  0.5, OffsetY =  0.5, UseComputedLOD=false
1407        };
1408    #else
1409        asm {
1410            tfetch2D lumaNwNeSwSe.y___, tex, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false
1411            tfetch2D lumaNwNeSwSe._y__, tex, pos.xy, OffsetX =  0.5, OffsetY = -0.5, UseComputedLOD=false
1412            tfetch2D lumaNwNeSwSe.__y_, tex, pos.xy, OffsetX = -0.5, OffsetY =  0.5, UseComputedLOD=false
1413            tfetch2D lumaNwNeSwSe.___y, tex, pos.xy, OffsetX =  0.5, OffsetY =  0.5, UseComputedLOD=false
1414        };
1415    #endif
1416/*--------------------------------------------------------------------------*/
1417    lumaNwNeSwSe.y += 1.0/384.0;
1418    float2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);
1419    float2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);
1420    float lumaMin = min(lumaMinTemp.x, lumaMinTemp.y);
1421    float lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y);
1422/*--------------------------------------------------------------------------*/
1423    float4 rgbyM = tex2Dlod(tex, float4(pos.xy, 0.0, 0.0));
1424    #if (FXAA_GREEN_AS_LUMA == 0)
1425        float lumaMinM = min(lumaMin, rgbyM.w);
1426        float lumaMaxM = max(lumaMax, rgbyM.w);
1427    #else
1428        float lumaMinM = min(lumaMin, rgbyM.y);
1429        float lumaMaxM = max(lumaMax, rgbyM.y);
1430    #endif
1431    if((lumaMaxM - lumaMinM) < max(fxaaConsoleEdgeThresholdMin, lumaMax * fxaaConsoleEdgeThreshold)) return rgbyM;
1432/*--------------------------------------------------------------------------*/
1433    float2 dir;
1434    dir.x = dot(lumaNwNeSwSe, fxaaConsole360ConstDir.yyxx);
1435    dir.y = dot(lumaNwNeSwSe, fxaaConsole360ConstDir.xyxy);
1436    dir = normalize(dir);
1437/*--------------------------------------------------------------------------*/
1438    float4 dir1 = dir.xyxy * fxaaConsoleRcpFrameOpt.xyzw;
1439/*--------------------------------------------------------------------------*/
1440    float4 dir2;
1441    float dirAbsMinTimesC = min(abs(dir.x), abs(dir.y)) * fxaaConsoleEdgeSharpness;
1442    dir2 = saturate(fxaaConsole360ConstDir.zzww * dir.xyxy / dirAbsMinTimesC + 0.5);
1443    dir2 = dir2 * fxaaConsole360RcpFrameOpt2.xyxy + fxaaConsole360RcpFrameOpt2.zwzw;
1444/*--------------------------------------------------------------------------*/
1445    float4 rgbyN1 = tex2Dlod(fxaaConsole360TexExpBiasNegOne, float4(pos.xy + dir1.xy, 0.0, 0.0));
1446    float4 rgbyP1 = tex2Dlod(fxaaConsole360TexExpBiasNegOne, float4(pos.xy + dir1.zw, 0.0, 0.0));
1447    float4 rgbyN2 = tex2Dlod(fxaaConsole360TexExpBiasNegTwo, float4(pos.xy + dir2.xy, 0.0, 0.0));
1448    float4 rgbyP2 = tex2Dlod(fxaaConsole360TexExpBiasNegTwo, float4(pos.xy + dir2.zw, 0.0, 0.0));
1449/*--------------------------------------------------------------------------*/
1450    float4 rgbyA = rgbyN1 + rgbyP1;
1451    float4 rgbyB = rgbyN2 + rgbyP2 + rgbyA * 0.5;
1452/*--------------------------------------------------------------------------*/
1453    float4 rgbyR = ((FxaaLuma(rgbyB) - lumaMax) > 0.0) ? rgbyA : rgbyB;
1454    rgbyR = ((FxaaLuma(rgbyB) - lumaMin) > 0.0) ? rgbyR : rgbyA;
1455    return rgbyR; }
1456/*==========================================================================*/
1457#endif
1458
1459
1460
1461/*============================================================================
1462
1463         FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (NO EARLY EXIT)
1464
1465==============================================================================
1466The code below does not exactly match the assembly.
1467I have a feeling that 12 cycles is possible, but was not able to get there.
1468Might have to increase register count to get full performance.
1469Note this shader does not use perspective interpolation.
1470
1471Use the following cgc options,
1472
1473  --fenable-bx2 --fastmath --fastprecision --nofloatbindings
1474
1475------------------------------------------------------------------------------
1476                             NVSHADERPERF OUTPUT
1477------------------------------------------------------------------------------
1478For reference and to aid in debug, output of NVShaderPerf should match this,
1479
1480Shader to schedule:
1481  0: texpkb h0.w(TRUE), v5.zyxx, #0
1482  2: addh h2.z(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x
1483  4: texpkb h0.w(TRUE), v5.xwxx, #0
1484  6: addh h0.z(TRUE), -h2, h0.w
1485  7: texpkb h1.w(TRUE), v5, #0
1486  9: addh h0.x(TRUE), h0.z, -h1.w
1487 10: addh h3.w(TRUE), h0.z, h1
1488 11: texpkb h2.w(TRUE), v5.zwzz, #0
1489 13: addh h0.z(TRUE), h3.w, -h2.w
1490 14: addh h0.x(TRUE), h2.w, h0
1491 15: nrmh h1.xz(TRUE), h0_n
1492 16: minh_m8 h0.x(TRUE), |h1|, |h1.z|
1493 17: maxh h4.w(TRUE), h0, h1
1494 18: divx h2.xy(TRUE), h1_n.xzzw, h0_n
1495 19: movr r1.zw(TRUE), v4.xxxy
1496 20: madr r2.xz(TRUE), -h1, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zzww, r1.zzww
1497 22: minh h5.w(TRUE), h0, h1
1498 23: texpkb h0(TRUE), r2.xzxx, #0
1499 25: madr r0.zw(TRUE), h1.xzxz, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w), r1
1500 27: maxh h4.x(TRUE), h2.z, h2.w
1501 28: texpkb h1(TRUE), r0.zwzz, #0
1502 30: addh_d2 h1(TRUE), h0, h1
1503 31: madr r0.xy(TRUE), -h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
1504 33: texpkb h0(TRUE), r0, #0
1505 35: minh h4.z(TRUE), h2, h2.w
1506 36: fenct TRUE
1507 37: madr r1.xy(TRUE), h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
1508 39: texpkb h2(TRUE), r1, #0
1509 41: addh_d2 h0(TRUE), h0, h2
1510 42: maxh h2.w(TRUE), h4, h4.x
1511 43: minh h2.x(TRUE), h5.w, h4.z
1512 44: addh_d2 h0(TRUE), h0, h1
1513 45: slth h2.x(TRUE), h0.w, h2
1514 46: sgth h2.w(TRUE), h0, h2
1515 47: movh h0(TRUE), h0
1516 48: addx.c0 rc(TRUE), h2, h2.w
1517 49: movh h0(c0.NE.x), h1
1518
1519IPU0 ------ Simplified schedule: --------
1520Pass |  Unit  |  uOp |  PC:  Op
1521-----+--------+------+-------------------------
1522   1 | SCT0/1 |  mov |   0:  TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
1523     |    TEX |  txl |   0:  TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
1524     |   SCB1 |  add |   2:  ADDh h2.z, h0.--w-, const.--x-;
1525     |        |      |
1526   2 | SCT0/1 |  mov |   4:  TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;
1527     |    TEX |  txl |   4:  TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;
1528     |   SCB1 |  add |   6:  ADDh h0.z,-h2, h0.--w-;
1529     |        |      |
1530   3 | SCT0/1 |  mov |   7:  TXLr h1.w, g[TEX1], const.xxxx, TEX0;
1531     |    TEX |  txl |   7:  TXLr h1.w, g[TEX1], const.xxxx, TEX0;
1532     |   SCB0 |  add |   9:  ADDh h0.x, h0.z---,-h1.w---;
1533     |   SCB1 |  add |  10:  ADDh h3.w, h0.---z, h1;
1534     |        |      |
1535   4 | SCT0/1 |  mov |  11:  TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
1536     |    TEX |  txl |  11:  TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
1537     |   SCB0 |  add |  14:  ADDh h0.x, h2.w---, h0;
1538     |   SCB1 |  add |  13:  ADDh h0.z, h3.--w-,-h2.--w-;
1539     |        |      |
1540   5 |   SCT1 |  mov |  15:  NRMh h1.xz, h0;
1541     |    SRB |  nrm |  15:  NRMh h1.xz, h0;
1542     |   SCB0 |  min |  16:  MINh*8 h0.x, |h1|, |h1.z---|;
1543     |   SCB1 |  max |  17:  MAXh h4.w, h0, h1;
1544     |        |      |
1545   6 |   SCT0 |  div |  18:  DIVx h2.xy, h1.xz--, h0;
1546     |   SCT1 |  mov |  19:  MOVr r1.zw, g[TEX0].--xy;
1547     |   SCB0 |  mad |  20:  MADr r2.xz,-h1, const.z-w-, r1.z-w-;
1548     |   SCB1 |  min |  22:  MINh h5.w, h0, h1;
1549     |        |      |
1550   7 | SCT0/1 |  mov |  23:  TXLr h0, r2.xzxx, const.xxxx, TEX0;
1551     |    TEX |  txl |  23:  TXLr h0, r2.xzxx, const.xxxx, TEX0;
1552     |   SCB0 |  max |  27:  MAXh h4.x, h2.z---, h2.w---;
1553     |   SCB1 |  mad |  25:  MADr r0.zw, h1.--xz, const, r1;
1554     |        |      |
1555   8 | SCT0/1 |  mov |  28:  TXLr h1, r0.zwzz, const.xxxx, TEX0;
1556     |    TEX |  txl |  28:  TXLr h1, r0.zwzz, const.xxxx, TEX0;
1557     | SCB0/1 |  add |  30:  ADDh/2 h1, h0, h1;
1558     |        |      |
1559   9 |   SCT0 |  mad |  31:  MADr r0.xy,-h2, const.xy--, r1.zw--;
1560     |   SCT1 |  mov |  33:  TXLr h0, r0, const.zzzz, TEX0;
1561     |    TEX |  txl |  33:  TXLr h0, r0, const.zzzz, TEX0;
1562     |   SCB1 |  min |  35:  MINh h4.z, h2, h2.--w-;
1563     |        |      |
1564  10 |   SCT0 |  mad |  37:  MADr r1.xy, h2, const.xy--, r1.zw--;
1565     |   SCT1 |  mov |  39:  TXLr h2, r1, const.zzzz, TEX0;
1566     |    TEX |  txl |  39:  TXLr h2, r1, const.zzzz, TEX0;
1567     | SCB0/1 |  add |  41:  ADDh/2 h0, h0, h2;
1568     |        |      |
1569  11 |   SCT0 |  min |  43:  MINh h2.x, h5.w---, h4.z---;
1570     |   SCT1 |  max |  42:  MAXh h2.w, h4, h4.---x;
1571     | SCB0/1 |  add |  44:  ADDh/2 h0, h0, h1;
1572     |        |      |
1573  12 |   SCT0 |  set |  45:  SLTh h2.x, h0.w---, h2;
1574     |   SCT1 |  set |  46:  SGTh h2.w, h0, h2;
1575     | SCB0/1 |  mul |  47:  MOVh h0, h0;
1576     |        |      |
1577  13 |   SCT0 |  mad |  48:  ADDxc0_s rc, h2, h2.w---;
1578     | SCB0/1 |  mul |  49:  MOVh h0(NE0.xxxx), h1;
1579
1580Pass   SCT  TEX  SCB
1581  1:   0% 100%  25%
1582  2:   0% 100%  25%
1583  3:   0% 100%  50%
1584  4:   0% 100%  50%
1585  5:   0%   0%  50%
1586  6: 100%   0%  75%
1587  7:   0% 100%  75%
1588  8:   0% 100% 100%
1589  9:   0% 100%  25%
1590 10:   0% 100% 100%
1591 11:  50%   0% 100%
1592 12:  50%   0% 100%
1593 13:  25%   0% 100%
1594
1595MEAN:  17%  61%  67%
1596
1597Pass   SCT0  SCT1   TEX  SCB0  SCB1
1598  1:    0%    0%  100%    0%  100%
1599  2:    0%    0%  100%    0%  100%
1600  3:    0%    0%  100%  100%  100%
1601  4:    0%    0%  100%  100%  100%
1602  5:    0%    0%    0%  100%  100%
1603  6:  100%  100%    0%  100%  100%
1604  7:    0%    0%  100%  100%  100%
1605  8:    0%    0%  100%  100%  100%
1606  9:    0%    0%  100%    0%  100%
1607 10:    0%    0%  100%  100%  100%
1608 11:  100%  100%    0%  100%  100%
1609 12:  100%  100%    0%  100%  100%
1610 13:  100%    0%    0%  100%  100%
1611
1612MEAN:   30%   23%   61%   76%  100%
1613Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5
1614Results 13 cycles, 3 r regs, 923,076,923 pixels/s
1615============================================================================*/
1616#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 0)
1617/*--------------------------------------------------------------------------*/
// Compiler directives that apply to this variant only (guarded by the #if).
// NOTE(review): their exact semantics are defined by the PS3 cgc toolchain and
// are not visible in this file; "disablepc all" presumably corresponds to the
// banner's "does not use perspective interpolation" remark -- confirm before
// changing any of them.
1618#pragma regcount 7
1619#pragma disablepc all
1620#pragma option O3
1621#pragma option OutColorPrec=fp16
1622#pragma texformat default RGBA8
1623/*==========================================================================*/
// FxaaPixelShader -- PS3 console variant without early exit.
//
// Fetches the four corner texels addressed by fxaaConsolePosPos, builds an
// edge direction from their luma, and blends taps taken along that direction
// on both sides of `pos`.
//
// Inputs referenced by this body:
//   pos                     - coordinate the directional taps are offset from.
//   fxaaConsolePosPos       - corner coordinates: .xy = NW, .zw = SE,
//                             .zy = NE, .xw = SW (per the local names below).
//   tex                     - texture used for every fetch.
//   fxaaConsoleRcpFrameOpt  - .zw scales the unit direction (near tap pair).
//   fxaaConsoleRcpFrameOpt2 - .zw scales the clamped direction (far tap pair).
// The remaining parameters are not referenced here; they keep the signature
// identical to the other FxaaPixelShader variants in this file. In particular
// the sharpness comes from the FXAA_CONSOLE__PS3_EDGE_SHARPNESS macro, not
// from the fxaaConsoleEdgeSharpness argument.
//
// Luma is read from .w, or from .y when FXAA_GREEN_AS_LUMA is non-zero.
//
// Returns the average of the near and far tap pairs, or the near pair average
// alone when the luma of the wider blend falls outside the min/max luma of
// the four corners.
//
// The "// (n)" markers appear to track the pass numbers of the NVShaderPerf
// schedule in the banner above; keep the statement order when editing.
1624half4 FxaaPixelShader(
1625    // See FXAA Quality FxaaPixelShader() source for docs on Inputs!
1626    FxaaFloat2 pos,
1627    FxaaFloat4 fxaaConsolePosPos,
1628    FxaaTex tex,
1629    FxaaTex fxaaConsole360TexExpBiasNegOne,
1630    FxaaTex fxaaConsole360TexExpBiasNegTwo,
1631    FxaaFloat2 fxaaQualityRcpFrame,
1632    FxaaFloat4 fxaaConsoleRcpFrameOpt,
1633    FxaaFloat4 fxaaConsoleRcpFrameOpt2,
1634    FxaaFloat4 fxaaConsole360RcpFrameOpt2,
1635    FxaaFloat fxaaQualitySubpix,
1636    FxaaFloat fxaaQualityEdgeThreshold,
1637    FxaaFloat fxaaQualityEdgeThresholdMin,
1638    FxaaFloat fxaaConsoleEdgeSharpness,
1639    FxaaFloat fxaaConsoleEdgeThreshold,
1640    FxaaFloat fxaaConsoleEdgeThresholdMin,
1641    FxaaFloat4 fxaaConsole360ConstDir
1642) {
1643/*--------------------------------------------------------------------------*/
1644// (1)
    // NE corner. Its luma gets a 1/512 bias and seeds both gradient terms
    // with -NE; with four equal corner lumas the bias alone leaves
    // dir.x = dir.z = -1/512, so normalize() in (5) is not fed a zero vector.
1645    half4 dir;
    // normalize(dir.xyz) in (5) reads .y, which is never written otherwise;
    // give it a defined value, as the early-exit variant does.
    dir.y = 0.0;
1646    half4 lumaNe = h4tex2Dlod(tex, half4(fxaaConsolePosPos.zy, 0, 0));
1647    #if (FXAA_GREEN_AS_LUMA == 0)
1648        lumaNe.w += half(1.0/512.0);
1649        dir.x = -lumaNe.w;
1650        dir.z = -lumaNe.w;
1651    #else
1652        lumaNe.y += half(1.0/512.0);
1653        dir.x = -lumaNe.y;
1654        dir.z = -lumaNe.y;
1655    #endif
1656/*--------------------------------------------------------------------------*/
1657// (2)
    // SW corner: added to both gradient terms.
1658    half4 lumaSw = h4tex2Dlod(tex, half4(fxaaConsolePosPos.xw, 0, 0));
1659    #if (FXAA_GREEN_AS_LUMA == 0)
1660        dir.x += lumaSw.w;
1661        dir.z += lumaSw.w;
1662    #else
1663        dir.x += lumaSw.y;
1664        dir.z += lumaSw.y;
1665    #endif
1666/*--------------------------------------------------------------------------*/
1667// (3)
    // NW corner: subtracted from x, added to z.
1668    half4 lumaNw = h4tex2Dlod(tex, half4(fxaaConsolePosPos.xy, 0, 0));
1669    #if (FXAA_GREEN_AS_LUMA == 0)
1670        dir.x -= lumaNw.w;
1671        dir.z += lumaNw.w;
1672    #else
1673        dir.x -= lumaNw.y;
1674        dir.z += lumaNw.y;
1675    #endif
1676/*--------------------------------------------------------------------------*/
1677// (4)
    // SE corner: added to x, subtracted from z. After this step
    //   dir.x = (SW + SE) - (NW + NE)
    //   dir.z = (SW + NW) - (NE + SE)
1678    half4 lumaSe = h4tex2Dlod(tex, half4(fxaaConsolePosPos.zw, 0, 0));
1679    #if (FXAA_GREEN_AS_LUMA == 0)
1680        dir.x += lumaSe.w;
1681        dir.z -= lumaSe.w;
1682    #else
1683        dir.x += lumaSe.y;
1684        dir.z -= lumaSe.y;
1685    #endif
1686/*--------------------------------------------------------------------------*/
1687// (5)
    // Unit direction (x and z of the 3-component normalize), plus the smaller
    // absolute component scaled by the compile-time sharpness macro.
1688    half4 dir1_pos;
1689    dir1_pos.xy = normalize(dir.xyz).xz;
1690    half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__PS3_EDGE_SHARPNESS);
1691/*--------------------------------------------------------------------------*/
1692// (6)
    // Far-tap direction: unit direction divided by the scaled minimum,
    // clamped to [-2, 2]. The .zw halves of both helpers carry pos.
1693    half4 dir2_pos;
1694    dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0));
1695    dir1_pos.zw = pos.xy;
1696    dir2_pos.zw = pos.xy;
    // Coordinate of the near tap on the negative side.
1697    half4 temp1N;
1698    temp1N.xy = dir1_pos.zw - dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw;
1699/*--------------------------------------------------------------------------*/
1700// (7)
    // Fetch the negative near tap (the coordinate variable is reused for the
    // fetched colour) and set up the positive near tap coordinate.
1701    temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));
1702    half4 rgby1;
1703    rgby1.xy = dir1_pos.zw + dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw;
1704/*--------------------------------------------------------------------------*/
1705// (8)
    // rgby1 = average of the two near taps.
1706    rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));
1707    rgby1 = (temp1N + rgby1) * 0.5;
1708/*--------------------------------------------------------------------------*/
1709// (9)
    // Negative far tap.
1710    half4 temp2N;
1711    temp2N.xy = dir2_pos.zw - dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw;
1712    temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));
1713/*--------------------------------------------------------------------------*/
1714// (10)
    // Positive far tap; rgby2 = average of the two far taps.
1715    half4 rgby2;
1716    rgby2.xy = dir2_pos.zw + dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw;
1717    rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));
1718    rgby2 = (temp2N + rgby2) * 0.5;
1719/*--------------------------------------------------------------------------*/
1720// (11)
1721    // compiler moves these scalar ops up to other cycles
    // Min/max luma over the four corners (lumaNe includes the 1/512 bias).
1722    #if (FXAA_GREEN_AS_LUMA == 0)
1723        half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w));
1724        half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w));
1725    #else
1726        half lumaMin = min(min(lumaNw.y, lumaSw.y), min(lumaNe.y, lumaSe.y));
1727        half lumaMax = max(max(lumaNw.y, lumaSw.y), max(lumaNe.y, lumaSe.y));
1728    #endif
    // Blend the far pair with the near pair.
1729    rgby2 = (rgby2 + rgby1) * 0.5;
1730/*--------------------------------------------------------------------------*/
1731// (12)
    // Does the luma of the wider blend leave the corner luma range?
1732    #if (FXAA_GREEN_AS_LUMA == 0)
1733        bool twoTapLt = rgby2.w < lumaMin;
1734        bool twoTapGt = rgby2.w > lumaMax;
1735    #else
1736        bool twoTapLt = rgby2.y < lumaMin;
1737        bool twoTapGt = rgby2.y > lumaMax;
1738    #endif
1739/*--------------------------------------------------------------------------*/
1740// (13)
    // If so, fall back to the near-pair average.
1741    if(twoTapLt || twoTapGt) rgby2 = rgby1;
1742/*--------------------------------------------------------------------------*/
1743    return rgby2; }
1744/*==========================================================================*/
1745#endif
1746
1747
1748
1749/*============================================================================
1750
1751       FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (WITH EARLY EXIT)
1752
1753==============================================================================
1754The code mostly matches the assembly.
1755I have a feeling that 14 cycles is possible, but was not able to get there.
1756Might have to increase register count to get full performance.
1757Note this shader does not use perspective interpolation.
1758
1759Use the following cgc options,
1760
1761 --fenable-bx2 --fastmath --fastprecision --nofloatbindings
1762
1763Use of FXAA_GREEN_AS_LUMA currently adds a cycle (16 clks).
1764Will look at fixing this for FXAA 3.12.
1765------------------------------------------------------------------------------
1766                             NVSHADERPERF OUTPUT
1767------------------------------------------------------------------------------
1768For reference and to aid in debug, output of NVShaderPerf should match this,
1769
1770Shader to schedule:
1771  0: texpkb h0.w(TRUE), v5.zyxx, #0
1772  2: addh h2.y(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x
1773  4: texpkb h1.w(TRUE), v5.xwxx, #0
1774  6: addh h0.x(TRUE), h1.w, -h2.y
1775  7: texpkb h2.w(TRUE), v5.zwzz, #0
1776  9: minh h4.w(TRUE), h2.y, h2
1777 10: maxh h5.x(TRUE), h2.y, h2.w
1778 11: texpkb h0.w(TRUE), v5, #0
1779 13: addh h3.w(TRUE), -h0, h0.x
1780 14: addh h0.x(TRUE), h0.w, h0
1781 15: addh h0.z(TRUE), -h2.w, h0.x
1782 16: addh h0.x(TRUE), h2.w, h3.w
1783 17: minh h5.y(TRUE), h0.w, h1.w
1784 18: nrmh h2.xz(TRUE), h0_n
1785 19: minh_m8 h2.w(TRUE), |h2.x|, |h2.z|
1786 20: divx h4.xy(TRUE), h2_n.xzzw, h2_n.w
1787 21: movr r1.zw(TRUE), v4.xxxy
1788 22: maxh h2.w(TRUE), h0, h1
1789 23: fenct TRUE
1790 24: madr r0.xy(TRUE), -h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz
1791 26: texpkb h0(TRUE), r0, #0
1792 28: maxh h5.x(TRUE), h2.w, h5
1793 29: minh h5.w(TRUE), h5.y, h4
1794 30: madr r1.xy(TRUE), h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz
1795 32: texpkb h2(TRUE), r1, #0
1796 34: addh_d2 h2(TRUE), h0, h2
1797 35: texpkb h1(TRUE), v4, #0
1798 37: maxh h5.y(TRUE), h5.x, h1.w
1799 38: minh h4.w(TRUE), h1, h5
1800 39: madr r0.xy(TRUE), -h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
1801 41: texpkb h0(TRUE), r0, #0
1802 43: addh_m8 h5.z(TRUE), h5.y, -h4.w
1803 44: madr r2.xy(TRUE), h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
1804 46: texpkb h3(TRUE), r2, #0
1805 48: addh_d2 h0(TRUE), h0, h3
1806 49: addh_d2 h3(TRUE), h0, h2
1807 50: movh h0(TRUE), h3
1808 51: slth h3.x(TRUE), h3.w, h5.w
1809 52: sgth h3.w(TRUE), h3, h5.x
1810 53: addx.c0 rc(TRUE), h3.x, h3
1811 54: slth.c0 rc(TRUE), h5.z, h5
1812 55: movh h0(c0.NE.w), h2
1813 56: movh h0(c0.NE.x), h1
1814
1815IPU0 ------ Simplified schedule: --------
1816Pass |  Unit  |  uOp |  PC:  Op
1817-----+--------+------+-------------------------
1818   1 | SCT0/1 |  mov |   0:  TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
1819     |    TEX |  txl |   0:  TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
1820     |   SCB0 |  add |   2:  ADDh h2.y, h0.-w--, const.-x--;
1821     |        |      |
1822   2 | SCT0/1 |  mov |   4:  TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;
1823     |    TEX |  txl |   4:  TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;
1824     |   SCB0 |  add |   6:  ADDh h0.x, h1.w---,-h2.y---;
1825     |        |      |
1826   3 | SCT0/1 |  mov |   7:  TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
1827     |    TEX |  txl |   7:  TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
1828     |   SCB0 |  max |  10:  MAXh h5.x, h2.y---, h2.w---;
1829     |   SCB1 |  min |   9:  MINh h4.w, h2.---y, h2;
1830     |        |      |
1831   4 | SCT0/1 |  mov |  11:  TXLr h0.w, g[TEX1], const.xxxx, TEX0;
1832     |    TEX |  txl |  11:  TXLr h0.w, g[TEX1], const.xxxx, TEX0;
1833     |   SCB0 |  add |  14:  ADDh h0.x, h0.w---, h0;
1834     |   SCB1 |  add |  13:  ADDh h3.w,-h0, h0.---x;
1835     |        |      |
1836   5 |   SCT0 |  mad |  16:  ADDh h0.x, h2.w---, h3.w---;
1837     |   SCT1 |  mad |  15:  ADDh h0.z,-h2.--w-, h0.--x-;
1838     |   SCB0 |  min |  17:  MINh h5.y, h0.-w--, h1.-w--;
1839     |        |      |
1840   6 |   SCT1 |  mov |  18:  NRMh h2.xz, h0;
1841     |    SRB |  nrm |  18:  NRMh h2.xz, h0;
1842     |   SCB1 |  min |  19:  MINh*8 h2.w, |h2.---x|, |h2.---z|;
1843     |        |      |
1844   7 |   SCT0 |  div |  20:  DIVx h4.xy, h2.xz--, h2.ww--;
1845     |   SCT1 |  mov |  21:  MOVr r1.zw, g[TEX0].--xy;
1846     |   SCB1 |  max |  22:  MAXh h2.w, h0, h1;
1847     |        |      |
1848   8 |   SCT0 |  mad |  24:  MADr r0.xy,-h2.xz--, const.zw--, r1.zw--;
1849     |   SCT1 |  mov |  26:  TXLr h0, r0, const.xxxx, TEX0;
1850     |    TEX |  txl |  26:  TXLr h0, r0, const.xxxx, TEX0;
1851     |   SCB0 |  max |  28:  MAXh h5.x, h2.w---, h5;
1852     |   SCB1 |  min |  29:  MINh h5.w, h5.---y, h4;
1853     |        |      |
1854   9 |   SCT0 |  mad |  30:  MADr r1.xy, h2.xz--, const.zw--, r1.zw--;
1855     |   SCT1 |  mov |  32:  TXLr h2, r1, const.xxxx, TEX0;
1856     |    TEX |  txl |  32:  TXLr h2, r1, const.xxxx, TEX0;
1857     | SCB0/1 |  add |  34:  ADDh/2 h2, h0, h2;
1858     |        |      |
1859  10 | SCT0/1 |  mov |  35:  TXLr h1, g[TEX0], const.xxxx, TEX0;
1860     |    TEX |  txl |  35:  TXLr h1, g[TEX0], const.xxxx, TEX0;
1861     |   SCB0 |  max |  37:  MAXh h5.y, h5.-x--, h1.-w--;
1862     |   SCB1 |  min |  38:  MINh h4.w, h1, h5;
1863     |        |      |
1864  11 |   SCT0 |  mad |  39:  MADr r0.xy,-h4, const.xy--, r1.zw--;
1865     |   SCT1 |  mov |  41:  TXLr h0, r0, const.zzzz, TEX0;
1866     |    TEX |  txl |  41:  TXLr h0, r0, const.zzzz, TEX0;
1867     |   SCB0 |  mad |  44:  MADr r2.xy, h4, const.xy--, r1.zw--;
1868     |   SCB1 |  add |  43:  ADDh*8 h5.z, h5.--y-,-h4.--w-;
1869     |        |      |
1870  12 | SCT0/1 |  mov |  46:  TXLr h3, r2, const.xxxx, TEX0;
1871     |    TEX |  txl |  46:  TXLr h3, r2, const.xxxx, TEX0;
1872     | SCB0/1 |  add |  48:  ADDh/2 h0, h0, h3;
1873     |        |      |
1874  13 | SCT0/1 |  mad |  49:  ADDh/2 h3, h0, h2;
1875     | SCB0/1 |  mul |  50:  MOVh h0, h3;
1876     |        |      |
1877  14 |   SCT0 |  set |  51:  SLTh h3.x, h3.w---, h5.w---;
1878     |   SCT1 |  set |  52:  SGTh h3.w, h3, h5.---x;
1879     |   SCB0 |  set |  54:  SLThc0 rc, h5.z---, h5;
1880     |   SCB1 |  add |  53:  ADDxc0_s rc, h3.---x, h3;
1881     |        |      |
1882  15 | SCT0/1 |  mul |  55:  MOVh h0(NE0.wwww), h2;
1883     | SCB0/1 |  mul |  56:  MOVh h0(NE0.xxxx), h1;
1884
1885Pass   SCT  TEX  SCB
1886  1:   0% 100%  25%
1887  2:   0% 100%  25%
1888  3:   0% 100%  50%
1889  4:   0% 100%  50%
1890  5:  50%   0%  25%
1891  6:   0%   0%  25%
1892  7: 100%   0%  25%
1893  8:   0% 100%  50%
1894  9:   0% 100% 100%
1895 10:   0% 100%  50%
1896 11:   0% 100%  75%
1897 12:   0% 100% 100%
1898 13: 100%   0% 100%
1899 14:  50%   0%  50%
1900 15: 100%   0% 100%
1901
1902MEAN:  26%  60%  56%
1903
1904Pass   SCT0  SCT1   TEX  SCB0  SCB1
1905  1:    0%    0%  100%  100%    0%
1906  2:    0%    0%  100%  100%    0%
1907  3:    0%    0%  100%  100%  100%
1908  4:    0%    0%  100%  100%  100%
1909  5:  100%  100%    0%  100%    0%
1910  6:    0%    0%    0%    0%  100%
1911  7:  100%  100%    0%    0%  100%
1912  8:    0%    0%  100%  100%  100%
1913  9:    0%    0%  100%  100%  100%
1914 10:    0%    0%  100%  100%  100%
1915 11:    0%    0%  100%  100%  100%
1916 12:    0%    0%  100%  100%  100%
1917 13:  100%  100%    0%  100%  100%
1918 14:  100%  100%    0%  100%  100%
1919 15:  100%  100%    0%  100%  100%
1920
1921MEAN:   33%   33%   60%   86%   80%
1922Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5
1923Results 15 cycles, 3 r regs, 800,000,000 pixels/s
1924============================================================================*/
1925#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 1)
1926/*--------------------------------------------------------------------------*/
// Compiler directives that apply to this variant only (guarded by the #if).
// NOTE(review): their exact semantics are defined by the PS3 cgc toolchain and
// are not visible in this file; "disablepc all" presumably corresponds to the
// banner's "does not use perspective interpolation" remark -- confirm before
// changing any of them.
1927#pragma regcount 7
1928#pragma disablepc all
1929#pragma option O2
1930#pragma option OutColorPrec=fp16
1931#pragma texformat default RGBA8
1932/*==========================================================================*/
// FxaaPixelShader -- PS3 console variant with early exit.
//
// Fetches the four corner texels addressed by fxaaConsolePosPos plus the
// texel at `pos`, builds an edge direction from the corner luma, and blends
// taps taken along that direction on both sides of `pos`.
//
// Inputs referenced by this body:
//   pos                     - coordinate of the centre fetch and origin of
//                             the directional taps.
//   fxaaConsolePosPos       - corner coordinates: .xy = NW, .zw = SE,
//                             .zy = NE, .xw = SW (per the local names below).
//   tex                     - texture used for every fetch.
//   fxaaConsoleRcpFrameOpt  - .zw scales the unit direction (near tap pair).
//   fxaaConsoleRcpFrameOpt2 - .zw scales the clamped direction (far tap pair).
// The remaining parameters are not referenced here; they keep the signature
// identical to the other FxaaPixelShader variants in this file. Sharpness and
// edge threshold come from the FXAA_CONSOLE__PS3_EDGE_SHARPNESS and
// FXAA_CONSOLE__PS3_EDGE_THRESHOLD macros, not from the arguments.
//
// Luma is read from .w, or from .y when FXAA_GREEN_AS_LUMA is non-zero.
//
// Returns, in order of precedence:
//   1. the unmodified centre texel when the early-exit test in (14) passes;
//   2. the near-pair average when the luma of the wider blend falls outside
//      the min/max luma of the four corners;
//   3. otherwise the average of the near and far tap pairs.
//
// The "// (n)" markers appear to track the pass numbers of the NVShaderPerf
// schedule in the banner above; keep the statement order when editing.
1933half4 FxaaPixelShader(
1934    // See FXAA Quality FxaaPixelShader() source for docs on Inputs!
1935    FxaaFloat2 pos,
1936    FxaaFloat4 fxaaConsolePosPos,
1937    FxaaTex tex,
1938    FxaaTex fxaaConsole360TexExpBiasNegOne,
1939    FxaaTex fxaaConsole360TexExpBiasNegTwo,
1940    FxaaFloat2 fxaaQualityRcpFrame,
1941    FxaaFloat4 fxaaConsoleRcpFrameOpt,
1942    FxaaFloat4 fxaaConsoleRcpFrameOpt2,
1943    FxaaFloat4 fxaaConsole360RcpFrameOpt2,
1944    FxaaFloat fxaaQualitySubpix,
1945    FxaaFloat fxaaQualityEdgeThreshold,
1946    FxaaFloat fxaaQualityEdgeThresholdMin,
1947    FxaaFloat fxaaConsoleEdgeSharpness,
1948    FxaaFloat fxaaConsoleEdgeThreshold,
1949    FxaaFloat fxaaConsoleEdgeThresholdMin,
1950    FxaaFloat4 fxaaConsole360ConstDir
1951) {
1952/*--------------------------------------------------------------------------*/
1953// (1)
    // NE corner; its luma carries a 1/512 bias. With four equal corner lumas
    // the bias alone leaves dir.x = dir.z = -1/512 in (5), so normalize() in
    // (6) is not fed a zero vector.
1954    half4 rgbyNe = h4tex2Dlod(tex, half4(fxaaConsolePosPos.zy, 0, 0));
1955    #if (FXAA_GREEN_AS_LUMA == 0)
1956        half lumaNe = rgbyNe.w + half(1.0/512.0);
1957    #else
1958        half lumaNe = rgbyNe.y + half(1.0/512.0);
1959    #endif
1960/*--------------------------------------------------------------------------*/
1961// (2)
    // SW corner (full texel despite the "luma" name); SW - NE is shared by
    // both gradient terms.
1962    half4 lumaSw = h4tex2Dlod(tex, half4(fxaaConsolePosPos.xw, 0, 0));
1963    #if (FXAA_GREEN_AS_LUMA == 0)
1964        half lumaSwNegNe = lumaSw.w - lumaNe;
1965    #else
1966        half lumaSwNegNe = lumaSw.y - lumaNe;
1967    #endif
1968/*--------------------------------------------------------------------------*/
1969// (3)
    // NW corner; start the min/max reduction with the NW/SW pair.
1970    half4 lumaNw = h4tex2Dlod(tex, half4(fxaaConsolePosPos.xy, 0, 0));
1971    #if (FXAA_GREEN_AS_LUMA == 0)
1972        half lumaMaxNwSw = max(lumaNw.w, lumaSw.w);
1973        half lumaMinNwSw = min(lumaNw.w, lumaSw.w);
1974    #else
1975        half lumaMaxNwSw = max(lumaNw.y, lumaSw.y);
1976        half lumaMinNwSw = min(lumaNw.y, lumaSw.y);
1977    #endif
1978/*--------------------------------------------------------------------------*/
1979// (4)
    // SE corner fetch; meanwhile fold NW into the partial gradient terms.
1980    half4 lumaSe = h4tex2Dlod(tex, half4(fxaaConsolePosPos.zw, 0, 0));
1981    #if (FXAA_GREEN_AS_LUMA == 0)
1982        half dirZ =  lumaNw.w + lumaSwNegNe;
1983        half dirX = -lumaNw.w + lumaSwNegNe;
1984    #else
1985        half dirZ =  lumaNw.y + lumaSwNegNe;
1986        half dirX = -lumaNw.y + lumaSwNegNe;
1987    #endif
1988/*--------------------------------------------------------------------------*/
1989// (5)
    // Finish the gradient with SE:
    //   dir.x = (SW + SE) - (NW + NE)
    //   dir.z = (SW + NW) - (NE + SE)
    // dir.y is zeroed so the 3-component normalize in (6) only sees x and z.
1990    half3 dir;
1991    dir.y = 0.0;
1992    #if (FXAA_GREEN_AS_LUMA == 0)
1993        dir.x =  lumaSe.w + dirX;
1994        dir.z = -lumaSe.w + dirZ;
1995        half lumaMinNeSe = min(lumaNe, lumaSe.w);
1996    #else
1997        dir.x =  lumaSe.y + dirX;
1998        dir.z = -lumaSe.y + dirZ;
1999        half lumaMinNeSe = min(lumaNe, lumaSe.y);
2000    #endif
2001/*--------------------------------------------------------------------------*/
2002// (6)
    // Unit direction, plus the smaller absolute component scaled by the
    // compile-time sharpness macro (the "8" in the name is only literal when
    // FXAA_CONSOLE__PS3_EDGE_SHARPNESS is 8).
2003    half4 dir1_pos;
2004    dir1_pos.xy = normalize(dir).xz;
2005    half dirAbsMinTimes8 = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__PS3_EDGE_SHARPNESS);
2006/*--------------------------------------------------------------------------*/
2007// (7)
    // Far-tap direction: unit direction divided by the scaled minimum,
    // clamped to [-2, 2]. The .zw halves of both helpers carry pos.
2008    half4 dir2_pos;
2009    dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimes8, half(-2.0), half(2.0));
2010    dir1_pos.zw = pos.xy;
2011    dir2_pos.zw = pos.xy;
2012    #if (FXAA_GREEN_AS_LUMA == 0)
2013        half lumaMaxNeSe = max(lumaNe, lumaSe.w);
2014    #else
2015        half lumaMaxNeSe = max(lumaNe, lumaSe.y);
2016    #endif
2017/*--------------------------------------------------------------------------*/
2018// (8)
    // Negative near tap (the coordinate variable is reused for the fetched
    // colour), and the final min/max luma over the four corners.
2019    half4 temp1N;
2020    temp1N.xy = dir1_pos.zw - dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw;
2021    temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));
2022    half lumaMax = max(lumaMaxNwSw, lumaMaxNeSe);
2023    half lumaMin = min(lumaMinNwSw, lumaMinNeSe);
2024/*--------------------------------------------------------------------------*/
2025// (9)
    // Positive near tap; rgby1 = average of the two near taps.
2026    half4 rgby1;
2027    rgby1.xy = dir1_pos.zw + dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw;
2028    rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));
2029    rgby1 = (temp1N + rgby1) * 0.5;
2030/*--------------------------------------------------------------------------*/
2031// (10)
    // Centre texel; extend the corner min/max with its luma for the
    // early-exit range computed in (11).
2032    half4 rgbyM = h4tex2Dlod(tex, half4(pos.xy, 0.0, 0.0));
2033    #if (FXAA_GREEN_AS_LUMA == 0)
2034        half lumaMaxM = max(lumaMax, rgbyM.w);
2035        half lumaMinM = min(lumaMin, rgbyM.w);
2036    #else
2037        half lumaMaxM = max(lumaMax, rgbyM.y);
2038        half lumaMinM = min(lumaMin, rgbyM.y);
2039    #endif
2040/*--------------------------------------------------------------------------*/
2041// (11)
    // Negative far tap fetch and positive far tap coordinate. lumaRangeM is
    // the centre-inclusive luma range divided by the compile-time threshold
    // macro (the fxaaConsoleEdgeThreshold* arguments are not consulted).
2042    half4 temp2N;
2043    temp2N.xy = dir2_pos.zw - dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw;
2044    temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));
2045    half4 rgby2;
2046    rgby2.xy = dir2_pos.zw + dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw;
2047    half lumaRangeM = (lumaMaxM - lumaMinM) / FXAA_CONSOLE__PS3_EDGE_THRESHOLD;
2048/*--------------------------------------------------------------------------*/
2049// (12)
    // rgby2 = average of the two far taps.
2050    rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));
2051    rgby2 = (temp2N + rgby2) * 0.5;
2052/*--------------------------------------------------------------------------*/
2053// (13)
    // Blend the far pair with the near pair.
2054    rgby2 = (rgby2 + rgby1) * 0.5;
2055/*--------------------------------------------------------------------------*/
2056// (14)
    // twoTap*: does the luma of the wider blend leave the corner luma range?
    // earlyExit: for a positive threshold this is equivalent to
    //   (lumaMaxM - lumaMinM) < lumaMax * FXAA_CONSOLE__PS3_EDGE_THRESHOLD
    // where lumaMax is the corner-only maximum from (8).
2057    #if (FXAA_GREEN_AS_LUMA == 0)
2058        bool twoTapLt = rgby2.w < lumaMin;
2059        bool twoTapGt = rgby2.w > lumaMax;
2060    #else
2061        bool twoTapLt = rgby2.y < lumaMin;
2062        bool twoTapGt = rgby2.y > lumaMax;
2063    #endif
2064    bool earlyExit = lumaRangeM < lumaMax;
2065    bool twoTap = twoTapLt || twoTapGt;
2066/*--------------------------------------------------------------------------*/
2067// (15)
    // The earlyExit assignment comes last, so it overrides the near-pair
    // fallback and returns the centre texel unchanged.
2068    if(twoTap) rgby2 = rgby1;
2069    if(earlyExit) rgby2 = rgbyM;
2070/*--------------------------------------------------------------------------*/
2071    return rgby2; }
2072/*==========================================================================*/
2073#endif
2074